mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-08-03 16:30:29 +00:00
Merge branch 'master' into ctl-unique-test
This commit is contained in:
commit
5757eaf70d
198 changed files with 199788 additions and 647 deletions
4
.vscode/settings.json
vendored
4
.vscode/settings.json
vendored
|
@ -1,5 +1,5 @@
|
||||||
{
|
{
|
||||||
"C_Cpp.default.compilerPath": ".cosmocc/3.7.1/bin/aarch64-linux-cosmo-c++",
|
"C_Cpp.default.compilerPath": ".cosmocc/3.8.0/bin/aarch64-linux-cosmo-c++",
|
||||||
"C_Cpp.default.compilerArgs": [
|
"C_Cpp.default.compilerArgs": [
|
||||||
"-nostdinc",
|
"-nostdinc",
|
||||||
"-nostdlib",
|
"-nostdlib",
|
||||||
|
@ -33,4 +33,4 @@
|
||||||
"files.associations": {
|
"files.associations": {
|
||||||
"log.h": "c"
|
"log.h": "c"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
4
Makefile
4
Makefile
|
@ -147,10 +147,10 @@ export MODE
|
||||||
export SOURCE_DATE_EPOCH
|
export SOURCE_DATE_EPOCH
|
||||||
export TMPDIR
|
export TMPDIR
|
||||||
|
|
||||||
COSMOCC = .cosmocc/3.7.1
|
COSMOCC = .cosmocc/3.8.0
|
||||||
BOOTSTRAP = $(COSMOCC)/bin
|
BOOTSTRAP = $(COSMOCC)/bin
|
||||||
TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
|
TOOLCHAIN = $(COSMOCC)/bin/$(ARCH)-linux-cosmo-
|
||||||
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.7.1 13b65b0e659b493bd82f3d0a319d0265d66f849839e484aa2a54191024711e85)
|
DOWNLOAD := $(shell build/download-cosmocc.sh $(COSMOCC) 3.8.0 813c6b2f95062d2e0a845307a79505424cb98cb038e8013334f8a22e3b92a474)
|
||||||
|
|
||||||
IGNORE := $(shell $(MKDIR) $(TMPDIR))
|
IGNORE := $(shell $(MKDIR) $(TMPDIR))
|
||||||
|
|
||||||
|
|
|
@ -103,10 +103,8 @@ SECTIONS {
|
||||||
*(.eh_frame_entry .eh_frame_entry.*)
|
*(.eh_frame_entry .eh_frame_entry.*)
|
||||||
}
|
}
|
||||||
|
|
||||||
.eh_frame : ONLY_IF_RO {
|
__eh_frame_hdr_start = SIZEOF(.eh_frame_hdr) > 0 ? ADDR(.eh_frame_hdr) : 0;
|
||||||
KEEP(*(.eh_frame))
|
__eh_frame_hdr_end = SIZEOF(.eh_frame_hdr) > 0 ? . : 0;
|
||||||
*(.eh_frame.*)
|
|
||||||
}
|
|
||||||
|
|
||||||
.gcc_except_table : ONLY_IF_RO {
|
.gcc_except_table : ONLY_IF_RO {
|
||||||
*(.gcc_except_table .gcc_except_table.*)
|
*(.gcc_except_table .gcc_except_table.*)
|
||||||
|
@ -127,9 +125,11 @@ SECTIONS {
|
||||||
. += CONSTANT(MAXPAGESIZE);
|
. += CONSTANT(MAXPAGESIZE);
|
||||||
. = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
|
. = DATA_SEGMENT_ALIGN(CONSTANT(MAXPAGESIZE), CONSTANT(COMMONPAGESIZE));
|
||||||
|
|
||||||
.eh_frame : ONLY_IF_RW {
|
.eh_frame : {
|
||||||
|
__eh_frame_start = .;
|
||||||
KEEP(*(.eh_frame))
|
KEEP(*(.eh_frame))
|
||||||
*(.eh_frame.*)
|
*(.eh_frame.*)
|
||||||
|
__eh_frame_end = .;
|
||||||
}
|
}
|
||||||
|
|
||||||
.gnu_extab : ONLY_IF_RW {
|
.gnu_extab : ONLY_IF_RW {
|
||||||
|
|
12
ape/ape.lds
12
ape/ape.lds
|
@ -329,6 +329,10 @@ SECTIONS {
|
||||||
*(.ubsan.types)
|
*(.ubsan.types)
|
||||||
*(.ubsan.data)
|
*(.ubsan.data)
|
||||||
|
|
||||||
|
__eh_frame_hdr_start_actual = .;
|
||||||
|
*(.eh_frame_hdr)
|
||||||
|
__eh_frame_hdr_end_actual = .;
|
||||||
|
|
||||||
/* Legal Notices */
|
/* Legal Notices */
|
||||||
__notices = .;
|
__notices = .;
|
||||||
KEEP(*(.notice))
|
KEEP(*(.notice))
|
||||||
|
@ -422,6 +426,11 @@ SECTIONS {
|
||||||
KEEP(*(.dtors))
|
KEEP(*(.dtors))
|
||||||
__fini_array_end = .;
|
__fini_array_end = .;
|
||||||
|
|
||||||
|
__eh_frame_start = .;
|
||||||
|
KEEP(*(.eh_frame))
|
||||||
|
*(.eh_frame.*)
|
||||||
|
__eh_frame_end = .;
|
||||||
|
|
||||||
/*BEGIN: Post-Initialization Read-Only */
|
/*BEGIN: Post-Initialization Read-Only */
|
||||||
. = ALIGN(. != 0 ? __SIZEOF_POINTER__ : 0);
|
. = ALIGN(. != 0 ? __SIZEOF_POINTER__ : 0);
|
||||||
KEEP(*(SORT_BY_NAME(.piro.relo.sort.*)))
|
KEEP(*(SORT_BY_NAME(.piro.relo.sort.*)))
|
||||||
|
@ -601,6 +610,9 @@ ape_text_memsz = ape_text_filesz;
|
||||||
ape_text_align = CONSTANT(COMMONPAGESIZE);
|
ape_text_align = CONSTANT(COMMONPAGESIZE);
|
||||||
ape_text_rva = RVA(ape_text_vaddr);
|
ape_text_rva = RVA(ape_text_vaddr);
|
||||||
|
|
||||||
|
__eh_frame_hdr_start = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_start_actual : 0;
|
||||||
|
__eh_frame_hdr_end = __eh_frame_hdr_end_actual > __eh_frame_hdr_start_actual ? __eh_frame_hdr_end_actual : 0;
|
||||||
|
|
||||||
/* we roundup here because xnu wants the file load segments page-aligned */
|
/* we roundup here because xnu wants the file load segments page-aligned */
|
||||||
/* but we don't want to add the nop padding to the ape program, so we'll */
|
/* but we don't want to add the nop padding to the ape program, so we'll */
|
||||||
/* let ape.S dd read past the end of the file into the wrapping binaries */
|
/* let ape.S dd read past the end of the file into the wrapping binaries */
|
||||||
|
|
|
@ -92,10 +92,7 @@ DEFAULT_COPTS ?= \
|
||||||
-fno-gnu-unique \
|
-fno-gnu-unique \
|
||||||
-fstrict-aliasing \
|
-fstrict-aliasing \
|
||||||
-fstrict-overflow \
|
-fstrict-overflow \
|
||||||
-fno-semantic-interposition \
|
-fno-semantic-interposition
|
||||||
-fno-dwarf2-cfi-asm \
|
|
||||||
-fno-unwind-tables \
|
|
||||||
-fno-asynchronous-unwind-tables
|
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
# Microsoft says "[a]ny memory below the stack beyond the red zone
|
# Microsoft says "[a]ny memory below the stack beyond the red zone
|
||||||
|
@ -139,8 +136,6 @@ DEFAULT_CFLAGS = \
|
||||||
|
|
||||||
DEFAULT_CXXFLAGS = \
|
DEFAULT_CXXFLAGS = \
|
||||||
-std=gnu++23 \
|
-std=gnu++23 \
|
||||||
-fno-rtti \
|
|
||||||
-fno-exceptions \
|
|
||||||
-fuse-cxa-atexit \
|
-fuse-cxa-atexit \
|
||||||
-Wno-int-in-bool-context \
|
-Wno-int-in-bool-context \
|
||||||
-Wno-narrowing \
|
-Wno-narrowing \
|
||||||
|
|
|
@ -6,14 +6,14 @@ if [ -n "$OBJDUMP" ]; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
find_objdump() {
|
find_objdump() {
|
||||||
if [ -x .cosmocc/3.6.0/bin/$1-linux-cosmo-objdump ]; then
|
if [ -x .cosmocc/3.8.0/bin/$1-linux-cosmo-objdump ]; then
|
||||||
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump
|
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump
|
||||||
elif [ -x .cosmocc/3.6.0/bin/$1-linux-musl-objdump ]; then
|
elif [ -x .cosmocc/3.8.0/bin/$1-linux-musl-objdump ]; then
|
||||||
OBJDUMP=.cosmocc/3.6.0/bin/$1-linux-musl-objdump
|
OBJDUMP=.cosmocc/3.8.0/bin/$1-linux-musl-objdump
|
||||||
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump" ]; then
|
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump" ]; then
|
||||||
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-cosmo-objdump"
|
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-cosmo-objdump"
|
||||||
elif [ -x "$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump" ]; then
|
elif [ -x "$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump" ]; then
|
||||||
OBJDUMP="$COSMO/.cosmocc/3.6.0/bin/$1-linux-musl-objdump"
|
OBJDUMP="$COSMO/.cosmocc/3.8.0/bin/$1-linux-musl-objdump"
|
||||||
else
|
else
|
||||||
echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
|
echo "error: toolchain not found (try running 'cosmocc --update' or 'make' in the cosmo monorepo)" >&2
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
@ -17,6 +17,9 @@ struct conditional<false, T, F>
|
||||||
typedef F type;
|
typedef F type;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<bool B, typename T, typename F>
|
||||||
|
using conditional_t = typename conditional<B, T, F>::type;
|
||||||
|
|
||||||
} // namespace ctl
|
} // namespace ctl
|
||||||
|
|
||||||
#endif // CTL_CONDITIONAL_H_
|
#endif // CTL_CONDITIONAL_H_
|
||||||
|
|
|
@ -19,6 +19,9 @@ template<typename _Tp>
|
||||||
struct is_void : public is_void_<typename ctl::remove_cv<_Tp>::type>::type
|
struct is_void : public is_void_<typename ctl::remove_cv<_Tp>::type>::type
|
||||||
{};
|
{};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
inline constexpr bool is_void_v = is_void<T>::value;
|
||||||
|
|
||||||
} // namespace ctl
|
} // namespace ctl
|
||||||
|
|
||||||
#endif // CTL_IS_VOID_H_
|
#endif // CTL_IS_VOID_H_
|
||||||
|
|
15
ctl/set.h
15
ctl/set.h
|
@ -241,8 +241,9 @@ class set
|
||||||
private:
|
private:
|
||||||
friend class set;
|
friend class set;
|
||||||
node_type* node_;
|
node_type* node_;
|
||||||
|
node_type* root_;
|
||||||
|
|
||||||
explicit reverse_iterator(node_type* node) : node_(node)
|
explicit reverse_iterator(node_type* node, node_type* root) : node_(node), root_(root)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -347,17 +348,17 @@ class set
|
||||||
|
|
||||||
reverse_iterator rbegin()
|
reverse_iterator rbegin()
|
||||||
{
|
{
|
||||||
return reverse_iterator(rightmost(root_));
|
return reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator rbegin() const
|
const_reverse_iterator rbegin() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(rightmost(root_));
|
return const_reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator crbegin() const
|
const_reverse_iterator crbegin() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(rightmost(root_));
|
return const_reverse_iterator(rightmost(root_), root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
iterator end() noexcept
|
iterator end() noexcept
|
||||||
|
@ -377,17 +378,17 @@ class set
|
||||||
|
|
||||||
reverse_iterator rend()
|
reverse_iterator rend()
|
||||||
{
|
{
|
||||||
return reverse_iterator(nullptr);
|
return reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator rend() const
|
const_reverse_iterator rend() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(nullptr);
|
return const_reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const_reverse_iterator crend() const
|
const_reverse_iterator crend() const
|
||||||
{
|
{
|
||||||
return const_reverse_iterator(nullptr);
|
return const_reverse_iterator(nullptr, root_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clear() noexcept
|
void clear() noexcept
|
||||||
|
|
454
ctl/shared_ptr.h
Normal file
454
ctl/shared_ptr.h
Normal file
|
@ -0,0 +1,454 @@
|
||||||
|
// -*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-
|
||||||
|
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||||
|
#ifndef CTL_SHARED_PTR_H_
|
||||||
|
#define CTL_SHARED_PTR_H_
|
||||||
|
|
||||||
|
#include "exception.h"
|
||||||
|
#include "is_convertible.h"
|
||||||
|
#include "remove_extent.h"
|
||||||
|
#include "unique_ptr.h"
|
||||||
|
|
||||||
|
// XXX currently needed to use placement-new syntax (move to cxx.inc?)
|
||||||
|
void*
|
||||||
|
operator new(size_t, void*) noexcept;
|
||||||
|
|
||||||
|
namespace ctl {
|
||||||
|
|
||||||
|
class bad_weak_ptr : public exception
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
const char* what() const noexcept override
|
||||||
|
{
|
||||||
|
return "ctl::bad_weak_ptr";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
namespace __ {
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
struct ptr_ref
|
||||||
|
{
|
||||||
|
using type = T&;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<>
|
||||||
|
struct ptr_ref<void>
|
||||||
|
{
|
||||||
|
using type = void;
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline __attribute__((always_inline)) void
|
||||||
|
incref(size_t* r) noexcept
|
||||||
|
{
|
||||||
|
#ifdef NDEBUG
|
||||||
|
__atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
|
||||||
|
#else
|
||||||
|
size_t refs = __atomic_fetch_add(r, 1, __ATOMIC_RELAXED);
|
||||||
|
if (refs > ((size_t)-1) >> 1)
|
||||||
|
__builtin_trap();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __attribute__((always_inline)) bool
|
||||||
|
decref(size_t* r) noexcept
|
||||||
|
{
|
||||||
|
if (!__atomic_fetch_sub(r, 1, __ATOMIC_RELEASE)) {
|
||||||
|
__atomic_thread_fence(__ATOMIC_ACQUIRE);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
class shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
constexpr shared_ref() noexcept = default;
|
||||||
|
shared_ref(const shared_ref&) = delete;
|
||||||
|
shared_ref& operator=(const shared_ref&) = delete;
|
||||||
|
|
||||||
|
virtual ~shared_ref() = default;
|
||||||
|
|
||||||
|
void keep_shared() noexcept
|
||||||
|
{
|
||||||
|
incref(&shared);
|
||||||
|
}
|
||||||
|
|
||||||
|
void drop_shared() noexcept
|
||||||
|
{
|
||||||
|
if (decref(&shared)) {
|
||||||
|
dispose();
|
||||||
|
drop_weak();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void keep_weak() noexcept
|
||||||
|
{
|
||||||
|
incref(&weak);
|
||||||
|
}
|
||||||
|
|
||||||
|
void drop_weak() noexcept
|
||||||
|
{
|
||||||
|
if (decref(&weak)) {
|
||||||
|
delete this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t use_count() const noexcept
|
||||||
|
{
|
||||||
|
return shared + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t weak_count() const noexcept
|
||||||
|
{
|
||||||
|
return weak;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
virtual void dispose() noexcept = 0;
|
||||||
|
|
||||||
|
size_t shared = 0;
|
||||||
|
size_t weak = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename D>
|
||||||
|
class shared_pointer : public shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static shared_pointer* make(T* const p, D d)
|
||||||
|
{
|
||||||
|
return make(unique_ptr<T, D>(p, move(d)));
|
||||||
|
}
|
||||||
|
|
||||||
|
static shared_pointer* make(unique_ptr<T, D> p)
|
||||||
|
{
|
||||||
|
return new shared_pointer(p.release(), move(p.get_deleter()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
shared_pointer(T* const p, D d) noexcept : p(p), d(move(d))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
void dispose() noexcept override
|
||||||
|
{
|
||||||
|
move(d)(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
T* const p;
|
||||||
|
[[no_unique_address]] D d;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class shared_emplace : public shared_ref
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
union
|
||||||
|
{
|
||||||
|
T t;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename... Args>
|
||||||
|
void construct(Args&&... args)
|
||||||
|
{
|
||||||
|
::new (&t) T(forward<Args>(args)...);
|
||||||
|
}
|
||||||
|
|
||||||
|
static unique_ptr<shared_emplace> make()
|
||||||
|
{
|
||||||
|
return unique_ptr(new shared_emplace());
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
explicit constexpr shared_emplace() noexcept = default;
|
||||||
|
|
||||||
|
void dispose() noexcept override
|
||||||
|
{
|
||||||
|
t.~T();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename U>
|
||||||
|
concept shared_ptr_compatible = is_convertible_v<U*, T*>;
|
||||||
|
|
||||||
|
} // namespace __
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class weak_ptr;
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class shared_ptr
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using element_type = remove_extent_t<T>;
|
||||||
|
using weak_type = weak_ptr<T>;
|
||||||
|
|
||||||
|
constexpr shared_ptr() noexcept = default;
|
||||||
|
constexpr shared_ptr(nullptr_t) noexcept
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
explicit shared_ptr(U* const p) : shared_ptr(p, default_delete<U>())
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(U* const p, D d)
|
||||||
|
: p(p), rc(__::shared_pointer<U, D>::make(p, move(d)))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
shared_ptr(const shared_ptr<U>& r, element_type* p) noexcept
|
||||||
|
: p(p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
shared_ptr(shared_ptr<U>&& r, element_type* p) noexcept : p(p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(shared_ptr<U>&& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr(const shared_ptr& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr(shared_ptr&& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
r.p = nullptr;
|
||||||
|
r.rc = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
explicit shared_ptr(const weak_ptr<U>& r) : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (r.expired()) {
|
||||||
|
throw bad_weak_ptr();
|
||||||
|
}
|
||||||
|
rc->keep_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr(unique_ptr<U, D>&& r)
|
||||||
|
: p(r.p), rc(__::shared_pointer<U, D>::make(move(r)))
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
~shared_ptr()
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->drop_shared();
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr& operator=(shared_ptr r) noexcept
|
||||||
|
{
|
||||||
|
swap(r);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
shared_ptr& operator=(shared_ptr<U> r) noexcept
|
||||||
|
{
|
||||||
|
shared_ptr<T>(move(r)).swap(*this);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() noexcept
|
||||||
|
{
|
||||||
|
shared_ptr().swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
void reset(U* const p2)
|
||||||
|
{
|
||||||
|
shared_ptr<T>(p2).swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U, typename D>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
void reset(U* const p2, D d)
|
||||||
|
{
|
||||||
|
shared_ptr<T>(p2, d).swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void swap(shared_ptr& r) noexcept
|
||||||
|
{
|
||||||
|
using ctl::swap;
|
||||||
|
swap(p, r.p);
|
||||||
|
swap(rc, r.rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
element_type* get() const noexcept
|
||||||
|
{
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
typename __::ptr_ref<T>::type operator*() const noexcept
|
||||||
|
{
|
||||||
|
if (!p)
|
||||||
|
__builtin_trap();
|
||||||
|
return *p;
|
||||||
|
}
|
||||||
|
|
||||||
|
T* operator->() const noexcept
|
||||||
|
{
|
||||||
|
if (!p)
|
||||||
|
__builtin_trap();
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
long use_count() const noexcept
|
||||||
|
{
|
||||||
|
return rc ? rc->use_count() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
explicit operator bool() const noexcept
|
||||||
|
{
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const shared_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const weak_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return !r.owner_before(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template<typename U>
|
||||||
|
friend class weak_ptr;
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
friend class shared_ptr;
|
||||||
|
|
||||||
|
template<typename U, typename... Args>
|
||||||
|
friend shared_ptr<U> make_shared(Args&&... args);
|
||||||
|
|
||||||
|
element_type* p = nullptr;
|
||||||
|
__::shared_ref* rc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class weak_ptr
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
using element_type = remove_extent_t<T>;
|
||||||
|
|
||||||
|
constexpr weak_ptr() noexcept = default;
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
requires __::shared_ptr_compatible<T, U>
|
||||||
|
weak_ptr(const shared_ptr<U>& r) noexcept : p(r.p), rc(r.rc)
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->keep_weak();
|
||||||
|
}
|
||||||
|
|
||||||
|
~weak_ptr()
|
||||||
|
{
|
||||||
|
if (rc)
|
||||||
|
rc->drop_weak();
|
||||||
|
}
|
||||||
|
|
||||||
|
long use_count() const noexcept
|
||||||
|
{
|
||||||
|
return rc ? rc->use_count() : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool expired() const noexcept
|
||||||
|
{
|
||||||
|
return !use_count();
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset() noexcept
|
||||||
|
{
|
||||||
|
weak_ptr().swap(*this);
|
||||||
|
}
|
||||||
|
|
||||||
|
void swap(weak_ptr& r) noexcept
|
||||||
|
{
|
||||||
|
using ctl::swap;
|
||||||
|
swap(p, r.p);
|
||||||
|
swap(rc, r.rc);
|
||||||
|
}
|
||||||
|
|
||||||
|
shared_ptr<T> lock() const noexcept
|
||||||
|
{
|
||||||
|
if (expired())
|
||||||
|
return nullptr;
|
||||||
|
shared_ptr<T> r;
|
||||||
|
r.p = p;
|
||||||
|
r.rc = rc;
|
||||||
|
if (rc)
|
||||||
|
rc->keep_shared();
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const weak_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename U>
|
||||||
|
bool owner_before(const shared_ptr<U>& r) const noexcept
|
||||||
|
{
|
||||||
|
return p < r.p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template<typename U>
|
||||||
|
friend class shared_ptr;
|
||||||
|
|
||||||
|
element_type* p = nullptr;
|
||||||
|
__::shared_ref* rc = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename T, typename... Args>
|
||||||
|
shared_ptr<T>
|
||||||
|
make_shared(Args&&... args)
|
||||||
|
{
|
||||||
|
auto rc = __::shared_emplace<T>::make();
|
||||||
|
rc->construct(forward<Args>(args)...);
|
||||||
|
shared_ptr<T> r;
|
||||||
|
r.p = &rc->t;
|
||||||
|
r.rc = rc.release();
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ctl
|
||||||
|
|
||||||
|
#endif // CTL_SHARED_PTR_H_
|
|
@ -94,6 +94,7 @@ EXAMPLES_DIRECTDEPS = \
|
||||||
THIRD_PARTY_VQSORT \
|
THIRD_PARTY_VQSORT \
|
||||||
THIRD_PARTY_XED \
|
THIRD_PARTY_XED \
|
||||||
THIRD_PARTY_LIBCXXABI \
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND \
|
||||||
THIRD_PARTY_ZLIB \
|
THIRD_PARTY_ZLIB \
|
||||||
TOOL_ARGS \
|
TOOL_ARGS \
|
||||||
TOOL_BUILD_LIB \
|
TOOL_BUILD_LIB \
|
||||||
|
|
|
@ -38,6 +38,7 @@
|
||||||
#include "libc/sysv/consts/prot.h"
|
#include "libc/sysv/consts/prot.h"
|
||||||
|
|
||||||
static struct {
|
static struct {
|
||||||
|
atomic_uint once;
|
||||||
const char *res;
|
const char *res;
|
||||||
char buf[PATH_MAX];
|
char buf[PATH_MAX];
|
||||||
} g_comdbg;
|
} g_comdbg;
|
||||||
|
@ -124,10 +125,11 @@ static void FindDebugBinaryInit(void) {
|
||||||
* @asyncsignalsafe
|
* @asyncsignalsafe
|
||||||
*/
|
*/
|
||||||
const char *FindDebugBinary(void) {
|
const char *FindDebugBinary(void) {
|
||||||
|
cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
|
||||||
return g_comdbg.res;
|
return g_comdbg.res;
|
||||||
}
|
}
|
||||||
|
|
||||||
// pay startup cost to make this signal safe from the user's perspective
|
// pay startup cost to make this signal safe from the user's perspective
|
||||||
__attribute__((__constructor__(10))) static void FindDebugBinaryCtor(void) {
|
__attribute__((__constructor__(10))) static void FindDebugBinaryCtor(void) {
|
||||||
FindDebugBinaryInit();
|
cosmo_once(&g_comdbg.once, FindDebugBinaryInit);
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,7 +42,3 @@ void *dlsym(void *, const char *) {
|
||||||
int dlclose(void *) {
|
int dlclose(void *) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int dl_iterate_phdr(int (*)(void *, size_t, void *), void *) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
|
@ -3,8 +3,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define __COSMOPOLITAN_MAJOR__ 3
|
#define __COSMOPOLITAN_MAJOR__ 3
|
||||||
#define __COSMOPOLITAN_MINOR__ 7
|
#define __COSMOPOLITAN_MINOR__ 8
|
||||||
#define __COSMOPOLITAN_PATCH__ 1
|
#define __COSMOPOLITAN_PATCH__ 0
|
||||||
#define __COSMOPOLITAN__ \
|
#define __COSMOPOLITAN__ \
|
||||||
(100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
|
(100000000 * __COSMOPOLITAN_MAJOR__ + 1000000 * __COSMOPOLITAN_MINOR__ + \
|
||||||
__COSMOPOLITAN_PATCH__)
|
__COSMOPOLITAN_PATCH__)
|
||||||
|
@ -93,6 +93,30 @@
|
||||||
#include "libc/integral/llp64.inc"
|
#include "libc/integral/llp64.inc"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#undef __INT_FAST16_MAX__
|
||||||
|
#undef __INT_FAST16_TYPE__
|
||||||
|
#undef __UINT_FAST16_MAX__
|
||||||
|
#undef __INT_FAST16_WIDTH__
|
||||||
|
#undef __UINT_FAST16_TYPE__
|
||||||
|
|
||||||
|
#define __INT_FAST16_MAX__ 2147483647
|
||||||
|
#define __INT_FAST16_TYPE__ int
|
||||||
|
#define __UINT_FAST16_MAX__ 4294967295U
|
||||||
|
#define __INT_FAST16_WIDTH__ 32
|
||||||
|
#define __UINT_FAST16_TYPE__ unsigned int
|
||||||
|
|
||||||
|
#undef __INT_FAST32_MAX__
|
||||||
|
#undef __INT_FAST32_TYPE__
|
||||||
|
#undef __UINT_FAST32_MAX__
|
||||||
|
#undef __INT_FAST32_WIDTH__
|
||||||
|
#undef __UINT_FAST32_TYPE__
|
||||||
|
|
||||||
|
#define __INT_FAST32_MAX__ 2147483647
|
||||||
|
#define __INT_FAST32_TYPE__ int
|
||||||
|
#define __UINT_FAST32_MAX__ 4294967295U
|
||||||
|
#define __INT_FAST32_WIDTH__ 32
|
||||||
|
#define __UINT_FAST32_TYPE__ unsigned int
|
||||||
|
|
||||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||||
#ifdef __STDC__
|
#ifdef __STDC__
|
||||||
#include "libc/integral/c.inc"
|
#include "libc/integral/c.inc"
|
||||||
|
|
22
libc/intrin/personality.c
Normal file
22
libc/intrin/personality.c
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||||
|
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
|
||||||
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||||
|
│ Copyright 2024 Justine Alexandra Roberts Tunney │
|
||||||
|
│ │
|
||||||
|
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||||
|
│ any purpose with or without fee is hereby granted, provided that the │
|
||||||
|
│ above copyright notice and this permission notice appear in all copies. │
|
||||||
|
│ │
|
||||||
|
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||||
|
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||||
|
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||||
|
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||||
|
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||||
|
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||||
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||||
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||||
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||||
|
|
||||||
|
__attribute__((__weak__)) void __gxx_personality_v0() {
|
||||||
|
__builtin_trap();
|
||||||
|
}
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/ammintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/ammintrin.internal.h"
|
#include "third_party/intel/ammintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/amxcomplexintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/amxcomplexintrin.internal.h"
|
#include "third_party/intel/amxcomplexintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/amxfp16intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/amxfp16intrin.internal.h"
|
#include "third_party/intel/amxfp16intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_acle.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_acle.internal.h"
|
#include "third_party/aarch64/arm_acle.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_ACLE_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_bf16.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_bf16.internal.h"
|
#include "third_party/aarch64/arm_bf16.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_BF16_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_fp16.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_fp16.internal.h"
|
#include "third_party/aarch64/arm_fp16.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_FP16_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_neon.h"
|
||||||
|
#else
|
||||||
#include "third_party/aarch64/arm_neon.internal.h"
|
#include "third_party/aarch64/arm_neon.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_NEON_H_ */
|
||||||
|
|
8
libc/isystem/arm_sve.h
Normal file
8
libc/isystem/arm_sve.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
|
||||||
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_sve.h"
|
||||||
|
#else
|
||||||
|
#include "third_party/aarch64/arm_sve.internal.h"
|
||||||
|
#endif
|
||||||
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_SVE_H_ */
|
8
libc/isystem/arm_vector_types.h
Normal file
8
libc/isystem/arm_vector_types.h
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
|
||||||
|
#define COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/aarch64/clang/arm_vector_types.h"
|
||||||
|
#else
|
||||||
|
#include "third_party/aarch64/arm_vector_types.internal.h"
|
||||||
|
#endif
|
||||||
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_ARM_VECTOR_TYPES_H_ */
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxifmaintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxifmaintrin.internal.h"
|
#include "third_party/intel/avxifmaintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxneconvertintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxneconvertintrin.internal.h"
|
#include "third_party/intel/avxneconvertintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxvnniint16intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxvnniint16intrin.internal.h"
|
#include "third_party/intel/avxvnniint16intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/avxvnniint8intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/avxvnniint8intrin.internal.h"
|
#include "third_party/intel/avxvnniint8intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/clzerointrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/clzerointrin.internal.h"
|
#include "third_party/intel/clzerointrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/cmpccxaddintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/cmpccxaddintrin.internal.h"
|
#include "third_party/intel/cmpccxaddintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/emmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/emmintrin.internal.h"
|
#include "third_party/intel/emmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/immintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/immintrin.internal.h"
|
#include "third_party/intel/immintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mm_malloc.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mm_malloc.internal.h"
|
#include "third_party/intel/mm_malloc.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mmintrin.internal.h"
|
#include "third_party/intel/mmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/mwaitxintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/mwaitxintrin.internal.h"
|
#include "third_party/intel/mwaitxintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/nmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/nmmintrin.internal.h"
|
#include "third_party/intel/nmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/pmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/pmmintrin.internal.h"
|
#include "third_party/intel/pmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/popcntintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/popcntintrin.internal.h"
|
#include "third_party/intel/popcntintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/prfchiintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/prfchiintrin.internal.h"
|
#include "third_party/intel/prfchiintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/raointintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/raointintrin.internal.h"
|
#include "third_party/intel/raointintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sgxintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sgxintrin.internal.h"
|
#include "third_party/intel/sgxintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sha512intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sha512intrin.internal.h"
|
#include "third_party/intel/sha512intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sm3intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sm3intrin.internal.h"
|
#include "third_party/intel/sm3intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/sm4intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/sm4intrin.internal.h"
|
#include "third_party/intel/sm4intrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/smmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/smmintrin.internal.h"
|
#include "third_party/intel/smmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/tmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/tmmintrin.internal.h"
|
#include "third_party/intel/tmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1 +1,5 @@
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/usermsrintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/usermsrintrin.internal.h"
|
#include "third_party/intel/usermsrintrin.internal.h"
|
||||||
|
#endif
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/wmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/wmmintrin.internal.h"
|
#include "third_party/intel/wmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/x86intrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/x86intrin.internal.h"
|
#include "third_party/intel/x86intrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -1,4 +1,8 @@
|
||||||
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
||||||
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
|
||||||
|
#ifdef __clang__
|
||||||
|
#include "third_party/intel/clang/xmmintrin.h"
|
||||||
|
#else
|
||||||
#include "third_party/intel/xmmintrin.internal.h"
|
#include "third_party/intel/xmmintrin.internal.h"
|
||||||
|
#endif
|
||||||
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */
|
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */
|
||||||
|
|
|
@ -42,7 +42,8 @@ $(LIBC_MEM_A_OBJS): private \
|
||||||
COPTS += \
|
COPTS += \
|
||||||
-fno-sanitize=all \
|
-fno-sanitize=all \
|
||||||
-Wframe-larger-than=4096 \
|
-Wframe-larger-than=4096 \
|
||||||
-Walloca-larger-than=4096
|
-Walloca-larger-than=4096 \
|
||||||
|
-fexceptions
|
||||||
|
|
||||||
o/$(MODE)/libc/mem/asan.o: private \
|
o/$(MODE)/libc/mem/asan.o: private \
|
||||||
CFLAGS += \
|
CFLAGS += \
|
||||||
|
|
|
@ -7,10 +7,10 @@ void *bsearch(const void *, const void *, size_t, size_t,
|
||||||
void *bsearch_r(const void *, const void *, size_t, size_t,
|
void *bsearch_r(const void *, const void *, size_t, size_t,
|
||||||
int (*)(const void *, const void *, void *), void *)
|
int (*)(const void *, const void *, void *), void *)
|
||||||
paramsnonnull((1, 2, 5)) nosideeffect;
|
paramsnonnull((1, 2, 5)) nosideeffect;
|
||||||
void qsort3(void *, size_t, size_t,
|
void qsort3(void *, size_t, size_t, int (*)(const void *, const void *))
|
||||||
int (*)(const void *, const void *)) libcesque paramsnonnull();
|
paramsnonnull();
|
||||||
void qsort(void *, size_t, size_t,
|
void qsort(void *, size_t, size_t, int (*)(const void *, const void *))
|
||||||
int (*)(const void *, const void *)) libcesque paramsnonnull();
|
paramsnonnull();
|
||||||
void qsort_r(void *, size_t, size_t,
|
void qsort_r(void *, size_t, size_t,
|
||||||
int (*)(const void *, const void *, void *), void *)
|
int (*)(const void *, const void *, void *), void *)
|
||||||
paramsnonnull((1, 4));
|
paramsnonnull((1, 4));
|
||||||
|
|
|
@ -76,9 +76,9 @@
|
||||||
#define FLAGS_PRECISION 0x20
|
#define FLAGS_PRECISION 0x20
|
||||||
#define FLAGS_ISSIGNED 0x40
|
#define FLAGS_ISSIGNED 0x40
|
||||||
#define FLAGS_NOQUOTE 0x80
|
#define FLAGS_NOQUOTE 0x80
|
||||||
|
#define FLAGS_REPR 0x100
|
||||||
#define FLAGS_QUOTE FLAGS_SPACE
|
#define FLAGS_QUOTE FLAGS_SPACE
|
||||||
#define FLAGS_GROUPING FLAGS_NOQUOTE
|
#define FLAGS_GROUPING FLAGS_NOQUOTE
|
||||||
#define FLAGS_REPR FLAGS_PLUS
|
|
||||||
|
|
||||||
#define __FMT_PUT(C) \
|
#define __FMT_PUT(C) \
|
||||||
do { \
|
do { \
|
||||||
|
|
|
@ -105,7 +105,7 @@ int main(int argc, char *argv[]) {
|
||||||
__log_level = kLogInfo;
|
__log_level = kLogInfo;
|
||||||
GetOpts(argc, argv);
|
GetOpts(argc, argv);
|
||||||
|
|
||||||
for (fd = 3; fd < 10; ++fd) {
|
for (fd = 3; fd < 100; ++fd) {
|
||||||
close(fd);
|
close(fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
248
test/ctl/shared_ptr_test.cc
Normal file
248
test/ctl/shared_ptr_test.cc
Normal file
|
@ -0,0 +1,248 @@
|
||||||
|
// -*- mode:c++; indent-tabs-mode:nil; c-basic-offset:4; coding:utf-8 -*-
|
||||||
|
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
||||||
|
//
|
||||||
|
// Copyright 2024 Justine Alexandra Roberts Tunney
|
||||||
|
//
|
||||||
|
// Permission to use, copy, modify, and/or distribute this software for
|
||||||
|
// any purpose with or without fee is hereby granted, provided that the
|
||||||
|
// above copyright notice and this permission notice appear in all copies.
|
||||||
|
//
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
|
||||||
|
// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
|
||||||
|
// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
|
||||||
|
// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
|
||||||
|
// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
||||||
|
// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||||
|
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||||
|
// PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
|
||||||
|
#include "ctl/shared_ptr.h"
|
||||||
|
#include "ctl/vector.h"
|
||||||
|
#include "libc/mem/leaks.h"
|
||||||
|
|
||||||
|
// #include <memory>
|
||||||
|
// #include <vector>
|
||||||
|
// #define ctl std
|
||||||
|
|
||||||
|
using ctl::bad_weak_ptr;
|
||||||
|
using ctl::make_shared;
|
||||||
|
using ctl::move;
|
||||||
|
using ctl::shared_ptr;
|
||||||
|
using ctl::unique_ptr;
|
||||||
|
using ctl::vector;
|
||||||
|
using ctl::weak_ptr;
|
||||||
|
|
||||||
|
#undef ctl
|
||||||
|
|
||||||
|
static int g = 0;
|
||||||
|
|
||||||
|
struct ConstructG
|
||||||
|
{
|
||||||
|
ConstructG()
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct DestructG
|
||||||
|
{
|
||||||
|
~DestructG()
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct CallG
|
||||||
|
{
|
||||||
|
void operator()(auto*) const noexcept
|
||||||
|
{
|
||||||
|
++g;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Base
|
||||||
|
{};
|
||||||
|
|
||||||
|
struct Derived : Base
|
||||||
|
{};
|
||||||
|
|
||||||
|
int
|
||||||
|
main()
|
||||||
|
{
|
||||||
|
int a, b;
|
||||||
|
|
||||||
|
{
|
||||||
|
// Shouldn't cause memory leaks.
|
||||||
|
shared_ptr<int> x(new int(5));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Objects get destroyed when the last shared_ptr is reset.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 1;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// Weak pointers don't prevent object destruction.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// Weak pointers can be promoted to shared pointers.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
auto z = y.lock();
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 4;
|
||||||
|
y.reset();
|
||||||
|
if (g)
|
||||||
|
return 5;
|
||||||
|
z.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 6;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Shared null pointers are falsey.
|
||||||
|
shared_ptr<int> x;
|
||||||
|
if (x)
|
||||||
|
return 7;
|
||||||
|
x.reset(new int);
|
||||||
|
if (!x)
|
||||||
|
return 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can cast a shared pointer validly.
|
||||||
|
shared_ptr<Derived> x(new Derived);
|
||||||
|
shared_ptr<Base> y(x);
|
||||||
|
// But not invalidly:
|
||||||
|
// shared_ptr<Base> x(new Derived);
|
||||||
|
// shared_ptr<Derived> y(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can cast a shared pointer to void to retain a reference.
|
||||||
|
shared_ptr<int> x(new int);
|
||||||
|
shared_ptr<void> y(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can also create a shared pointer to void in the first place.
|
||||||
|
shared_ptr<void> x(new int);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// You can take a shared pointer to a subobject, and it will free the
|
||||||
|
// base object.
|
||||||
|
shared_ptr<vector<int>> x(new vector<int>);
|
||||||
|
x->push_back(5);
|
||||||
|
shared_ptr<int> y(x, &x->at(0));
|
||||||
|
x.reset();
|
||||||
|
if (*y != 5)
|
||||||
|
return 9;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// You can create a shared_ptr from a unique_ptr.
|
||||||
|
unique_ptr<int, CallG> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(move(x));
|
||||||
|
if (x)
|
||||||
|
return 10;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 11;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
g = 0;
|
||||||
|
// You can reassign shared_ptrs.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y;
|
||||||
|
y = x;
|
||||||
|
x.reset();
|
||||||
|
if (g)
|
||||||
|
return 12;
|
||||||
|
y.reset();
|
||||||
|
if (g != 1)
|
||||||
|
return 13;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// owner_before works across shared and weak pointers.
|
||||||
|
shared_ptr<int> x(&a, CallG());
|
||||||
|
shared_ptr<int> y(&b, CallG());
|
||||||
|
if (!x.owner_before(y))
|
||||||
|
return 14;
|
||||||
|
if (!x.owner_before(weak_ptr<int>(y)))
|
||||||
|
return 15;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Use counts work like you'd expect
|
||||||
|
shared_ptr<int> x(new int);
|
||||||
|
if (x.use_count() != 1)
|
||||||
|
return 16;
|
||||||
|
shared_ptr<int> y(x);
|
||||||
|
if (x.use_count() != 2 || y.use_count() != 2)
|
||||||
|
return 17;
|
||||||
|
x.reset();
|
||||||
|
if (x.use_count() != 0 || y.use_count() != 1)
|
||||||
|
return 18;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// There is a make_shared that will allocate an object for you safely.
|
||||||
|
auto x = make_shared<int>(5);
|
||||||
|
if (!x)
|
||||||
|
return 19;
|
||||||
|
if (*x != 5)
|
||||||
|
return 20;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// Expired weak pointers lock to nullptr, and throw when promoted to
|
||||||
|
// shared pointer by constructor.
|
||||||
|
auto x = make_shared<int>();
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
x.reset();
|
||||||
|
if (y.lock())
|
||||||
|
return 21;
|
||||||
|
int caught = 0;
|
||||||
|
try {
|
||||||
|
shared_ptr<int> z(y);
|
||||||
|
} catch (bad_weak_ptr& e) {
|
||||||
|
caught = 1;
|
||||||
|
}
|
||||||
|
if (!caught)
|
||||||
|
return 22;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
// nullptr is always expired.
|
||||||
|
shared_ptr<int> x(nullptr);
|
||||||
|
weak_ptr<int> y(x);
|
||||||
|
if (!y.expired())
|
||||||
|
return 23;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(mrdomino): exercise threads / races. The reference count should be
|
||||||
|
// atomically maintained.
|
||||||
|
|
||||||
|
CheckForMemoryLeaks();
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -27,3 +27,11 @@ TEST(snprintf, testVeryLargePrecision) {
|
||||||
ASSERT_EQ(i, 9999);
|
ASSERT_EQ(i, 9999);
|
||||||
ASSERT_EQ(strlen(buf), 511);
|
ASSERT_EQ(strlen(buf), 511);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(snprintf, testPlusFlagOnChar) {
|
||||||
|
char buf[10] = {};
|
||||||
|
int i = snprintf(buf, sizeof(buf), "%+c", '=');
|
||||||
|
|
||||||
|
ASSERT_EQ(i, 1);
|
||||||
|
ASSERT_STREQ(buf, "=");
|
||||||
|
}
|
||||||
|
|
|
@ -48,6 +48,8 @@ TEST_LIBC_TINYMATH_DIRECTDEPS = \
|
||||||
THIRD_PARTY_DOUBLECONVERSION \
|
THIRD_PARTY_DOUBLECONVERSION \
|
||||||
THIRD_PARTY_GDTOA \
|
THIRD_PARTY_GDTOA \
|
||||||
THIRD_PARTY_LIBCXX \
|
THIRD_PARTY_LIBCXX \
|
||||||
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND \
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_DEPS := \
|
TEST_LIBC_TINYMATH_DEPS := \
|
||||||
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
||||||
|
|
|
@ -21,13 +21,6 @@
|
||||||
#include "libc/mem/mem.h"
|
#include "libc/mem/mem.h"
|
||||||
#include "libc/runtime/runtime.h"
|
#include "libc/runtime/runtime.h"
|
||||||
|
|
||||||
// this dontthrow keyword SHOULD break this test. it's probably passing
|
|
||||||
// because we're currently using SjLj exceptions. the day we can change
|
|
||||||
// things, remove `dontthrow` and this test will still be a useful help
|
|
||||||
extern "C" dontthrow void qsort_(void *, size_t, size_t,
|
|
||||||
int (*)(const void *,
|
|
||||||
const void *)) asm("qsort");
|
|
||||||
|
|
||||||
struct Resource {
|
struct Resource {
|
||||||
char *p;
|
char *p;
|
||||||
Resource() {
|
Resource() {
|
||||||
|
@ -60,7 +53,7 @@ int A[3] = {3, 2, 1};
|
||||||
int Work(void) {
|
int Work(void) {
|
||||||
Resource r;
|
Resource r;
|
||||||
pPoke(r.p);
|
pPoke(r.p);
|
||||||
qsort_(A, 3, sizeof(int), cmp);
|
qsort(A, 3, sizeof(int), cmp);
|
||||||
return A[0];
|
return A[0];
|
||||||
}
|
}
|
||||||
int (*pWork)(void) = Work;
|
int (*pWork)(void) = Work;
|
||||||
|
|
2
third_party/aarch64/BUILD.mk
vendored
2
third_party/aarch64/BUILD.mk
vendored
|
@ -3,4 +3,4 @@
|
||||||
|
|
||||||
PKGS += THIRD_PARTY_AARCH64
|
PKGS += THIRD_PARTY_AARCH64
|
||||||
THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
|
THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
|
||||||
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*)
|
THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*) $(wildcard third_party/aarch64/clang/*)
|
||||||
|
|
35
third_party/aarch64/clang/arm64intr.h
vendored
Normal file
35
third_party/aarch64/clang/arm64intr.h
vendored
Normal file
|
@ -0,0 +1,35 @@
|
||||||
|
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Only include this if we're compiling for the windows platform. */
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#include_next <arm64intr.h>
|
||||||
|
#else
|
||||||
|
|
||||||
|
#ifndef __ARM64INTR_H
|
||||||
|
#define __ARM64INTR_H
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
_ARM64_BARRIER_SY = 0xF,
|
||||||
|
_ARM64_BARRIER_ST = 0xE,
|
||||||
|
_ARM64_BARRIER_LD = 0xD,
|
||||||
|
_ARM64_BARRIER_ISH = 0xB,
|
||||||
|
_ARM64_BARRIER_ISHST = 0xA,
|
||||||
|
_ARM64_BARRIER_ISHLD = 0x9,
|
||||||
|
_ARM64_BARRIER_NSH = 0x7,
|
||||||
|
_ARM64_BARRIER_NSHST = 0x6,
|
||||||
|
_ARM64_BARRIER_NSHLD = 0x5,
|
||||||
|
_ARM64_BARRIER_OSH = 0x3,
|
||||||
|
_ARM64_BARRIER_OSHST = 0x2,
|
||||||
|
_ARM64_BARRIER_OSHLD = 0x1
|
||||||
|
} _ARM64INTR_BARRIER_TYPE;
|
||||||
|
|
||||||
|
#endif /* __ARM64INTR_H */
|
||||||
|
#endif /* _MSC_VER */
|
888
third_party/aarch64/clang/arm_acle.h
vendored
Normal file
888
third_party/aarch64/clang/arm_acle.h
vendored
Normal file
|
@ -0,0 +1,888 @@
|
||||||
|
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
* The Arm C Language Extensions specifications can be found in the following
|
||||||
|
* link: https://github.com/ARM-software/acle/releases
|
||||||
|
*
|
||||||
|
* The ACLE section numbers are subject to change. When consulting the
|
||||||
|
* specifications, it is recommended to search using section titles if
|
||||||
|
* the section numbers look outdated.
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_ACLE_H
|
||||||
|
#define __ARM_ACLE_H
|
||||||
|
|
||||||
|
#ifndef __ARM_ACLE
|
||||||
|
#error "ACLE intrinsics support not enabled."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
|
||||||
|
/* 7.3 Memory barriers */
|
||||||
|
#if !__has_builtin(__dmb)
|
||||||
|
#define __dmb(i) __builtin_arm_dmb(i)
|
||||||
|
#endif
|
||||||
|
#if !__has_builtin(__dsb)
|
||||||
|
#define __dsb(i) __builtin_arm_dsb(i)
|
||||||
|
#endif
|
||||||
|
#if !__has_builtin(__isb)
|
||||||
|
#define __isb(i) __builtin_arm_isb(i)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.4 Hints */
|
||||||
|
|
||||||
|
#if !__has_builtin(__wfi)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
|
||||||
|
__builtin_arm_wfi();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__wfe)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
|
||||||
|
__builtin_arm_wfe();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__sev)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
|
||||||
|
__builtin_arm_sev();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__sevl)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
|
||||||
|
__builtin_arm_sevl();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !__has_builtin(__yield)
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
|
||||||
|
__builtin_arm_yield();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __dbg(t) __builtin_arm_dbg(t)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
#define _CHKFEAT_GCS 1
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__chkfeat(uint64_t __features) {
|
||||||
|
return __builtin_arm_chkfeat(__features) ^ __features;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.5 Swap */
|
||||||
|
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__swp(uint32_t __x, volatile uint32_t *__p) {
|
||||||
|
uint32_t v;
|
||||||
|
do
|
||||||
|
v = __builtin_arm_ldrex(__p);
|
||||||
|
while (__builtin_arm_strex(__x, __p));
|
||||||
|
return v;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 7.6 Memory prefetch intrinsics */
|
||||||
|
/* 7.6.1 Data prefetch */
|
||||||
|
#define __pld(addr) __pldx(0, 0, 0, addr)
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __pldx(access_kind, cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, access_kind, 1)
|
||||||
|
#else
|
||||||
|
#define __pldx(access_kind, cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.6.2 Instruction prefetch */
|
||||||
|
#define __pli(addr) __plix(0, 0, addr)
|
||||||
|
|
||||||
|
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
|
||||||
|
#define __plix(cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, 0, 0)
|
||||||
|
#else
|
||||||
|
#define __plix(cache_level, retention_policy, addr) \
|
||||||
|
__builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 7.7 NOP */
|
||||||
|
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
|
||||||
|
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
|
||||||
|
__builtin_arm_nop();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 8 DATA-PROCESSING INTRINSICS */
|
||||||
|
/* 8.2 Miscellaneous data-processing intrinsics */
|
||||||
|
/* ROR */
|
||||||
|
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__ror(uint32_t __x, uint32_t __y) {
|
||||||
|
__y %= 32;
|
||||||
|
if (__y == 0)
|
||||||
|
return __x;
|
||||||
|
return (__x >> __y) | (__x << (32 - __y));
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__rorll(uint64_t __x, uint32_t __y) {
|
||||||
|
__y %= 64;
|
||||||
|
if (__y == 0)
|
||||||
|
return __x;
|
||||||
|
return (__x >> __y) | (__x << (64 - __y));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ROR for unsigned long: dispatches on the width of long (32-bit on ILP32
   targets, 64-bit on LP64 targets). */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
/* CLZ */
/* Count leading zero bits of a 32-bit value (CLZ instruction). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

/* Count leading zeros of an unsigned long, at whichever width long has. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

/* Count leading zeros of a 64-bit value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}
|
||||||
|
|
||||||
|
/* CLS */
/* Count leading sign bits of a 32-bit value (CLS instruction). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

/* Count leading sign bits of an unsigned long, at whichever width long has. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

/* Count leading sign bits of a 64-bit value. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}
|
||||||
|
|
||||||
|
/* REV */
/* Reverse the byte order of a 32-bit value. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

/* Reverse the byte order of an unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

/* Reverse the byte order of a 64-bit value. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}
|
||||||
|
|
||||||
|
/* REV16 */
/* Reverse the bytes within each 16-bit halfword of a 32-bit value:
   a full byte swap followed by a 16-bit rotate restores the halfword
   order while leaving the bytes inside each halfword swapped. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

/* 64-bit REV16: apply the 32-bit operation independently to each half. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

/* REV16 for unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}
|
||||||
|
|
||||||
|
/* REVSH */
|
||||||
|
/* REVSH: swap the two bytes of a signed 16-bit value, returning the
   byte-reversed halfword reinterpreted as signed. */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  uint16_t __u = (uint16_t)__t;
  return (int16_t)(uint16_t)((uint16_t)(__u >> 8) | (uint16_t)(__u << 8));
}
|
||||||
|
|
||||||
|
/* RBIT */
/* Reverse the bit order of a 32-bit value. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

/* Reverse the bit order of a 64-bit value.  AArch32 has no 64-bit RBIT, so
   compose it from two 32-bit reversals with the halves exchanged. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

/* RBIT for unsigned long, at whichever width long has. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
|
||||||
|
|
||||||
|
/* 8.3 16-bit multiplications */
/* Each wrapper maps to the DSP multiply instruction of the same name
   (b = bottom halfword operand, t = top halfword operand, w = word). */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
/* Saturate x into a signed (__ssat) or unsigned (__usat) bit-field of
   width y; y must be a compile-time constant for the underlying builtin. */
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
|
||||||
|
|
||||||
|
/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
/* Saturating 32-bit addition (QADD instruction). */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

/* Saturating 32-bit subtraction (QSUB instruction). */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

/* Saturating doubling, expressed as a saturating add of __t with itself. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
|
||||||
|
|
||||||
|
/* 8.4.3 Accumulating multiplications */
/* Each wrapper maps to the DSP multiply-accumulate instruction of the same
   name (b = bottom halfword, t = top halfword, w = word); __c is the
   accumulator. */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Saturate each 16-bit lane of x to a signed (__ssat16) or unsigned
   (__usat16) bit-field of width y. */
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
|
||||||
|
|
||||||
|
/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* SIMD32 lane types: a 32-bit word viewed as four 8-bit or two 16-bit
   lanes.  The underlying representation is just a (u)int32_t. */
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

/* These wrap the extend/extend-and-add instructions of the same name
   (SXTAB16/SXTB16/UXTAB16/UXTB16). */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Byte-wise select between __a and __b (SEL instruction). */
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.7 Parallel 8-bit addition and subtraction */
/* Four-lane byte operations; each wrapper maps to the SIMD32 instruction
   of the same name (q = saturating, sh/uh = signed/unsigned halving,
   uq = unsigned saturating). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Sum of absolute differences of the four byte lanes (USAD8). */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
/* Same, accumulated into __c (USADA8). */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.9 Parallel 16-bit addition and subtraction */
/* Two-lane halfword operations; each wrapper maps to the SIMD32
   instruction of the same name (q = saturating, sh/uh = signed/unsigned
   halving, uq = unsigned saturating; asx/sax = exchange-then-add-subtract
   / subtract-add cross forms). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.5.10 Parallel 16-bit multiplication */
/* Dual halfword multiply (and accumulate) operations; each wrapper maps
   to the SIMD32 instruction of the same name (d = dual, l = long 64-bit
   accumulator, s = subtract of products, x = exchanged operand lanes). */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
/* Round to nearest integral value, ties to even (roundeven). */
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

/* Single-precision variant. */
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.8 CRC32 intrinsics */
/* CRC32 (and CRC32C, Castagnoli polynomial) accumulation over a byte,
   halfword, word, or doubleword; __a is the running CRC, __b the new data.
   Each wrapper maps to the CRC instruction of the same name and requires
   the "crc" target feature. */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
|
||||||
|
|
||||||
|
/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* Convert a double to int32 with Javascript semantics; requires the
   "v8.3a" target feature. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif
|
||||||
|
|
||||||
|
/* Armv8.5-A FP rounding intrinsics */
/* Round to an integral value that fits in a 32-bit (rint32*) or 64-bit
   (rint64*) integer range; z = toward zero, x = using the current rounding
   mode.  Each wrapper maps to the FRINT* builtin of the same name and
   requires the "v8.5a" target feature. */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif
|
||||||
|
|
||||||
|
/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* 64 bytes of data, transferred as a unit by the LS64 instructions. */
typedef struct {
  uint64_t val[8];
} data512_t;

/* Load 64 bytes from __addr into a data512_t (LD64B); requires the
   "ls64" target feature. */
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
/* Store 64 bytes to __addr (ST64B). */
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
/* Store 64 bytes with a status result (ST64BV). */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
/* Store 64 bytes with a status result (ST64BV0 variant). */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
|
||||||
|
|
||||||
|
/* 11.1 Special register intrinsics */
/* Read (rsr*) and write (wsr*) system registers by name, at 32-bit, 64-bit,
   128-bit, and pointer widths. */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
/* Floating-point variants: bit-cast the raw register value to/from
   float/double rather than doing a numeric conversion. */
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
|
||||||
|
|
||||||
|
/* 10.3 MTE intrinsics */
/* Memory Tagging Extension helpers; each maps to the builtin for the
   corresponding instruction (IRG, ADDG, GMI, LDG, STG, SUBP). */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 memcpy family of operations intrinsics - MOPS */
/* Tag-setting memset from the Memory Copy/Set (MOPS) extension. */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
|
||||||
|
|
||||||
|
/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

/* Bit 0 of __ARM_FEATURE_COPROC: the basic coprocessor instructions
   (CDP, LDC/STC, MCR/MRC). */
#if (__ARM_FEATURE_COPROC & 0x1)

/* CDP is only defined here for pre-Armv8 architectures; the Armv8-M
   Mainline case is handled separately below. */
#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

/* The "long" LDC/STC forms are only defined here for pre-Armv8
   architectures other than Armv4. */
#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

/* Armv8-M Mainline and Armv8.1-M Mainline also get CDP and LDCL/STCL. */
#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

/* Bit 1: the "2" variants (CDP2, LDC2/STC2, MCR2/MRC2). */
#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

/* Bit 2: 64-bit transfers (MCRR/MRRC). */
#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

/* Bit 3: the 64-bit "2" transfers (MCRR2/MRRC2). */
#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC
|
||||||
|
|
||||||
|
/* 17 Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

/* Bit masks decoding the failure status returned on transaction abort. */
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

/* Start, commit, cancel, and test a transaction (TSTART/TCOMMIT/
   TCANCEL/TTEST). */
#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */
|
||||||
|
|
||||||
|
/* 8.7 Armv8.5-A Random number generation intrinsics */
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
|
||||||
|
__rndr(uint64_t *__p) {
|
||||||
|
return __builtin_arm_rndr(__p);
|
||||||
|
}
|
||||||
|
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
|
||||||
|
__rndrrs(uint64_t *__p) {
|
||||||
|
return __builtin_arm_rndrrs(__p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* 11.2 Guarded Control Stack intrinsics */
|
||||||
|
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
|
||||||
|
static __inline__ void * __attribute__((__always_inline__, __nodebug__))
|
||||||
|
__gcspr() {
|
||||||
|
return (void *)__builtin_arm_rsr64("gcspr_el0");
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
|
||||||
|
__gcspopm() {
|
||||||
|
return __builtin_arm_gcspopm(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
|
||||||
|
__gcsss(const void *__stack) {
|
||||||
|
return __builtin_arm_gcsss(__stack);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* __ARM_ACLE_H */
|
20
third_party/aarch64/clang/arm_bf16.h
vendored
Normal file
20
third_party/aarch64/clang/arm_bf16.h
vendored
Normal file
|
@ -0,0 +1,20 @@
|
||||||
|
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_BF16_H
|
||||||
|
#define __ARM_BF16_H
|
||||||
|
|
||||||
|
typedef __bf16 bfloat16_t;
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
|
||||||
|
#endif
|
410
third_party/aarch64/clang/arm_cde.h
vendored
Normal file
410
third_party/aarch64/clang/arm_cde.h
vendored
Normal file
|
@ -0,0 +1,410 @@
|
||||||
|
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_CDE_H
|
||||||
|
#define __ARM_CDE_H
|
||||||
|
|
||||||
|
#if !__ARM_FEATURE_CDE
|
||||||
|
#error "CDE support not enabled"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
|
||||||
|
uint32_t __arm_cx1(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
|
||||||
|
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
|
||||||
|
uint64_t __arm_cx1d(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
|
||||||
|
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
|
||||||
|
uint32_t __arm_cx2(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
|
||||||
|
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
|
||||||
|
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
|
||||||
|
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
|
||||||
|
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
|
||||||
|
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
|
||||||
|
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
|
||||||
|
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
|
||||||
|
uint32_t __arm_vcx1_u32(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
|
||||||
|
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
|
||||||
|
uint64_t __arm_vcx1d_u64(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
|
||||||
|
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
|
||||||
|
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
|
||||||
|
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
|
||||||
|
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
|
||||||
|
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
|
||||||
|
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
|
||||||
|
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
|
||||||
|
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
|
||||||
|
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
|
||||||
|
|
||||||
|
#if __ARM_FEATURE_MVE
|
||||||
|
|
||||||
|
typedef uint16_t mve_pred16_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
|
||||||
|
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
|
||||||
|
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
|
||||||
|
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
|
||||||
|
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
|
||||||
|
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
|
||||||
|
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
|
||||||
|
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
|
||||||
|
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
|
||||||
|
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
|
||||||
|
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
|
||||||
|
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
|
||||||
|
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
|
||||||
|
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
|
||||||
|
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
|
||||||
|
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
|
||||||
|
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
|
||||||
|
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
|
||||||
|
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
|
||||||
|
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
|
||||||
|
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
|
||||||
|
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
|
||||||
|
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
|
||||||
|
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
|
||||||
|
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
|
||||||
|
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
|
||||||
|
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
|
||||||
|
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
|
||||||
|
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
|
||||||
|
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
|
||||||
|
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
|
||||||
|
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
|
||||||
|
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
|
||||||
|
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
|
||||||
|
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
|
||||||
|
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
|
||||||
|
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
|
||||||
|
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
|
||||||
|
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
|
||||||
|
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
|
||||||
|
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
|
||||||
|
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
|
||||||
|
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
|
||||||
|
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
|
||||||
|
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
|
||||||
|
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
|
||||||
|
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
|
||||||
|
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
|
||||||
|
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||||
|
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
|
||||||
|
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||||
|
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||||
|
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
|
||||||
|
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||||
|
|
||||||
|
#endif /* __ARM_FEATURE_MVE */
|
||||||
|
|
||||||
|
#if __ARM_FEATURE_MVE & 2
|
||||||
|
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
typedef float float32_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
|
||||||
|
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
|
||||||
|
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
|
||||||
|
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
|
||||||
|
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
|
||||||
|
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
|
||||||
|
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
|
||||||
|
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
|
||||||
|
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
|
||||||
|
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
|
||||||
|
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
|
||||||
|
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
|
||||||
|
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
|
||||||
|
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
|
||||||
|
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
|
||||||
|
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
|
||||||
|
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
|
||||||
|
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
|
||||||
|
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
|
||||||
|
|
||||||
|
#endif /* __ARM_FEATURE_MVE & 2 */
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} /* extern "C" */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* __ARM_CDE_H */
|
217
third_party/aarch64/clang/arm_cmse.h
vendored
Normal file
217
third_party/aarch64/clang/arm_cmse.h
vendored
Normal file
|
@ -0,0 +1,217 @@
|
||||||
|
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
|
||||||
|
//
|
||||||
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
// See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#ifndef __ARM_CMSE_H
|
||||||
|
#define __ARM_CMSE_H
|
||||||
|
|
||||||
|
#if (__ARM_FEATURE_CMSE & 0x1)
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
|
||||||
|
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
|
||||||
|
#define CMSE_AU_NONSECURE 2 /* checks if permissions have secure field unset */
|
||||||
|
#define CMSE_MPU_UNPRIV 4 /* sets T flag on TT insrtuction */
|
||||||
|
#define CMSE_MPU_READ 8 /* checks if read_ok field is set */
|
||||||
|
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
|
||||||
|
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
|
||||||
|
|
||||||
|
#define cmse_check_pointed_object(p, f) \
|
||||||
|
cmse_check_address_range((p), sizeof(*(p)), (f))
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef union {
|
||||||
|
struct cmse_address_info {
|
||||||
|
#ifdef __ARM_BIG_ENDIAN
|
||||||
|
/* __ARM_BIG_ENDIAN */
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned idau_region : 8;
|
||||||
|
unsigned idau_region_valid : 1;
|
||||||
|
unsigned secure : 1;
|
||||||
|
unsigned nonsecure_readwrite_ok : 1;
|
||||||
|
unsigned nonsecure_read_ok : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 12;
|
||||||
|
#endif
|
||||||
|
unsigned readwrite_ok : 1;
|
||||||
|
unsigned read_ok : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region_valid : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 1;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region_valid : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 8;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region : 8;
|
||||||
|
|
||||||
|
#else /* __ARM_LITTLE_ENDIAN */
|
||||||
|
unsigned mpu_region : 8;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 8;
|
||||||
|
#endif
|
||||||
|
unsigned mpu_region_valid : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned sau_region_valid : 1;
|
||||||
|
#else
|
||||||
|
unsigned : 1;
|
||||||
|
#endif
|
||||||
|
unsigned read_ok : 1;
|
||||||
|
unsigned readwrite_ok : 1;
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
unsigned nonsecure_read_ok : 1;
|
||||||
|
unsigned nonsecure_readwrite_ok : 1;
|
||||||
|
unsigned secure : 1;
|
||||||
|
unsigned idau_region_valid : 1;
|
||||||
|
unsigned idau_region : 8;
|
||||||
|
#else
|
||||||
|
unsigned : 12;
|
||||||
|
#endif
|
||||||
|
#endif /*__ARM_LITTLE_ENDIAN */
|
||||||
|
} flags;
|
||||||
|
unsigned value;
|
||||||
|
} cmse_address_info_t;
|
||||||
|
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTA(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTA(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_TTAT(void *__p) {
|
||||||
|
cmse_address_info_t __u;
|
||||||
|
__u.value = __builtin_arm_cmse_TTAT(__p);
|
||||||
|
return __u;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
|
||||||
|
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
|
||||||
|
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void *__attribute__((__always_inline__))
|
||||||
|
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
|
||||||
|
uintptr_t __begin = (uintptr_t)__pb;
|
||||||
|
uintptr_t __end = __begin + __s - 1;
|
||||||
|
|
||||||
|
if (__end < __begin)
|
||||||
|
return NULL; /* wrap around check */
|
||||||
|
|
||||||
|
/* Check whether the range crosses a 32-bytes aligned address */
|
||||||
|
const int __single_check = (__begin ^ __end) < 0x20u;
|
||||||
|
|
||||||
|
/* execute the right variant of the TT instructions */
|
||||||
|
void *__pe = (void *)__end;
|
||||||
|
cmse_address_info_t __permb, __perme;
|
||||||
|
switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
|
||||||
|
case 0:
|
||||||
|
__permb = cmse_TT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TT(__pe);
|
||||||
|
break;
|
||||||
|
case CMSE_MPU_UNPRIV:
|
||||||
|
__permb = cmse_TTT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTT(__pe);
|
||||||
|
break;
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
case CMSE_MPU_NONSECURE:
|
||||||
|
__permb = cmse_TTA(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTA(__pe);
|
||||||
|
break;
|
||||||
|
case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
|
||||||
|
__permb = cmse_TTAT(__pb);
|
||||||
|
__perme = __single_check ? __permb : cmse_TTAT(__pe);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
/* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* check that the range does not cross MPU, SAU, or IDAU region boundaries */
|
||||||
|
if (__permb.value != __perme.value)
|
||||||
|
return NULL;
|
||||||
|
#if !(__ARM_CMSE_SECURE_MODE)
|
||||||
|
/* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
|
||||||
|
if (__flags & CMSE_AU_NONSECURE)
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* check the permission on the range */
|
||||||
|
switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
|
||||||
|
#if (__ARM_CMSE_SECURE_MODE)
|
||||||
|
case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
|
||||||
|
case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_MPU_READ | CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.nonsecure_read_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_AU_NONSECURE:
|
||||||
|
return __permb.flags.secure ? NULL : __pb;
|
||||||
|
#endif
|
||||||
|
case CMSE_MPU_READ | CMSE_MPU_READWRITE:
|
||||||
|
case CMSE_MPU_READWRITE:
|
||||||
|
return __permb.flags.readwrite_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
case CMSE_MPU_READ:
|
||||||
|
return __permb.flags.read_ok ? __pb : NULL;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if __ARM_CMSE_SECURE_MODE
|
||||||
|
static int __attribute__((__always_inline__, __nodebug__))
|
||||||
|
cmse_nonsecure_caller(void) {
|
||||||
|
return !((uintptr_t)__builtin_return_address(0) & 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define cmse_nsfptr_create(p) \
|
||||||
|
__builtin_bit_cast(__typeof__(p), \
|
||||||
|
(__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
|
||||||
|
|
||||||
|
#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
|
||||||
|
|
||||||
|
#endif /* __ARM_CMSE_SECURE_MODE */
|
||||||
|
|
||||||
|
void __attribute__((__noreturn__)) cmse_abort(void);
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* (__ARM_FEATURE_CMSE & 0x1) */
|
||||||
|
|
||||||
|
#endif /* __ARM_CMSE_H */
|
596
third_party/aarch64/clang/arm_fp16.h
vendored
Normal file
596
third_party/aarch64/clang/arm_fp16.h
vendored
Normal file
|
@ -0,0 +1,596 @@
|
||||||
|
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in
|
||||||
|
* all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
* THE SOFTWARE.
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_FP16_H
|
||||||
|
#define __ARM_FP16_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
#define vabdh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vabsh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vaddh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcageh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcagth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcalth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vceqh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vceqzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgezh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcgtzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcleh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vclezh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vclth_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcltzh_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtah_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u32(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s32(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_u64(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_f16_s64(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int32_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int64_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
uint16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
int16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s16_f16(__p0) __extension__ ({ \
|
||||||
|
int16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s32_f16(__p0) __extension__ ({ \
|
||||||
|
int32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_s64_f16(__p0) __extension__ ({ \
|
||||||
|
int64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u16_f16(__p0) __extension__ ({ \
|
||||||
|
uint16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u32_f16(__p0) __extension__ ({ \
|
||||||
|
uint32_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vcvtph_u64_f16(__p0) __extension__ ({ \
|
||||||
|
uint64_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vdivh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
float16_t __s2 = __p2; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
float16_t __s2 = __p2; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vminh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmulh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vnegh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpeh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrecpxh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndah_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndih_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndmh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndnh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndph_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrndxh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrsqrteh_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vsqrth_f16(__p0) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#define vsubh_f16(__p0, __p1) __extension__ ({ \
|
||||||
|
float16_t __ret; \
|
||||||
|
float16_t __s0 = __p0; \
|
||||||
|
float16_t __s1 = __p1; \
|
||||||
|
__ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
|
||||||
|
__ret; \
|
||||||
|
})
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
|
||||||
|
#endif /* __ARM_FP16_H */
|
19187
third_party/aarch64/clang/arm_mve.h
vendored
Normal file
19187
third_party/aarch64/clang/arm_mve.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
69638
third_party/aarch64/clang/arm_neon.h
vendored
Normal file
69638
third_party/aarch64/clang/arm_neon.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
182
third_party/aarch64/clang/arm_neon_sve_bridge.h
vendored
Normal file
182
third_party/aarch64/clang/arm_neon_sve_bridge.h
vendored
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ARM_NEON_SVE_BRIDGE_H
|
||||||
|
#define __ARM_NEON_SVE_BRIDGE_H
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Function attributes */
|
||||||
|
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||||
|
#define __aio \
|
||||||
|
static __inline__ \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __overloadable__))
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||||
|
svint8_t svset_neonq(svint8_t, int8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||||
|
svint16_t svset_neonq(svint16_t, int16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||||
|
svint32_t svset_neonq(svint32_t, int32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||||
|
svint64_t svset_neonq(svint64_t, int64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||||
|
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||||
|
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||||
|
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||||
|
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||||
|
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||||
|
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||||
|
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||||
|
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||||
|
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||||
|
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||||
|
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||||
|
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||||
|
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||||
|
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||||
|
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||||
|
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||||
|
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||||
|
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
|
||||||
|
int8x16_t svget_neonq(svint8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
|
||||||
|
int16x8_t svget_neonq(svint16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
|
||||||
|
int32x4_t svget_neonq(svint32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
|
||||||
|
int64x2_t svget_neonq(svint64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
|
||||||
|
uint8x16_t svget_neonq(svuint8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
|
||||||
|
uint16x8_t svget_neonq(svuint16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
|
||||||
|
uint32x4_t svget_neonq(svuint32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
|
||||||
|
uint64x2_t svget_neonq(svuint64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
|
||||||
|
float16x8_t svget_neonq(svfloat16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
|
||||||
|
float32x4_t svget_neonq(svfloat32_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
|
||||||
|
float64x2_t svget_neonq(svfloat64_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
|
||||||
|
int8x16_t svget_neonq_s8(svint8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
|
||||||
|
int16x8_t svget_neonq_s16(svint16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
|
||||||
|
int32x4_t svget_neonq_s32(svint32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
|
||||||
|
int64x2_t svget_neonq_s64(svint64_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
|
||||||
|
uint8x16_t svget_neonq_u8(svuint8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
|
||||||
|
uint16x8_t svget_neonq_u16(svuint16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
|
||||||
|
uint32x4_t svget_neonq_u32(svuint32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
|
||||||
|
uint64x2_t svget_neonq_u64(svuint64_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
|
||||||
|
float16x8_t svget_neonq_f16(svfloat16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
|
||||||
|
float32x4_t svget_neonq_f32(svfloat32_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
|
||||||
|
float64x2_t svget_neonq_f64(svfloat64_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
||||||
|
svint8_t svdup_neonq(int8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
||||||
|
svint16_t svdup_neonq(int16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
||||||
|
svint32_t svdup_neonq(int32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
||||||
|
svint64_t svdup_neonq(int64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
||||||
|
svuint8_t svdup_neonq(uint8x16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
||||||
|
svuint16_t svdup_neonq(uint16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
||||||
|
svuint32_t svdup_neonq(uint32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
||||||
|
svuint64_t svdup_neonq(uint64x2_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
||||||
|
svfloat16_t svdup_neonq(float16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
||||||
|
svfloat32_t svdup_neonq(float32x4_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
||||||
|
svfloat64_t svdup_neonq(float64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
|
||||||
|
svint8_t svdup_neonq_s8(int8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
|
||||||
|
svint16_t svdup_neonq_s16(int16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
|
||||||
|
svint32_t svdup_neonq_s32(int32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
|
||||||
|
svint64_t svdup_neonq_s64(int64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
|
||||||
|
svuint8_t svdup_neonq_u8(uint8x16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
|
||||||
|
svuint16_t svdup_neonq_u16(uint16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
|
||||||
|
svuint32_t svdup_neonq_u32(uint32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
|
||||||
|
svuint64_t svdup_neonq_u64(uint64x2_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
|
||||||
|
svfloat16_t svdup_neonq_f16(float16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
|
||||||
|
svfloat32_t svdup_neonq_f32(float32x4_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
|
||||||
|
svfloat64_t svdup_neonq_f64(float64x2_t);
|
||||||
|
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
||||||
|
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
|
||||||
|
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
||||||
|
bfloat16x8_t svget_neonq(svbfloat16_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
|
||||||
|
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
|
||||||
|
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
||||||
|
svbfloat16_t svdup_neonq(bfloat16x8_t);
|
||||||
|
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
|
||||||
|
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
|
||||||
|
|
||||||
|
#undef __ai
|
||||||
|
#undef __aio
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif //__ARM_NEON_SVE_BRIDGE_H
|
2819
third_party/aarch64/clang/arm_sme.h
vendored
Normal file
2819
third_party/aarch64/clang/arm_sme.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
30537
third_party/aarch64/clang/arm_sve.h
vendored
Normal file
30537
third_party/aarch64/clang/arm_sve.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
345
third_party/aarch64/clang/arm_vector_types.h
vendored
Normal file
345
third_party/aarch64/clang/arm_vector_types.h
vendored
Normal file
|
@ -0,0 +1,345 @@
|
||||||
|
/*===---- arm_vector_types - ARM vector type ------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
|
||||||
|
#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#ifndef __ARM_NEON_TYPES_H
|
||||||
|
#define __ARM_NEON_TYPES_H
|
||||||
|
typedef float float32_t;
|
||||||
|
typedef __fp16 float16_t;
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef double float64_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
|
||||||
|
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
|
||||||
|
typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct int8x8x2_t {
|
||||||
|
int8x8_t val[2];
|
||||||
|
} int8x8x2_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x2_t {
|
||||||
|
int8x16_t val[2];
|
||||||
|
} int8x16x2_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x2_t {
|
||||||
|
int16x4_t val[2];
|
||||||
|
} int16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x2_t {
|
||||||
|
int16x8_t val[2];
|
||||||
|
} int16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x2_t {
|
||||||
|
int32x2_t val[2];
|
||||||
|
} int32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x2_t {
|
||||||
|
int32x4_t val[2];
|
||||||
|
} int32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x2_t {
|
||||||
|
int64x1_t val[2];
|
||||||
|
} int64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x2_t {
|
||||||
|
int64x2_t val[2];
|
||||||
|
} int64x2x2_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x2_t {
|
||||||
|
uint8x8_t val[2];
|
||||||
|
} uint8x8x2_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x2_t {
|
||||||
|
uint8x16_t val[2];
|
||||||
|
} uint8x16x2_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x2_t {
|
||||||
|
uint16x4_t val[2];
|
||||||
|
} uint16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x2_t {
|
||||||
|
uint16x8_t val[2];
|
||||||
|
} uint16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x2_t {
|
||||||
|
uint32x2_t val[2];
|
||||||
|
} uint32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x2_t {
|
||||||
|
uint32x4_t val[2];
|
||||||
|
} uint32x4x2_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x2_t {
|
||||||
|
uint64x1_t val[2];
|
||||||
|
} uint64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x2_t {
|
||||||
|
uint64x2_t val[2];
|
||||||
|
} uint64x2x2_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x2_t {
|
||||||
|
float16x4_t val[2];
|
||||||
|
} float16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x2_t {
|
||||||
|
float16x8_t val[2];
|
||||||
|
} float16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x2_t {
|
||||||
|
float32x2_t val[2];
|
||||||
|
} float32x2x2_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x2_t {
|
||||||
|
float32x4_t val[2];
|
||||||
|
} float32x4x2_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x2_t {
|
||||||
|
float64x1_t val[2];
|
||||||
|
} float64x1x2_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x2_t {
|
||||||
|
float64x2_t val[2];
|
||||||
|
} float64x2x2_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef struct int8x8x3_t {
|
||||||
|
int8x8_t val[3];
|
||||||
|
} int8x8x3_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x3_t {
|
||||||
|
int8x16_t val[3];
|
||||||
|
} int8x16x3_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x3_t {
|
||||||
|
int16x4_t val[3];
|
||||||
|
} int16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x3_t {
|
||||||
|
int16x8_t val[3];
|
||||||
|
} int16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x3_t {
|
||||||
|
int32x2_t val[3];
|
||||||
|
} int32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x3_t {
|
||||||
|
int32x4_t val[3];
|
||||||
|
} int32x4x3_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x3_t {
|
||||||
|
int64x1_t val[3];
|
||||||
|
} int64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x3_t {
|
||||||
|
int64x2_t val[3];
|
||||||
|
} int64x2x3_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x3_t {
|
||||||
|
uint8x8_t val[3];
|
||||||
|
} uint8x8x3_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x3_t {
|
||||||
|
uint8x16_t val[3];
|
||||||
|
} uint8x16x3_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x3_t {
|
||||||
|
uint16x4_t val[3];
|
||||||
|
} uint16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x3_t {
|
||||||
|
uint16x8_t val[3];
|
||||||
|
} uint16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x3_t {
|
||||||
|
uint32x2_t val[3];
|
||||||
|
} uint32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x3_t {
|
||||||
|
uint32x4_t val[3];
|
||||||
|
} uint32x4x3_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x3_t {
|
||||||
|
uint64x1_t val[3];
|
||||||
|
} uint64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x3_t {
|
||||||
|
uint64x2_t val[3];
|
||||||
|
} uint64x2x3_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x3_t {
|
||||||
|
float16x4_t val[3];
|
||||||
|
} float16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x3_t {
|
||||||
|
float16x8_t val[3];
|
||||||
|
} float16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x3_t {
|
||||||
|
float32x2_t val[3];
|
||||||
|
} float32x2x3_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x3_t {
|
||||||
|
float32x4_t val[3];
|
||||||
|
} float32x4x3_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x3_t {
|
||||||
|
float64x1_t val[3];
|
||||||
|
} float64x1x3_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x3_t {
|
||||||
|
float64x2_t val[3];
|
||||||
|
} float64x2x3_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef struct int8x8x4_t {
|
||||||
|
int8x8_t val[4];
|
||||||
|
} int8x8x4_t;
|
||||||
|
|
||||||
|
typedef struct int8x16x4_t {
|
||||||
|
int8x16_t val[4];
|
||||||
|
} int8x16x4_t;
|
||||||
|
|
||||||
|
typedef struct int16x4x4_t {
|
||||||
|
int16x4_t val[4];
|
||||||
|
} int16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct int16x8x4_t {
|
||||||
|
int16x8_t val[4];
|
||||||
|
} int16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct int32x2x4_t {
|
||||||
|
int32x2_t val[4];
|
||||||
|
} int32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct int32x4x4_t {
|
||||||
|
int32x4_t val[4];
|
||||||
|
} int32x4x4_t;
|
||||||
|
|
||||||
|
typedef struct int64x1x4_t {
|
||||||
|
int64x1_t val[4];
|
||||||
|
} int64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct int64x2x4_t {
|
||||||
|
int64x2_t val[4];
|
||||||
|
} int64x2x4_t;
|
||||||
|
|
||||||
|
typedef struct uint8x8x4_t {
|
||||||
|
uint8x8_t val[4];
|
||||||
|
} uint8x8x4_t;
|
||||||
|
|
||||||
|
typedef struct uint8x16x4_t {
|
||||||
|
uint8x16_t val[4];
|
||||||
|
} uint8x16x4_t;
|
||||||
|
|
||||||
|
typedef struct uint16x4x4_t {
|
||||||
|
uint16x4_t val[4];
|
||||||
|
} uint16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct uint16x8x4_t {
|
||||||
|
uint16x8_t val[4];
|
||||||
|
} uint16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct uint32x2x4_t {
|
||||||
|
uint32x2_t val[4];
|
||||||
|
} uint32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct uint32x4x4_t {
|
||||||
|
uint32x4_t val[4];
|
||||||
|
} uint32x4x4_t;
|
||||||
|
|
||||||
|
typedef struct uint64x1x4_t {
|
||||||
|
uint64x1_t val[4];
|
||||||
|
} uint64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct uint64x2x4_t {
|
||||||
|
uint64x2_t val[4];
|
||||||
|
} uint64x2x4_t;
|
||||||
|
|
||||||
|
typedef struct float16x4x4_t {
|
||||||
|
float16x4_t val[4];
|
||||||
|
} float16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct float16x8x4_t {
|
||||||
|
float16x8_t val[4];
|
||||||
|
} float16x8x4_t;
|
||||||
|
|
||||||
|
typedef struct float32x2x4_t {
|
||||||
|
float32x2_t val[4];
|
||||||
|
} float32x2x4_t;
|
||||||
|
|
||||||
|
typedef struct float32x4x4_t {
|
||||||
|
float32x4_t val[4];
|
||||||
|
} float32x4x4_t;
|
||||||
|
|
||||||
|
#if defined(__aarch64__) || defined(__arm64ec__)
|
||||||
|
typedef struct float64x1x4_t {
|
||||||
|
float64x1_t val[4];
|
||||||
|
} float64x1x4_t;
|
||||||
|
|
||||||
|
typedef struct float64x2x4_t {
|
||||||
|
float64x2_t val[4];
|
||||||
|
} float64x2x4_t;
|
||||||
|
|
||||||
|
#endif
|
||||||
|
typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
|
||||||
|
typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x2_t {
|
||||||
|
bfloat16x4_t val[2];
|
||||||
|
} bfloat16x4x2_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x2_t {
|
||||||
|
bfloat16x8_t val[2];
|
||||||
|
} bfloat16x8x2_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x3_t {
|
||||||
|
bfloat16x4_t val[3];
|
||||||
|
} bfloat16x4x3_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x3_t {
|
||||||
|
bfloat16x8_t val[3];
|
||||||
|
} bfloat16x8x3_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x4x4_t {
|
||||||
|
bfloat16x4_t val[4];
|
||||||
|
} bfloat16x4x4_t;
|
||||||
|
|
||||||
|
typedef struct bfloat16x8x4_t {
|
||||||
|
bfloat16x8_t val[4];
|
||||||
|
} bfloat16x8x4_t;
|
||||||
|
|
||||||
|
#endif // __ARM_NEON_TYPES_H
|
31
third_party/aarch64/clang/armintr.h
vendored
Normal file
31
third_party/aarch64/clang/armintr.h
vendored
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Only include this if we're compiling for the windows platform. */
|
||||||
|
#ifndef _MSC_VER
|
||||||
|
#include_next <armintr.h>
|
||||||
|
#else
|
||||||
|
|
||||||
|
#ifndef __ARMINTR_H
|
||||||
|
#define __ARMINTR_H
|
||||||
|
|
||||||
|
typedef enum
|
||||||
|
{
|
||||||
|
_ARM_BARRIER_SY = 0xF,
|
||||||
|
_ARM_BARRIER_ST = 0xE,
|
||||||
|
_ARM_BARRIER_ISH = 0xB,
|
||||||
|
_ARM_BARRIER_ISHST = 0xA,
|
||||||
|
_ARM_BARRIER_NSH = 0x7,
|
||||||
|
_ARM_BARRIER_NSHST = 0x6,
|
||||||
|
_ARM_BARRIER_OSH = 0x3,
|
||||||
|
_ARM_BARRIER_OSHST = 0x2
|
||||||
|
} _ARMINTR_BARRIER_TYPE;
|
||||||
|
|
||||||
|
#endif /* __ARMINTR_H */
|
||||||
|
#endif /* _MSC_VER */
|
2
third_party/awk/run.c
vendored
2
third_party/awk/run.c
vendored
|
@ -495,7 +495,7 @@ makearraystring(Node *p, const char *func)
|
||||||
|
|
||||||
if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
|
if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
|
||||||
FATAL("%s: out of memory %s[%s...]",
|
FATAL("%s: out of memory %s[%s...]",
|
||||||
func, x->nval, buf);
|
func ? func : "NULL", x->nval, buf);
|
||||||
}
|
}
|
||||||
memcpy(buf + blen, s, slen);
|
memcpy(buf + blen, s, slen);
|
||||||
if (nsub) {
|
if (nsub) {
|
||||||
|
|
3
third_party/double-conversion/BUILD.mk
vendored
3
third_party/double-conversion/BUILD.mk
vendored
|
@ -34,7 +34,8 @@ THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS = \
|
||||||
LIBC_MEM \
|
LIBC_MEM \
|
||||||
LIBC_STR \
|
LIBC_STR \
|
||||||
LIBC_TINYMATH \
|
LIBC_TINYMATH \
|
||||||
THIRD_PARTY_LIBCXXABI
|
THIRD_PARTY_LIBCXXABI \
|
||||||
|
THIRD_PARTY_LIBUNWIND
|
||||||
|
|
||||||
THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
|
THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
|
||||||
$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
|
$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
|
||||||
|
|
2
third_party/intel/BUILD.mk
vendored
2
third_party/intel/BUILD.mk
vendored
|
@ -3,4 +3,4 @@
|
||||||
|
|
||||||
PKGS += THIRD_PARTY_INTEL
|
PKGS += THIRD_PARTY_INTEL
|
||||||
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
|
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
|
||||||
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)
|
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) $(wildcard third_party/intel/clang/*)
|
||||||
|
|
140
third_party/intel/clang/__wmmintrin_aes.h
vendored
Normal file
140
third_party/intel/clang/__wmmintrin_aes.h
vendored
Normal file
|
@ -0,0 +1,140 @@
|
||||||
|
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_H
|
||||||
|
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_AES_H
|
||||||
|
#define __WMMINTRIN_AES_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
|
||||||
|
|
||||||
|
/// Performs a single round of AES encryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesenc_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs the final round of AES encryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the encrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesenclast_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs a single round of AES decryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesdec_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Performs the final round of AES decryption using the Equivalent
|
||||||
|
/// Inverse Cipher, transforming the state value from the first source
|
||||||
|
/// operand using a 128-bit round key value contained in the second source
|
||||||
|
/// operand, and writes the result to the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the state value.
|
||||||
|
/// \param __R
|
||||||
|
/// A 128-bit integer vector containing the round key value.
|
||||||
|
/// \returns A 128-bit integer vector containing the decrypted value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Applies the AES InvMixColumns() transformation to an expanded key
|
||||||
|
/// contained in the source operand, and writes the result to the
|
||||||
|
/// destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __V
|
||||||
|
/// A 128-bit integer vector containing the expanded key.
|
||||||
|
/// \returns A 128-bit integer vector containing the transformed value.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_aesimc_si128(__m128i __V)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generates a round key for AES encryption, operating on 128-bit data
|
||||||
|
/// specified in the first source operand and using an 8-bit round constant
|
||||||
|
/// specified by the second source operand, and writes the result to the
|
||||||
|
/// destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param C
|
||||||
|
/// A 128-bit integer vector that is used to generate the AES encryption key.
|
||||||
|
/// \param R
|
||||||
|
/// An 8-bit round constant used to generate the AES encryption key.
|
||||||
|
/// \returns A 128-bit round key for AES encryption.
|
||||||
|
#define _mm_aeskeygenassist_si128(C, R) \
|
||||||
|
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __WMMINTRIN_AES_H */
|
48
third_party/intel/clang/__wmmintrin_pclmul.h
vendored
Normal file
48
third_party/intel/clang/__wmmintrin_pclmul.h
vendored
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_H
|
||||||
|
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __WMMINTRIN_PCLMUL_H
|
||||||
|
#define __WMMINTRIN_PCLMUL_H
|
||||||
|
|
||||||
|
/// Multiplies two 64-bit integer values, which are selected from source
|
||||||
|
/// operands using the immediate-value operand. The multiplication is a
|
||||||
|
/// carry-less multiplication, and the 128-bit integer product is stored in
|
||||||
|
/// the destination.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param X
|
||||||
|
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||||
|
/// \param Y
|
||||||
|
/// A 128-bit vector of [2 x i64] containing one of the source operands.
|
||||||
|
/// \param I
|
||||||
|
/// An immediate value specifying which 64-bit values to select from the
|
||||||
|
/// operands. Bit 0 is used to select a value from operand \a X, and bit
|
||||||
|
/// 4 is used to select a value from operand \a Y: \n
|
||||||
|
/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
|
||||||
|
/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
|
||||||
|
/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
|
||||||
|
/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
|
||||||
|
/// \returns The 128-bit integer vector containing the result of the carry-less
|
||||||
|
/// multiplication of the selected 64-bit values.
|
||||||
|
#define _mm_clmulepi64_si128(X, Y, I) \
|
||||||
|
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
|
||||||
|
(__v2di)(__m128i)(Y), (char)(I)))
|
||||||
|
|
||||||
|
#endif /* __WMMINTRIN_PCLMUL_H */
|
160
third_party/intel/clang/adcintrin.h
vendored
Normal file
160
third_party/intel/clang/adcintrin.h
vendored
Normal file
|
@ -0,0 +1,160 @@
|
||||||
|
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __ADCINTRIN_H
|
||||||
|
#define __ADCINTRIN_H
|
||||||
|
|
||||||
|
#if !defined(__i386__) && !defined(__x86_64__)
|
||||||
|
#error "This header is only meant to be used on x86 and x64 architecture"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
|
||||||
|
|
||||||
|
/* Use C++ inline semantics in C++, GNU inline for C mode. */
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
#define __INLINE __inline
|
||||||
|
#else
|
||||||
|
#define __INLINE static __inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
|
||||||
|
/// flag \a __cf, and subtracts the result from unsigned 32-bit integer
|
||||||
|
/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
|
||||||
|
/// and returns the 8-bit carry-out (carry or overflow flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x - (__y + temp))
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c SBB instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// The 32-bit unsigned minuend.
|
||||||
|
/// \param __y
|
||||||
|
/// The 32-bit unsigned subtrahend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the difference.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __x86_64__
|
||||||
|
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_addcarry_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
|
||||||
|
/// flag \a __cf, and subtracts the result from unsigned 64-bit integer
|
||||||
|
/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
|
||||||
|
/// and returns the 8-bit carry-out (carry or overflow flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x - (__y + temp))
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADC instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// The 64-bit unsigned minuend.
|
||||||
|
/// \param __y
|
||||||
|
/// The 64-bit unsigned subtrahend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the difference.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_subborrow_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __INLINE
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __ADCINTRIN_H */
|
102
third_party/intel/clang/adxintrin.h
vendored
Normal file
102
third_party/intel/clang/adxintrin.h
vendored
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __ADXINTRIN_H
|
||||||
|
#define __ADXINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("adx")))
|
||||||
|
|
||||||
|
/* Use C++ inline semantics in C++, GNU inline for C mode. */
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
#define __INLINE __inline
|
||||||
|
#else
|
||||||
|
#define __INLINE static __inline
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Intrinsics that are available only if __ADX__ is defined. */
|
||||||
|
|
||||||
|
/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store32(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADCX instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 32-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
|
||||||
|
unsigned int __x,
|
||||||
|
unsigned int __y,
|
||||||
|
unsigned int *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __x86_64__
|
||||||
|
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
|
||||||
|
/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
|
||||||
|
/// at \a __p, and returns the 8-bit carry-out (carry flag).
|
||||||
|
///
|
||||||
|
/// \code{.operation}
|
||||||
|
/// temp := (__cf == 0) ? 0 : 1
|
||||||
|
/// Store64(__p, __x + __y + temp)
|
||||||
|
/// result := CF
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the \c ADCX instruction.
|
||||||
|
///
|
||||||
|
/// \param __cf
|
||||||
|
/// The 8-bit unsigned carry flag; any non-zero value indicates carry.
|
||||||
|
/// \param __x
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __y
|
||||||
|
/// A 64-bit unsigned addend.
|
||||||
|
/// \param __p
|
||||||
|
/// Pointer to memory for storing the sum.
|
||||||
|
/// \returns The 8-bit unsigned carry-out value.
|
||||||
|
__INLINE unsigned char __DEFAULT_FN_ATTRS
|
||||||
|
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
|
||||||
|
unsigned long long __y, unsigned long long *__p) {
|
||||||
|
return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__cplusplus)
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#undef __INLINE
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __ADXINTRIN_H */
|
183
third_party/intel/clang/ammintrin.h
vendored
Normal file
183
third_party/intel/clang/ammintrin.h
vendored
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __AMMINTRIN_H
|
||||||
|
#define __AMMINTRIN_H
|
||||||
|
|
||||||
|
#if !defined(__i386__) && !defined(__x86_64__)
|
||||||
|
#error "This header is only meant to be used on x86 and x64 architecture"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "pmmintrin.h"
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
|
||||||
|
|
||||||
|
/// Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||||
|
/// integer vector operand at the index \a idx and of the length \a len.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param x
|
||||||
|
/// The value from which bits are extracted.
|
||||||
|
/// \param len
|
||||||
|
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||||
|
/// are zero, the length is interpreted as 64.
|
||||||
|
/// \param idx
|
||||||
|
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||||
|
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||||
|
/// the result is undefined. If the length and index are both zero, bits
|
||||||
|
/// [63:0] of parameter \a x are extracted. If the length is zero but the
|
||||||
|
/// index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
|
||||||
|
/// extracted from the source operand.
|
||||||
|
#define _mm_extracti_si64(x, len, idx) \
|
||||||
|
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
|
||||||
|
(char)(len), (char)(idx)))
|
||||||
|
|
||||||
|
/// Extracts the specified bits from the lower 64 bits of the 128-bit
|
||||||
|
/// integer vector operand at the index and of the length specified by
|
||||||
|
/// \a __y.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __x
|
||||||
|
/// The value from which bits are extracted.
|
||||||
|
/// \param __y
|
||||||
|
/// Specifies the index of the least significant bit at [13:8] and the
|
||||||
|
/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
|
||||||
|
/// length is interpreted as 64. If the sum of the index and length is
|
||||||
|
/// greater than 64, the result is undefined. If the length and index are
|
||||||
|
/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
|
||||||
|
/// is zero but the index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
|
||||||
|
/// from the source operand.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_extract_si64(__m128i __x, __m128i __y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inserts bits of a specified length from the source integer vector
|
||||||
|
/// \a y into the lower 64 bits of the destination integer vector \a x at
|
||||||
|
/// the index \a idx and of the length \a len.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \code
|
||||||
|
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
|
||||||
|
/// const int idx);
|
||||||
|
/// \endcode
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param x
|
||||||
|
/// The destination operand where bits will be inserted. The inserted bits
|
||||||
|
/// are defined by the length \a len and by the index \a idx specifying the
|
||||||
|
/// least significant bit.
|
||||||
|
/// \param y
|
||||||
|
/// The source operand containing the bits to be extracted. The extracted
|
||||||
|
/// bits are the least significant bits of operand \a y of length \a len.
|
||||||
|
/// \param len
|
||||||
|
/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
|
||||||
|
/// are zero, the length is interpreted as 64.
|
||||||
|
/// \param idx
|
||||||
|
/// Bits [5:0] specify the index of the least significant bit; the other
|
||||||
|
/// bits are ignored. If the sum of the index and length is greater than 64,
|
||||||
|
/// the result is undefined. If the length and index are both zero, bits
|
||||||
|
/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
|
||||||
|
/// is zero but the index is non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||||
|
/// destination operand \a x with the specified bitfields replaced by the
|
||||||
|
/// lower bits of source operand \a y. The upper 64 bits of the return value
|
||||||
|
/// are undefined.
|
||||||
|
#define _mm_inserti_si64(x, y, len, idx) \
|
||||||
|
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
|
||||||
|
(__v2di)(__m128i)(y), \
|
||||||
|
(char)(len), (char)(idx)))
|
||||||
|
|
||||||
|
/// Inserts bits of a specified length from the source integer vector
|
||||||
|
/// \a __y into the lower 64 bits of the destination integer vector \a __x
|
||||||
|
/// at the index and of the length specified by \a __y.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __x
|
||||||
|
/// The destination operand where bits will be inserted. The inserted bits
|
||||||
|
/// are defined by the length and by the index of the least significant bit
|
||||||
|
/// specified by operand \a __y.
|
||||||
|
/// \param __y
|
||||||
|
/// The source operand containing the bits to be extracted. The extracted
|
||||||
|
/// bits are the least significant bits of operand \a __y with length
|
||||||
|
/// specified by bits [69:64]. These are inserted into the destination at the
|
||||||
|
/// index specified by bits [77:72]; all other bits are ignored. If bits
|
||||||
|
/// [69:64] are zero, the length is interpreted as 64. If the sum of the
|
||||||
|
/// index and length is greater than 64, the result is undefined. If the
|
||||||
|
/// length and index are both zero, bits [63:0] of parameter \a __y are
|
||||||
|
/// inserted into parameter \a __x. If the length is zero but the index is
|
||||||
|
/// non-zero, the result is undefined.
|
||||||
|
/// \returns A 128-bit integer vector containing the original lower 64-bits of
|
||||||
|
/// destination operand \a __x with the specified bitfields replaced by the
|
||||||
|
/// lower bits of source operand \a __y. The upper 64 bits of the return
|
||||||
|
/// value are undefined.
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
|
_mm_insert_si64(__m128i __x, __m128i __y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores a 64-bit double-precision value in a 64-bit memory location.
|
||||||
|
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
|
||||||
|
/// used again soon).
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __p
|
||||||
|
/// The 64-bit memory location used to store the register value.
|
||||||
|
/// \param __a
|
||||||
|
/// The 64-bit double-precision floating-point register value to be stored.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm_stream_sd(void *__p, __m128d __a)
|
||||||
|
{
|
||||||
|
__builtin_ia32_movntsd((double *)__p, (__v2df)__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores a 32-bit single-precision floating-point value in a 32-bit
|
||||||
|
/// memory location. To minimize caching, the data is flagged as
|
||||||
|
/// non-temporal (unlikely to be used again soon).
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __p
|
||||||
|
/// The 32-bit memory location used to store the register value.
|
||||||
|
/// \param __a
|
||||||
|
/// The 32-bit single-precision floating-point register value to be stored.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm_stream_ss(void *__p, __m128 __a)
|
||||||
|
{
|
||||||
|
__builtin_ia32_movntss((float *)__p, (__v4sf)__a);
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif /* __AMMINTRIN_H */
|
169
third_party/intel/clang/amxcomplexintrin.h
vendored
Normal file
169
third_party/intel/clang/amxcomplexintrin.h
vendored
Normal file
|
@ -0,0 +1,169 @@
|
||||||
|
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif // __IMMINTRIN_H
|
||||||
|
|
||||||
|
#ifndef __AMX_COMPLEXINTRIN_H
|
||||||
|
#define __AMX_COMPLEXINTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS_COMPLEX \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The imaginary part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the real part
/// of the \a a element is multiplied with the imaginary part of the
/// corresponding \a b elements. The two accumulated results are added, and
/// then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_cmmimfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmimfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles \a a and \a b is interpreted as a complex number
/// with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
/// of (row of \a a, column of \a b), it performs a set of multiplication
/// and accumulations on all corresponding complex numbers (one from \a a
/// and one from \a b). The real part of the \a a element is multiplied
/// with the real part of the corresponding \a b element, and the negated
/// imaginary part of the \a a element is multiplied with the imaginary
/// part of the corresponding \a b elements. The two accumulated results
/// are added, and then accumulated into the corresponding row and column
/// of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_cmmrlfp16ps(dst, a, b)                                           \
  __builtin_ia32_tcmmrlfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
|
||||||
|
/// accumulate the results into a packed single precision tile. Each dword
|
||||||
|
/// element in input tiles src0 and src1 is interpreted as a complex number with
|
||||||
|
/// FP16 real part and FP16 imaginary part.
|
||||||
|
/// This function calculates the imaginary part of the result.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
|
||||||
|
dst->tile, src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Perform matrix multiplication of two tiles containing complex elements and
|
||||||
|
/// accumulate the results into a packed single precision tile. Each dword
|
||||||
|
/// element in input tiles src0 and src1 is interpreted as a complex number with
|
||||||
|
/// FP16 real part and FP16 imaginary part.
|
||||||
|
/// This function calculates the real part of the result.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_COMPLEX
|
||||||
|
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
|
||||||
|
dst->tile, src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // __x86_64__
|
||||||
|
#endif // __AMX_COMPLEXINTRIN_H
|
58
third_party/intel/clang/amxfp16intrin.h
vendored
Normal file
58
third_party/intel/clang/amxfp16intrin.h
vendored
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
|
||||||
|
#endif /* __IMMINTRIN_H */
|
||||||
|
|
||||||
|
#ifndef __AMX_FP16INTRIN_H
|
||||||
|
#define __AMX_FP16INTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
/// and \a b, accumulating the intermediate single-precision (32-bit)
/// floating-point elements with elements in \a dst, and store the 32-bit
/// result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
/* Arguments are parenthesized in the expansion, matching the _tile_dp*
   macros in amxintrin.h. */
#define _tile_dpfp16ps(dst, a, b)                                              \
  __builtin_ia32_tdpfp16ps((dst), (a), (b))
|
||||||
|
|
||||||
|
#endif /* __x86_64__ */
|
||||||
|
#endif /* __AMX_FP16INTRIN_H */
|
524
third_party/intel/clang/amxintrin.h
vendored
Normal file
524
third_party/intel/clang/amxintrin.h
vendored
Normal file
|
@ -0,0 +1,524 @@
|
||||||
|
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===------------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif /* __IMMINTRIN_H */
|
||||||
|
|
||||||
|
#ifndef __AMXINTRIN_H
|
||||||
|
#define __AMXINTRIN_H
|
||||||
|
#ifdef __x86_64__
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS_TILE \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_INT8 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_BF16 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
|
||||||
|
#define __DEFAULT_FN_ATTRS_FP16 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
|
||||||
|
|
||||||
|
/// Load tile configuration from a 64-byte memory location specified by
|
||||||
|
/// "mem_addr". The tile configuration includes the tile type palette, the
|
||||||
|
/// number of bytes per row, and the number of rows. If the specified
|
||||||
|
/// palette_id is zero, that signifies the init state for both the tile
|
||||||
|
/// config and the tile data, and the tiles are zeroed. Any invalid
|
||||||
|
/// configurations will result in #GP fault.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __config
|
||||||
|
/// A pointer to 512-bits configuration
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||||
|
_tile_loadconfig(const void *__config) {
|
||||||
|
__builtin_ia32_tile_loadconfig(__config);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stores the current tile configuration to a 64-byte memory location
|
||||||
|
/// specified by "mem_addr". The tile configuration includes the tile type
|
||||||
|
/// palette, the number of bytes per row, and the number of rows. If tiles
|
||||||
|
/// are not configured, all zeroes will be stored to memory.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param __config
|
||||||
|
/// A pointer to 512-bits configuration
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE
|
||||||
|
_tile_storeconfig(void *__config) {
|
||||||
|
__builtin_ia32_tile_storeconfig(__config);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Release the tile configuration to return to the init state, which
|
||||||
|
/// releases all storage it currently holds.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
|
||||||
|
__builtin_ia32_tilerelease();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Store the tile specified by "dst" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    The tile whose rows are written to memory (despite the name, it is the
///    source of the store). Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride)                                        \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
|
||||||
|
|
||||||
|
/// Zeroes every element of the tile named by \a tile.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zero. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent signed 8-bit
/// integers from src0 is multiplied with the corresponding signed 8-bit
/// integers from src1, the 4 intermediate 32-bit products are summed, and
/// the sum is added to the matching 32-bit element of "dst", which receives
/// the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent signed 8-bit
/// integers from src0 is multiplied with the corresponding unsigned 8-bit
/// integers from src1, the 4 intermediate 32-bit products are summed, and
/// the sum is added to the matching 32-bit element of "dst", which receives
/// the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent unsigned
/// 8-bit integers from src0 is multiplied with the corresponding signed
/// 8-bit integers from src1, the 4 intermediate 32-bit products are summed,
/// and the sum is added to the matching 32-bit element of "dst", which
/// receives the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// Byte dot-product with accumulation: each group of 4 adjacent unsigned
/// 8-bit integers from src0 is multiplied with the corresponding unsigned
/// 8-bit integers from src1, the 4 intermediate 32-bit products are summed,
/// and the sum is added to the matching 32-bit element of "dst", which
/// receives the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// BF16 dot-product with accumulation: pairs of BF16 (16-bit) floating-point
/// values from src0 and src1 are multiplied, the intermediate
/// single-precision (32-bit) products are summed with the matching elements
/// of "dst", and the 32-bit results are written back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
|
||||||
|
|
||||||
|
/// AMX tile registers are configurable; the largest is 16 rows x 64 bytes =
/// 1024 bytes. LLVM IR has no 2D type, so a flat vector of the maximum tile
/// size stands in for a 2D tile.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
return __builtin_ia32_tileloadd64_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
return __builtin_ia32_tileloaddt164_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS_INT8
|
||||||
|
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
|
||||||
|
__SIZE_TYPE__ stride, _tile1024i tile) {
|
||||||
|
return __builtin_ia32_tilestored64_internal(m, n, base,
|
||||||
|
(__SIZE_TYPE__)(stride), tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
|
||||||
|
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
|
||||||
|
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
|
||||||
|
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
|
||||||
|
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
|
||||||
|
return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This struct pack the shape and tile data together for user. We suggest
|
||||||
|
/// initializing the struct as early as possible, because compiler depends
|
||||||
|
/// on the shape information to do configure. The constant value is preferred
|
||||||
|
/// for optimization by compiler.
|
||||||
|
typedef struct __tile1024i_str {
|
||||||
|
const unsigned short row;
|
||||||
|
const unsigned short col;
|
||||||
|
_tile1024i tile;
|
||||||
|
} __tile1024i;
|
||||||
|
|
||||||
|
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||||
|
/// destination tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// A destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be loaded in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load tile rows from memory specifieid by "base" address and "stride" into
|
||||||
|
/// destination tile "dst". This intrinsic provides a hint to the implementation
|
||||||
|
/// that the data will likely not be reused in the near future and the data
|
||||||
|
/// caching can be optimized accordingly.
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// A destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be loaded in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
|
||||||
|
__SIZE_TYPE__ stride) {
|
||||||
|
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||||
|
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||||
|
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||||
|
/// and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||||
|
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||||
|
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||||
|
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||||
|
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||||
|
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||||
|
/// and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||||
|
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||||
|
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||||
|
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||||
|
/// "dst", and store the 32-bit result back to tile "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_INT8
|
||||||
|
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Store the tile specified by "src" to memory specifieid by "base" address and
|
||||||
|
/// "stride".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param base
|
||||||
|
/// A pointer to base address.
|
||||||
|
/// \param stride
|
||||||
|
/// The stride between the rows' data to be stored in memory.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
|
||||||
|
__tile1024i src) {
|
||||||
|
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zero the tile specified by "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile to be zero. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_TILE
|
||||||
|
static __inline__ void __tile_zero(__tile1024i *dst) {
|
||||||
|
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||||
|
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||||
|
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||||
|
/// "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_BF16
|
||||||
|
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
|
||||||
|
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||||
|
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||||
|
/// "dst".
|
||||||
|
///
|
||||||
|
/// \headerfile <immintrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
|
||||||
|
///
|
||||||
|
/// \param dst
|
||||||
|
/// The destination tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src0
|
||||||
|
/// The 1st source tile. Max size is 1024 Bytes.
|
||||||
|
/// \param src1
|
||||||
|
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||||
|
__DEFAULT_FN_ATTRS_FP16
|
||||||
|
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
|
||||||
|
__tile1024i src1) {
|
||||||
|
dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
|
||||||
|
src0.tile, src1.tile);
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS_TILE
|
||||||
|
#undef __DEFAULT_FN_ATTRS_INT8
|
||||||
|
#undef __DEFAULT_FN_ATTRS_BF16
|
||||||
|
#undef __DEFAULT_FN_ATTRS_FP16
|
||||||
|
|
||||||
|
#endif /* __x86_64__ */
|
||||||
|
#endif /* __AMXINTRIN_H */
|
5284
third_party/intel/clang/avx2intrin.h
vendored
Normal file
5284
third_party/intel/clang/avx2intrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
283
third_party/intel/clang/avx512bf16intrin.h
vendored
Normal file
283
third_party/intel/clang/avx512bf16intrin.h
vendored
Normal file
|
@ -0,0 +1,283 @@
|
||||||
|
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
|
||||||
|
#ifndef __AVX512BF16INTRIN_H
|
||||||
|
#define __AVX512BF16INTRIN_H
|
||||||
|
|
||||||
|
typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
|
||||||
|
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
|
||||||
|
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS512 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
|
||||||
|
__min_vector_width__(512)))
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512bf16,no-evex512")))
|
||||||
|
|
||||||
|
/// Convert One BF16 Data to One Single Float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic does not correspond to a specific instruction.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A bfloat data.
|
||||||
|
/// \returns A float data whose sign field and exponent field keep unchanged,
|
||||||
|
/// and fraction field is extended to 23 bits.
|
||||||
|
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
|
||||||
|
return __builtin_ia32_cvtsbf162ss_32(__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
|
||||||
|
(__v16sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 32-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
|
||||||
|
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v32bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 32-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
|
||||||
|
/// conversion of __B, and higher 256 bits come from conversion of __A.
|
||||||
|
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
|
||||||
|
return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
|
||||||
|
(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v32bf)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_cvtneps_pbh(__m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)_mm256_undefined_si256(),
|
||||||
|
(__mmask16)-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)__W,
|
||||||
|
(__mmask16)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
|
||||||
|
(__v16bf)_mm256_setzero_si256(),
|
||||||
|
(__mmask16)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
|
||||||
|
(__v32bf) __A,
|
||||||
|
(__v32bf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
|
||||||
|
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v16sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 512-bit vector of [32 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 512-bit vector of [16 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
|
||||||
|
return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
|
||||||
|
(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v16sf)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
|
||||||
|
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
|
||||||
|
(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 512-bit vector of [16 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask.
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
|
||||||
|
static __inline__ __m512 __DEFAULT_FN_ATTRS512
|
||||||
|
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
|
||||||
|
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
|
||||||
|
(__m512i)__S, (__mmask16)__U,
|
||||||
|
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
#undef __DEFAULT_FN_ATTRS512
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
86
third_party/intel/clang/avx512bitalgintrin.h
vendored
Normal file
86
third_party/intel/clang/avx512bitalgintrin.h
vendored
Normal file
|
@ -0,0 +1,86 @@
|
||||||
|
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512BITALGINTRIN_H
|
||||||
|
#define __AVX512BITALGINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512bitalg,evex512"), \
|
||||||
|
__min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_popcnt_epi16(__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
|
||||||
|
(__v32hi) _mm512_popcnt_epi16(__B),
|
||||||
|
(__v32hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_popcnt_epi8(__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
|
||||||
|
(__v64qi) _mm512_popcnt_epi8(__B),
|
||||||
|
(__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
|
||||||
|
(__v64qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
2014
third_party/intel/clang/avx512bwintrin.h
vendored
Normal file
2014
third_party/intel/clang/avx512bwintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
125
third_party/intel/clang/avx512cdintrin.h
vendored
Normal file
125
third_party/intel/clang/avx512cdintrin.h
vendored
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512CDINTRIN_H
|
||||||
|
#define __AVX512CDINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512cd,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_conflict_epi64 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_conflict_epi64(__A),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_conflict_epi64(__A),
|
||||||
|
(__v8di)_mm512_setzero_si512 ());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_conflict_epi32 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_conflict_epi32(__A),
|
||||||
|
(__v16si)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_conflict_epi32(__A),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_lzcnt_epi32 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_lzcnt_epi32(__A),
|
||||||
|
(__v16si)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
|
||||||
|
(__v16si)_mm512_lzcnt_epi32(__A),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_lzcnt_epi64 (__m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_lzcnt_epi64(__A),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
|
||||||
|
(__v8di)_mm512_lzcnt_epi64(__A),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_broadcastmb_epi64 (__mmask8 __A)
|
||||||
|
{
|
||||||
|
return (__m512i) _mm512_set1_epi64((long long) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_broadcastmw_epi32 (__mmask16 __A)
|
||||||
|
{
|
||||||
|
return (__m512i) _mm512_set1_epi32((int) __A);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
1379
third_party/intel/clang/avx512dqintrin.h
vendored
Normal file
1379
third_party/intel/clang/avx512dqintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
271
third_party/intel/clang/avx512erintrin.h
vendored
Normal file
271
third_party/intel/clang/avx512erintrin.h
vendored
Normal file
|
@ -0,0 +1,271 @@
|
||||||
|
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512ERINTRIN_H
|
||||||
|
#define __AVX512ERINTRIN_H
|
||||||
|
|
||||||
|
/* exp2a23 */
|
||||||
|
#define _mm512_exp2a23_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_pd(A) \
|
||||||
|
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_pd(S, M, A) \
|
||||||
|
_mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_pd(M, A) \
|
||||||
|
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_exp2a23_ps(A) \
|
||||||
|
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_exp2a23_ps(S, M, A) \
|
||||||
|
_mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_exp2a23_ps(M, A) \
|
||||||
|
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
/* rsqrt28 */
|
||||||
|
#define _mm512_rsqrt28_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_pd(A) \
|
||||||
|
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_pd(S, M, A) \
|
||||||
|
_mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_pd(M, A) \
|
||||||
|
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rsqrt28_ps(A) \
|
||||||
|
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rsqrt28_ps(S, M, A) \
|
||||||
|
_mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rsqrt28_ps(M, A) \
|
||||||
|
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_round_ss(A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)(__m128)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_ss(A, B) \
|
||||||
|
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_ss(S, M, A, B) \
|
||||||
|
_mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_ss(M, A, B) \
|
||||||
|
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_round_sd(A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)(__m128d)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rsqrt28_sd(A, B) \
|
||||||
|
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rsqrt28_sd(S, M, A, B) \
|
||||||
|
_mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rsqrt28_sd(M, A, B) \
|
||||||
|
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
/* rcp28 */
|
||||||
|
#define _mm512_rcp28_round_pd(A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)(__m512d)(S), (__mmask8)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
|
||||||
|
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
|
||||||
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rcp28_pd(A) \
|
||||||
|
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_pd(S, M, A) \
|
||||||
|
_mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_pd(M, A) \
|
||||||
|
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_rcp28_round_ps(A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)(__m512)(S), (__mmask16)(M), \
|
||||||
|
(int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
|
||||||
|
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
|
||||||
|
(__v16sf)_mm512_setzero_ps(), \
|
||||||
|
(__mmask16)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm512_rcp28_ps(A) \
|
||||||
|
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_mask_rcp28_ps(S, M, A) \
|
||||||
|
_mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm512_maskz_rcp28_ps(M, A) \
|
||||||
|
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rcp28_round_ss(A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)(__m128)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
|
||||||
|
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
|
||||||
|
(__v4sf)(__m128)(B), \
|
||||||
|
(__v4sf)_mm_setzero_ps(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rcp28_ss(A, B) \
|
||||||
|
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_ss(S, M, A, B) \
|
||||||
|
_mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_ss(M, A, B) \
|
||||||
|
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_rcp28_round_sd(A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)(__m128d)(S), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
|
||||||
|
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
|
||||||
|
(__v2df)(__m128d)(B), \
|
||||||
|
(__v2df)_mm_setzero_pd(), \
|
||||||
|
(__mmask8)(M), (int)(R)))
|
||||||
|
|
||||||
|
#define _mm_rcp28_sd(A, B) \
|
||||||
|
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_mask_rcp28_sd(S, M, A, B) \
|
||||||
|
_mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#define _mm_maskz_rcp28_sd(M, A, B) \
|
||||||
|
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
|
||||||
|
|
||||||
|
#endif /* __AVX512ERINTRIN_H */
|
9779
third_party/intel/clang/avx512fintrin.h
vendored
Normal file
9779
third_party/intel/clang/avx512fintrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
3352
third_party/intel/clang/avx512fp16intrin.h
vendored
Normal file
3352
third_party/intel/clang/avx512fp16intrin.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
70
third_party/intel/clang/avx512ifmaintrin.h
vendored
Normal file
70
third_party/intel/clang/avx512ifmaintrin.h
vendored
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __IFMAINTRIN_H
|
||||||
|
#define __IFMAINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
|
||||||
|
(__v8di) __Z);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
|
||||||
|
(__v8di) __Z);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v8di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__M,
|
||||||
|
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
111
third_party/intel/clang/avx512ifmavlintrin.h
vendored
Normal file
111
third_party/intel/clang/avx512ifmavlintrin.h
vendored
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __IFMAVLINTRIN_H
|
||||||
|
#define __IFMAVLINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512ifma,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
#define _mm_madd52hi_epu64(X, Y, Z) \
|
||||||
|
((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
|
||||||
|
(__v2di)(Z)))
|
||||||
|
|
||||||
|
#define _mm256_madd52hi_epu64(X, Y, Z) \
|
||||||
|
((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
|
||||||
|
(__v4di)(Z)))
|
||||||
|
|
||||||
|
#define _mm_madd52lo_epu64(X, Y, Z) \
|
||||||
|
((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
|
||||||
|
(__v2di)(Z)))
|
||||||
|
|
||||||
|
#define _mm256_madd52lo_epu64(X, Y, Z) \
|
||||||
|
((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
|
||||||
|
(__v4di)(Z)))
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v2di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v2di)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
|
||||||
|
(__v4di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
|
||||||
|
(__v4di)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v2di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectq_128(__M,
|
||||||
|
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v2di)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
|
||||||
|
(__v4di)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectq_256(__M,
|
||||||
|
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
|
||||||
|
(__v4di)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
92
third_party/intel/clang/avx512pfintrin.h
vendored
Normal file
92
third_party/intel/clang/avx512pfintrin.h
vendored
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512PFINTRIN_H
|
||||||
|
#define __AVX512PFINTRIN_H
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
|
||||||
|
(__v16si)(__m512i)(index), (void const *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfdps((__mmask16) -1, \
|
||||||
|
(__v16si)(__m512i)(index), (void const *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
|
||||||
|
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
|
||||||
|
(void const *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
|
||||||
|
(__v16si)(__m512i)(index), (void *)(addr), \
|
||||||
|
(int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), \
|
||||||
|
(int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
|
||||||
|
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
|
||||||
|
(void *)(addr), (int)(scale), (int)(hint))
|
||||||
|
|
||||||
|
#endif
|
357
third_party/intel/clang/avx512vbmi2intrin.h
vendored
Normal file
357
third_party/intel/clang/avx512vbmi2intrin.h
vendored
Normal file
|
@ -0,0 +1,357 @@
|
||||||
|
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512VBMI2INTRIN_H
|
||||||
|
#define __AVX512VBMI2INTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
__builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
__builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||||
|
(__v32hi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
|
||||||
|
(__v32hi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||||
|
(__v64qi) __S,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
|
||||||
|
(__v64qi) _mm512_setzero_si512(),
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi64(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
|
||||||
|
(__v8di)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi32(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
|
||||||
|
(__v16si)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shldi_epi16(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
|
||||||
|
(__v32hi)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi64(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
|
||||||
|
(__v8di)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
|
||||||
|
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
|
||||||
|
(__v8di)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi32(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
|
||||||
|
(__v16si)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
|
||||||
|
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
|
||||||
|
(__v16si)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
#define _mm512_shrdi_epi16(A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
|
||||||
|
(__v32hi)(__m512i)(B), (int)(I)))
|
||||||
|
|
||||||
|
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)(__m512i)(S)))
|
||||||
|
|
||||||
|
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
|
||||||
|
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
|
||||||
|
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
|
||||||
|
(__v32hi)_mm512_setzero_si512()))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
|
||||||
|
(__v8di)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shldv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
|
||||||
|
(__v16si)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shldv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
|
||||||
|
(__v32hi)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shldv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
|
||||||
|
(__v8di)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectq_512(__U,
|
||||||
|
(__v8di)_mm512_shrdv_epi64(__A, __B, __C),
|
||||||
|
(__v8di)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
|
||||||
|
(__v16si)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i) __builtin_ia32_selectd_512(__U,
|
||||||
|
(__v16si)_mm512_shrdv_epi32(__A, __B, __C),
|
||||||
|
(__v16si)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
|
||||||
|
(__v32hi)__C);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectw_512(__U,
|
||||||
|
(__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
|
||||||
|
(__v32hi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
106
third_party/intel/clang/avx512vbmiintrin.h
vendored
Normal file
106
third_party/intel/clang/avx512vbmiintrin.h
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __VBMIINTRIN_H
|
||||||
|
#define __VBMIINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,evex512"), __min_vector_width__(512)))
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
|
||||||
|
(__v64qi) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512(__U,
|
||||||
|
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
|
||||||
|
__m512i __B)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
|
||||||
|
(__v64qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
|
||||||
|
__m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v64qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m512i __DEFAULT_FN_ATTRS
|
||||||
|
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
|
||||||
|
{
|
||||||
|
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
|
||||||
|
(__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v64qi)_mm512_setzero_si512());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS
|
||||||
|
|
||||||
|
#endif
|
193
third_party/intel/clang/avx512vbmivlintrin.h
vendored
Normal file
193
third_party/intel/clang/avx512vbmivlintrin.h
vendored
Normal file
|
@ -0,0 +1,193 @@
|
||||||
|
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __VBMIVLINTRIN_H
|
||||||
|
#define __VBMIVLINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vbmi,avx512vl,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
|
||||||
|
(__v16qi)__I,
|
||||||
|
(__v16qi)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128(__U,
|
||||||
|
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
|
||||||
|
(__v32qi)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)__I);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256(__U,
|
||||||
|
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_permutexvar_epi8(__A, __B),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
|
||||||
|
__m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_permutexvar_epi8(__A, __B),
|
||||||
|
(__v16qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
|
||||||
|
__m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
|
||||||
|
(__v32qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
|
||||||
|
__m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v16qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
|
||||||
|
{
|
||||||
|
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
|
||||||
|
(__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v16qi)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
|
||||||
|
__m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v32qi)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
|
||||||
|
{
|
||||||
|
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
|
||||||
|
(__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
|
||||||
|
(__v32qi)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
517
third_party/intel/clang/avx512vlbf16intrin.h
vendored
Normal file
517
third_party/intel/clang/avx512vlbf16intrin.h
vendored
Normal file
|
@ -0,0 +1,517 @@
|
||||||
|
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __SSE2__
|
||||||
|
|
||||||
|
#ifndef __AVX512VLBF16INTRIN_H
|
||||||
|
#define __AVX512VLBF16INTRIN_H
|
||||||
|
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bf16,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bf16,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
|
||||||
|
(__v4sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
|
||||||
|
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v8bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __B, and higher 64 bits come from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
|
||||||
|
return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
|
||||||
|
(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v8bf)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
|
||||||
|
(__v8sf) __B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element from __W.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
|
||||||
|
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v16bf)__W);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Two Packed Single Data to One Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A or __B. A 0 means element is zero.
|
||||||
|
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
|
||||||
|
/// conversion of __B, and higher 128 bits come from conversion of __A.
|
||||||
|
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
|
||||||
|
return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
|
||||||
|
(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
|
||||||
|
(__v16bf)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
#define _mm_cvtneps_pbh(A) \
|
||||||
|
((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
|
||||||
|
(__v8bf)__W,
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
|
||||||
|
/// conversion of __A, and higher 64 bits are 0.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
|
||||||
|
(__v8bf)_mm_setzero_si128(),
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
#define _mm256_cvtneps_pbh(A) \
|
||||||
|
((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __W
|
||||||
|
/// A 256-bit vector of [8 x bfloat].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element from __W.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
|
||||||
|
(__v8bf)__W,
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed Single Data to Packed BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means conversion of __A. A 0 means element is zero.
|
||||||
|
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
|
||||||
|
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
|
||||||
|
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
|
||||||
|
(__v8bf)_mm_setzero_si128(),
|
||||||
|
(__mmask8)__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
|
||||||
|
(__v8bf)__A,
|
||||||
|
(__v8bf)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
|
||||||
|
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v4sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 128-bit vector of [4 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 128-bit vector of [4 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
|
||||||
|
return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
|
||||||
|
(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v4sf)_mm_setzero_si128());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
|
||||||
|
(__v16bf)__A,
|
||||||
|
(__v16bf)__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 16-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
|
||||||
|
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v8sf)__D);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __B
|
||||||
|
/// A 256-bit vector of [16 x bfloat].
|
||||||
|
/// \param __D
|
||||||
|
/// A 256-bit vector of [8 x float].
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask value specifying what is chosen for each element.
|
||||||
|
/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
|
||||||
|
/// \returns A 256-bit vector of [8 x float] comes from Dot Product of
|
||||||
|
/// __A, __B and __D
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
|
||||||
|
return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
|
||||||
|
(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
|
||||||
|
(__v8sf)_mm256_setzero_si256());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert One Single float Data to One BF16 Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A float data.
|
||||||
|
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
|
||||||
|
/// and fraction field is truncated to 7 bits.
|
||||||
|
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
|
||||||
|
__v4sf __V = {__A, 0, 0, 0};
|
||||||
|
__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
|
||||||
|
(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
|
||||||
|
return (__bf16)__R[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
|
||||||
|
return _mm_castsi128_ps(
|
||||||
|
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
|
||||||
|
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
|
||||||
|
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
|
||||||
|
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [4 x bfloat].
|
||||||
|
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
|
||||||
|
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
|
||||||
|
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
|
||||||
|
16));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert Packed BF16 Data to Packed float Data using merging mask.
|
||||||
|
///
|
||||||
|
/// \headerfile <x86intrin.h>
|
||||||
|
///
|
||||||
|
/// \param __S
|
||||||
|
/// A 256-bit vector of [8 x float]. Elements are copied from __S when
|
||||||
|
/// the corresponding mask bit is not set.
|
||||||
|
/// \param __U
|
||||||
|
/// A 8-bit mask. Elements are zeroed out when the corresponding mask
|
||||||
|
/// bit is not set.
|
||||||
|
/// \param __A
|
||||||
|
/// A 128-bit vector of [8 x bfloat].
|
||||||
|
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
|
||||||
|
static __inline__ __m256 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
|
||||||
|
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
|
||||||
|
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
|
||||||
|
16));
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
151
third_party/intel/clang/avx512vlbitalgintrin.h
vendored
Normal file
151
third_party/intel/clang/avx512vlbitalgintrin.h
vendored
Normal file
|
@ -0,0 +1,151 @@
|
||||||
|
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __IMMINTRIN_H
|
||||||
|
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef __AVX512VLBITALGINTRIN_H
|
||||||
|
#define __AVX512VLBITALGINTRIN_H
|
||||||
|
|
||||||
|
/* Define the default attributes for the functions in this file. */
|
||||||
|
#define __DEFAULT_FN_ATTRS128 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bitalg,no-evex512"), \
|
||||||
|
__min_vector_width__(128)))
|
||||||
|
#define __DEFAULT_FN_ATTRS256 \
|
||||||
|
__attribute__((__always_inline__, __nodebug__, \
|
||||||
|
__target__("avx512vl,avx512bitalg,no-evex512"), \
|
||||||
|
__min_vector_width__(256)))
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_popcnt_epi16(__m256i __A)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
|
||||||
|
(__v16hi) _mm256_popcnt_epi16(__B),
|
||||||
|
(__v16hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_popcnt_epi16(__m128i __A)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
|
||||||
|
(__v8hi) _mm_popcnt_epi16(__B),
|
||||||
|
(__v8hi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_popcnt_epi8(__m256i __A)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
|
||||||
|
(__v32qi) _mm256_popcnt_epi8(__B),
|
||||||
|
(__v32qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_popcnt_epi8(__m128i __A)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
|
||||||
|
(__v16qi) _mm_popcnt_epi8(__B),
|
||||||
|
(__v16qi) __A);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
|
||||||
|
__U,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
|
||||||
|
(__v32qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
|
||||||
|
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
|
||||||
|
{
|
||||||
|
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
|
||||||
|
(__v16qi) __B,
|
||||||
|
__U);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
|
||||||
|
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
|
||||||
|
{
|
||||||
|
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
|
||||||
|
__A,
|
||||||
|
__B);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
|
|
||||||
|
#endif
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue