mirror of https://github.com/jart/cosmopolitan.git (synced 2025-10-27 19:34:33 +00:00)
Release Cosmopolitan v3.8.0
This change switches C++ exception handling from SJLJ (setjmp/longjmp) to standard DWARF unwinding. It's needed because Clang for aarch64 doesn't support SJLJ. It turns out that libunwind has a bare-metal configuration that made this easy to do. This change also gets the new experimental cosmocc -mclang flag working well enough that it can now build all of llamafile, with 3x faster build latency and no runtime performance tradeoff. The int_fast16_t and int_fast32_t types are now always defined as 32-bit, in the interest of more ABI consistency between cosmocc -mgcc and -mclang mode.
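As a minimal illustration of that last point (an editorial sketch, not part of the commit), both compiler modes now agree on these widths:

#include <stdint.h>
/* Under cosmocc -mgcc and cosmocc -mclang alike, the "fast" types are
   32-bit, so structs that use them get one layout across both modes. */
_Static_assert(sizeof(int_fast16_t) == 4, "int_fast16_t is 32-bit");
_Static_assert(sizeof(int_fast32_t) == 4, "int_fast32_t is 32-bit");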
This commit is contained in:
parent 5b9862907c
commit c9152b6f14
188 changed files with 199063 additions and 636 deletions
third_party/aarch64/BUILD.mk (vendored): 2 changed lines

@@ -3,4 +3,4 @@
 PKGS += THIRD_PARTY_AARCH64
 THIRD_PARTY_AARCH64_HDRS = $(filter %.h,$(THIRD_PARTY_AARCH64_FILES))
-THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*)
+THIRD_PARTY_AARCH64_FILES := $(wildcard third_party/aarch64/*) $(wildcard third_party/aarch64/clang/*)
third_party/aarch64/clang/arm64intr.h (vendored, new file): 35 lines

@@ -0,0 +1,35 @@
/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <arm64intr.h>
#else

#ifndef __ARM64INTR_H
#define __ARM64INTR_H

typedef enum
{
  _ARM64_BARRIER_SY    = 0xF,
  _ARM64_BARRIER_ST    = 0xE,
  _ARM64_BARRIER_LD    = 0xD,
  _ARM64_BARRIER_ISH   = 0xB,
  _ARM64_BARRIER_ISHST = 0xA,
  _ARM64_BARRIER_ISHLD = 0x9,
  _ARM64_BARRIER_NSH   = 0x7,
  _ARM64_BARRIER_NSHST = 0x6,
  _ARM64_BARRIER_NSHLD = 0x5,
  _ARM64_BARRIER_OSH   = 0x3,
  _ARM64_BARRIER_OSHST = 0x2,
  _ARM64_BARRIER_OSHLD = 0x1
} _ARM64INTR_BARRIER_TYPE;

#endif /* __ARM64INTR_H */
#endif /* _MSC_VER */
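The enum values above are the 4-bit option encodings of the AArch64 DMB/DSB barrier instructions. A minimal usage sketch, assuming an MSVC-style target where this header's #else branch is taken and a __dmb intrinsic is available:

/* Hypothetical publish pair: order the data store before the flag
   store with an inner-shareable full barrier (DMB ISH = 0xB). */
void publish(int *data, volatile int *ready) {
  *data = 42;
  __dmb(_ARM64_BARRIER_ISH);
  *ready = 1;
}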
third_party/aarch64/clang/arm_acle.h (vendored, new file): 888 lines

@@ -0,0 +1,888 @@
/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 * The Arm C Language Extensions specifications can be found in the following
 * link: https://github.com/ARM-software/acle/releases
 *
 * The ACLE section numbers are subject to change. When consulting the
 * specifications, it is recommended to search using section titles if
 * the section numbers look outdated.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_ACLE_H
#define __ARM_ACLE_H

#ifndef __ARM_ACLE
#error "ACLE intrinsics support not enabled."
#endif

#include <stdint.h>

#if defined(__cplusplus)
extern "C" {
#endif

/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
#if !__has_builtin(__dmb)
#define __dmb(i) __builtin_arm_dmb(i)
#endif
#if !__has_builtin(__dsb)
#define __dsb(i) __builtin_arm_dsb(i)
#endif
#if !__has_builtin(__isb)
#define __isb(i) __builtin_arm_isb(i)
#endif

/* 7.4 Hints */

#if !__has_builtin(__wfi)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
  __builtin_arm_wfi();
}
#endif

#if !__has_builtin(__wfe)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
  __builtin_arm_wfe();
}
#endif

#if !__has_builtin(__sev)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
  __builtin_arm_sev();
}
#endif

#if !__has_builtin(__sevl)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
  __builtin_arm_sevl();
}
#endif

#if !__has_builtin(__yield)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
  __builtin_arm_yield();
}
#endif

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif

#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define _CHKFEAT_GCS 1
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__chkfeat(uint64_t __features) {
  return __builtin_arm_chkfeat(__features) ^ __features;
}
#endif

/* 7.5 Swap */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t v;
  do
    v = __builtin_arm_ldrex(__p);
  while (__builtin_arm_strex(__x, __p));
  return v;
}
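The __swp loop above retries a load-exclusive/store-exclusive pair until the store succeeds, which is how an atomic exchange is expressed without a native swap instruction. A minimal sketch of building a spinlock on top of it, assuming an ACLE-enabled target; lock_word and lock_acquire are hypothetical names:

static volatile uint32_t lock_word;  /* 0 = free, 1 = held */
static void lock_acquire(void) {
  while (__swp(1, &lock_word) != 0)
    __yield();  /* 7.4 hint: be polite to other hardware threads */
}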
/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif

/* 7.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif
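A usage sketch for the data-prefetch macros, assuming an AArch64 target and that the default __pldx arguments (0, 0, 0) select a read access targeting L1 with the keep policy, per ACLE:

/* Hypothetical streaming sum: prefetch a later element while summing
   the current one. BLOCK is an illustrative stride, not from the repo. */
#define BLOCK 8
long sum(const long *a, int n) {
  long s = 0;
  for (int i = 0; i < n; i++) {
    if (i + BLOCK < n) __pld(&a[i + BLOCK]);
    s += a[i];
  }
  return s;
}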
/* 7.7 NOP */
#if !defined(_MSC_VER) || (!defined(__aarch64__) && !defined(__arm64ec__))
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif

/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  __y %= 32;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (32 - __y));
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  __y %= 64;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (64 - __y));
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
#if __SIZEOF_LONG__ == 4
  return __ror(__x, __y);
#else
  return __rorll(__x, __y);
#endif
}


/* CLZ */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}

/* CLS */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}

/* REV */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return __builtin_bswap32(__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_bswap32(__t);
#else
  return __builtin_bswap64(__t);
#endif
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  return __builtin_bswap64(__t);
}

/* REV16 */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return __ror(__rev(__t), 16);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rev16(__t);
#else
  return __rev16ll(__t);
#endif
}

/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  return (int16_t)__builtin_bswap16((uint16_t)__t);
}

/* RBIT */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
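A few worked examples for the intrinsics above; the commented values follow directly from the definitions (byte swap, rotate, leading-zero count) and were checked by hand:

uint32_t examples(void) {
  uint32_t x = 0x11223344u;
  uint32_t a = __rev(x);            /* 0x44332211: reverse all 4 bytes */
  uint32_t b = __rev16(x);          /* 0x22114433: swap bytes per halfword */
  uint32_t c = __ror(x, 8);         /* 0x44112233: rotate right by 8 */
  unsigned d = __clz(0x00010000u);  /* 15: zero bits above bit 16 */
  return a ^ b ^ c ^ d;
}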
/* 8.3 16-bit multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif

/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif

/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
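These saturating forms clamp at the int32_t limits instead of wrapping. A one-line sketch, valid only where __ARM_FEATURE_DSP is defined (32-bit Arm profiles):

/* Plain addition of INT32_MAX + 1 wraps to INT32_MIN; __qadd pins the
   result at INT32_MAX and sets the sticky Q flag instead. */
int32_t accumulate(int32_t acc, int32_t sample) {
  return __qadd(acc, sample);
}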
/* 8.4.3 Accumulating multiplications */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif


/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif

/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif

/* 8.5.7 Parallel 8-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif

/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif

/* 8.5.9 Parallel 16-bit addition and subtraction */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif

/* 8.5.10 Parallel 16-bit multiplication */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
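The SIMD32 types above pack multiple lanes into one 32-bit register, so int16x2_t is two 16-bit values. A minimal sketch, again only meaningful where __ARM_FEATURE_SIMD32 is defined (32-bit Arm):

/* Hypothetical stereo sample add: low and high halfwords are summed
   independently, with no carry between the two lanes. */
int16x2_t add_stereo(int16x2_t l, int16x2_t r) {
  return __sadd16(l, r);
}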
/* 8.6 Floating-point data-processing intrinsics */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif

/* 8.8 CRC32 intrinsics */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
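A usage sketch for the CRC32 intrinsics, assuming the crc target feature is available. With the usual inverted-accumulator convention this computes the standard zlib-style CRC-32 (the __crc32c* variants use the Castagnoli polynomial instead):

uint32_t crc32_of(const uint8_t *p, unsigned long n) {
  uint32_t c = 0xffffffffu;
  while (n--) c = __crc32b(c, *p++);  /* byte at a time; __crc32w or
                                         __crc32d would fold wider chunks */
  return c ^ 0xffffffffu;
}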
/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif

/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif

/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
  uint64_t val[8];
} data512_t;

static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif

/* 11.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
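A sketch of the special-register accessors, assuming AArch64; the register name string is validated by the compiler, and the generic timer's virtual count below is typically readable from user space. This mirrors how __gcspr later in this file reads gcspr_el0:

static inline uint64_t read_virtual_counter(void) {
  return __arm_rsr64("cntvct_el0");  /* MRS Xn, CNTVCT_EL0 */
}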
/* 10.3 MTE intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 memcpy family of operations intrinsics - MOPS */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif

/* 11.3 Coprocessor Intrinsics */
#if defined(__ARM_FEATURE_COPROC)

#if (__ARM_FEATURE_COPROC & 0x1)

#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC

/* 17 Transactional Memory Extension (TME) Intrinsics */
#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
#define _TMFAILURE_CNCL 0x00010000u
#define _TMFAILURE_MEM 0x00020000u
#define _TMFAILURE_IMP 0x00040000u
#define _TMFAILURE_ERR 0x00080000u
#define _TMFAILURE_SIZE 0x00100000u
#define _TMFAILURE_NEST 0x00200000u
#define _TMFAILURE_DBG 0x00400000u
#define _TMFAILURE_INT 0x00800000u
#define _TMFAILURE_TRIVIAL 0x01000000u

#define __tstart() __builtin_arm_tstart()
#define __tcommit() __builtin_arm_tcommit()
#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
#define __ttest() __builtin_arm_ttest()

#endif /* __ARM_FEATURE_TME */

/* 8.7 Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
#endif
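A usage sketch for the RNDR intrinsics, assuming target("rand") hardware; per ACLE the functions return 0 when a valid random value was produced:

uint64_t random_u64(void) {
  uint64_t v;
  while (__rndr(&v) != 0)
    ;  /* nonzero: no entropy available this time, retry */
  return v;
}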
/* 11.2 Guarded Control Stack intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
static __inline__ void * __attribute__((__always_inline__, __nodebug__))
__gcspr() {
  return (void *)__builtin_arm_rsr64("gcspr_el0");
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcspopm() {
  return __builtin_arm_gcspopm(0);
}

static __inline__ const void * __attribute__((__always_inline__, __nodebug__, target("gcs")))
__gcsss(const void *__stack) {
  return __builtin_arm_gcsss(__stack);
}
#endif

#if defined(__cplusplus)
}
#endif

#endif /* __ARM_ACLE_H */
third_party/aarch64/clang/arm_bf16.h (vendored, new file): 20 lines

@@ -0,0 +1,20 @@
/*===---- arm_bf16.h - ARM BF16 intrinsics -----------------------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_BF16_H
#define __ARM_BF16_H

typedef __bf16 bfloat16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))


#undef __ai

#endif
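The header's only real payload is the storage type itself; a tiny check, assuming a Clang target where __bf16 is available:

/* bfloat16_t is storage-only: 1 sign, 8 exponent, 7 mantissa bits,
   i.e. the top half of an IEEE-754 binary32. */
_Static_assert(sizeof(bfloat16_t) == 2, "bfloat16_t is two bytes");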
third_party/aarch64/clang/arm_cde.h (vendored, new file): 410 lines

@@ -0,0 +1,410 @@
/*===---- arm_cde.h - ARM CDE intrinsics -----------------------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_CDE_H
#define __ARM_CDE_H

#if !__ARM_FEATURE_CDE
#error "CDE support not enabled"
#endif

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1)))
uint32_t __arm_cx1(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1a)))
uint32_t __arm_cx1a(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1d)))
uint64_t __arm_cx1d(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx1da)))
uint64_t __arm_cx1da(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2)))
uint32_t __arm_cx2(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2a)))
uint32_t __arm_cx2a(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2d)))
uint64_t __arm_cx2d(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx2da)))
uint64_t __arm_cx2da(int, uint64_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3)))
uint32_t __arm_cx3(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3a)))
uint32_t __arm_cx3a(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3d)))
uint64_t __arm_cx3d(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_cx3da)))
uint64_t __arm_cx3da(int, uint64_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1_u32)))
uint32_t __arm_vcx1_u32(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1a_u32)))
uint32_t __arm_vcx1a_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1d_u64)))
uint64_t __arm_vcx1d_u64(int, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1da_u64)))
uint64_t __arm_vcx1da_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2_u32)))
uint32_t __arm_vcx2_u32(int, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2a_u32)))
uint32_t __arm_vcx2a_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2d_u64)))
uint64_t __arm_vcx2d_u64(int, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx2da_u64)))
uint64_t __arm_vcx2da_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3_u32)))
uint32_t __arm_vcx3_u32(int, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3a_u32)))
uint32_t __arm_vcx3a_u32(int, uint32_t, uint32_t, uint32_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3d_u64)))
uint64_t __arm_vcx3d_u64(int, uint64_t, uint64_t, uint32_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx3da_u64)))
uint64_t __arm_vcx3da_u64(int, uint64_t, uint64_t, uint64_t, uint32_t);
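The scalar intrinsics above map onto the CX1..CX3 custom-datapath instructions; the int argument selects the coprocessor and the trailing immediate selects the operation, whose meaning is defined by the custom hardware. A hedged sketch, assuming a target built with a CDE coprocessor 0; custom_step is a hypothetical name:

/* Hypothetical accumulate step on coprocessor 0, immediate 1234.
   What CX1A computes is up to the implementer's custom datapath. */
uint32_t custom_step(uint32_t acc) {
  return __arm_cx1a(0, acc, 1234);
}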
#if __ARM_FEATURE_MVE

typedef uint16_t mve_pred16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) int16_t int16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) int32_t int32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) int64_t int64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) int8_t int8x16_t;
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) uint16_t uint16x8_t;
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) uint32_t uint32x4_t;
typedef __attribute__((__neon_vector_type__(2), __clang_arm_mve_strict_polymorphism)) uint64_t uint64x2_t;
typedef __attribute__((__neon_vector_type__(16), __clang_arm_mve_strict_polymorphism)) uint8_t uint8x16_t;

static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s16)))
int16x8_t __arm_vcx1q_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s32)))
int32x4_t __arm_vcx1q_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s64)))
int64x2_t __arm_vcx1q_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_s8)))
int8x16_t __arm_vcx1q_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u16)))
uint16x8_t __arm_vcx1q_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u32)))
uint32x4_t __arm_vcx1q_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u64)))
uint64x2_t __arm_vcx1q_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_u8)))
uint8x16_t __arm_vcx1q_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_u8)))
uint8x16_t __arm_vcx1q_u8(int, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s16)))
int16x8_t __arm_vcx1qa_m(int, int16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s32)))
int32x4_t __arm_vcx1qa_m(int, int32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s64)))
int64x2_t __arm_vcx1qa_m(int, int64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_s8)))
int8x16_t __arm_vcx1qa_m(int, int8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u16)))
uint16x8_t __arm_vcx1qa_m(int, uint16x8_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u32)))
uint32x4_t __arm_vcx1qa_m(int, uint32x4_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u64)))
uint64x2_t __arm_vcx1qa_m(int, uint64x2_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_u8)))
uint8x16_t __arm_vcx1qa_m(int, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s16)))
int16x8_t __arm_vcx1qa(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s32)))
int32x4_t __arm_vcx1qa(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s64)))
int64x2_t __arm_vcx1qa(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_s8)))
int8x16_t __arm_vcx1qa(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u16)))
uint16x8_t __arm_vcx1qa(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u32)))
uint32x4_t __arm_vcx1qa(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u64)))
uint64x2_t __arm_vcx1qa(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_u8)))
uint8x16_t __arm_vcx1qa(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s16)))
int16x8_t __arm_vcx2q_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s32)))
int32x4_t __arm_vcx2q_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s64)))
int64x2_t __arm_vcx2q_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_s8)))
int8x16_t __arm_vcx2q_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u16)))
uint16x8_t __arm_vcx2q_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u32)))
uint32x4_t __arm_vcx2q_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u64)))
uint64x2_t __arm_vcx2q_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_u8)))
uint8x16_t __arm_vcx2q_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s16)))
int16x8_t __arm_vcx2q(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s32)))
int32x4_t __arm_vcx2q(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s64)))
int64x2_t __arm_vcx2q(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_s8)))
int8x16_t __arm_vcx2q(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u16)))
uint16x8_t __arm_vcx2q(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u32)))
uint32x4_t __arm_vcx2q(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u64)))
uint64x2_t __arm_vcx2q(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8)))
uint8x16_t __arm_vcx2q(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s16)))
uint8x16_t __arm_vcx2q_u8(int, int16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s32)))
uint8x16_t __arm_vcx2q_u8(int, int32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s64)))
uint8x16_t __arm_vcx2q_u8(int, int64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_s8)))
uint8x16_t __arm_vcx2q_u8(int, int8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u16)))
uint8x16_t __arm_vcx2q_u8(int, uint16x8_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u32)))
uint8x16_t __arm_vcx2q_u8(int, uint32x4_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u64)))
uint8x16_t __arm_vcx2q_u8(int, uint64x2_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_u8)))
uint8x16_t __arm_vcx2q_u8(int, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s16)))
int16x8_t __arm_vcx2qa_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s32)))
int32x4_t __arm_vcx2qa_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s64)))
int64x2_t __arm_vcx2qa_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_s8)))
int8x16_t __arm_vcx2qa_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u16)))
uint16x8_t __arm_vcx2qa_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u32)))
uint32x4_t __arm_vcx2qa_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u64)))
uint64x2_t __arm_vcx2qa_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_u8)))
uint8x16_t __arm_vcx2qa_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s16)))
int16x8_t __arm_vcx2qa_m_impl(int, int16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s32)))
int32x4_t __arm_vcx2qa_m_impl(int, int32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s64)))
int64x2_t __arm_vcx2qa_m_impl(int, int64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_s8)))
int8x16_t __arm_vcx2qa_m_impl(int, int8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u16)))
uint16x8_t __arm_vcx2qa_m_impl(int, uint16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u32)))
uint32x4_t __arm_vcx2qa_m_impl(int, uint32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u64)))
uint64x2_t __arm_vcx2qa_m_impl(int, uint64x2_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_u8)))
uint8x16_t __arm_vcx2qa_m_impl(int, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s16)))
int16x8_t __arm_vcx3q_impl(int, int16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s32)))
int32x4_t __arm_vcx3q_impl(int, int32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s64)))
int64x2_t __arm_vcx3q_impl(int, int64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_s8)))
int8x16_t __arm_vcx3q_impl(int, int8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u16)))
uint16x8_t __arm_vcx3q_impl(int, uint16x8_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u32)))
uint32x4_t __arm_vcx3q_impl(int, uint32x4_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u64)))
uint64x2_t __arm_vcx3q_impl(int, uint64x2_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_u8)))
uint8x16_t __arm_vcx3q_impl(int, uint8x16_t, uint8x16_t, uint32_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s16)))
int16x8_t __arm_vcx3q_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s32)))
int32x4_t __arm_vcx3q_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s64)))
int64x2_t __arm_vcx3q_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_s8)))
int8x16_t __arm_vcx3q_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u16)))
uint16x8_t __arm_vcx3q_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u32)))
uint32x4_t __arm_vcx3q_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u64)))
uint64x2_t __arm_vcx3q_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx3q_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s64)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_s8)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, int8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u64)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint64x2_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_u8)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s16)))
|
||||
int16x8_t __arm_vcx3qa_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s32)))
|
||||
int32x4_t __arm_vcx3qa_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s64)))
|
||||
int64x2_t __arm_vcx3qa_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_s8)))
|
||||
int8x16_t __arm_vcx3qa_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u16)))
|
||||
uint16x8_t __arm_vcx3qa_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u32)))
|
||||
uint32x4_t __arm_vcx3qa_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u64)))
|
||||
uint64x2_t __arm_vcx3qa_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_u8)))
|
||||
uint8x16_t __arm_vcx3qa_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s16)))
|
||||
int16x8_t __arm_vcx3qa_m_impl(int, int16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s32)))
|
||||
int32x4_t __arm_vcx3qa_m_impl(int, int32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s64)))
|
||||
int64x2_t __arm_vcx3qa_m_impl(int, int64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_s8)))
|
||||
int8x16_t __arm_vcx3qa_m_impl(int, int8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u16)))
|
||||
uint16x8_t __arm_vcx3qa_m_impl(int, uint16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u32)))
|
||||
uint32x4_t __arm_vcx3qa_m_impl(int, uint32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u64)))
|
||||
uint64x2_t __arm_vcx3qa_m_impl(int, uint64x2_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_u8)))
|
||||
uint8x16_t __arm_vcx3qa_m_impl(int, uint8x16_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s16_u8)))
|
||||
int16x8_t __arm_vreinterpretq_s16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s32_u8)))
|
||||
int32x4_t __arm_vreinterpretq_s32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s64_u8)))
|
||||
int64x2_t __arm_vreinterpretq_s64_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_s8_u8)))
|
||||
int8x16_t __arm_vreinterpretq_s8_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u16_u8)))
|
||||
uint16x8_t __arm_vreinterpretq_u16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u32_u8)))
|
||||
uint32x4_t __arm_vreinterpretq_u32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u64_u8)))
|
||||
uint64x2_t __arm_vreinterpretq_u64_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int32x4_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s64)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int64x2_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_s8)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(int8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint32x4_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_u64)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint64x2_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vreinterpretq_u8_u8)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(uint8x16_t);
|
||||
#define __arm_vcx2q_m(cp, inactive, n, imm, pred) __arm_vcx2q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||
#define __arm_vcx2qa(cp, acc, n, imm) __arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))
|
||||
#define __arm_vcx2qa_m(cp, acc, n, imm, pred) __arm_vcx2qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm), (pred))
|
||||
#define __arm_vcx3q(cp, n, m, imm) __arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3q_m(cp, inactive, n, m, imm, pred) __arm_vcx3q_m_impl((cp), (inactive), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||
#define __arm_vcx3q_u8(cp, n, m, imm) __arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3qa(cp, acc, n, m, imm) __arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm))
|
||||
#define __arm_vcx3qa_m(cp, acc, n, m, imm, pred) __arm_vcx3qa_m_impl((cp), (acc), __arm_vreinterpretq_u8(n), __arm_vreinterpretq_u8(m), (imm), (pred))
|
||||
|
||||
#endif /* __ARM_FEATURE_MVE */
|
||||
|
||||
#if __ARM_FEATURE_MVE & 2
|
||||
|
||||
typedef __fp16 float16_t;
|
||||
typedef float float32_t;
|
||||
typedef __attribute__((__neon_vector_type__(8), __clang_arm_mve_strict_polymorphism)) float16_t float16x8_t;
|
||||
typedef __attribute__((__neon_vector_type__(4), __clang_arm_mve_strict_polymorphism)) float32_t float32x4_t;
|
||||
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f16)))
|
||||
float16x8_t __arm_vcx1q_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1q_m_f32)))
|
||||
float32x4_t __arm_vcx1q_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f16)))
|
||||
float16x8_t __arm_vcx1qa(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_f32)))
|
||||
float32x4_t __arm_vcx1qa(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f16)))
|
||||
float16x8_t __arm_vcx1qa_m(int, float16x8_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx1qa_m_f32)))
|
||||
float32x4_t __arm_vcx1qa_m(int, float32x4_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f16)))
|
||||
float16x8_t __arm_vcx2q(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_f32)))
|
||||
float32x4_t __arm_vcx2q(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f16)))
|
||||
float16x8_t __arm_vcx2q_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_m_impl_f32)))
|
||||
float32x4_t __arm_vcx2q_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f16)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, float16x8_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2q_u8_f32)))
|
||||
uint8x16_t __arm_vcx2q_u8(int, float32x4_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f16)))
|
||||
float16x8_t __arm_vcx2qa_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_impl_f32)))
|
||||
float32x4_t __arm_vcx2qa_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f16)))
|
||||
float16x8_t __arm_vcx2qa_m_impl(int, float16x8_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx2qa_m_impl_f32)))
|
||||
float32x4_t __arm_vcx2qa_m_impl(int, float32x4_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f16)))
|
||||
float16x8_t __arm_vcx3q_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_impl_f32)))
|
||||
float32x4_t __arm_vcx3q_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f16)))
|
||||
float16x8_t __arm_vcx3q_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_m_impl_f32)))
|
||||
float32x4_t __arm_vcx3q_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f16)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, float16x8_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3q_u8_impl_f32)))
|
||||
uint8x16_t __arm_vcx3q_u8_impl(int, float32x4_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f16)))
|
||||
float16x8_t __arm_vcx3qa_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_impl_f32)))
|
||||
float32x4_t __arm_vcx3qa_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f16)))
|
||||
float16x8_t __arm_vcx3qa_m_impl(int, float16x8_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_cde_vcx3qa_m_impl_f32)))
|
||||
float32x4_t __arm_vcx3qa_m_impl(int, float32x4_t, uint8x16_t, uint8x16_t, uint32_t, mve_pred16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f16_u8)))
|
||||
float16x8_t __arm_vreinterpretq_f16_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_f32_u8)))
|
||||
float32x4_t __arm_vreinterpretq_f32_u8(uint8x16_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f16)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(float16x8_t);
|
||||
static __inline__ __attribute__((__overloadable__, __clang_arm_builtin_alias(__builtin_arm_mve_vreinterpretq_u8_f32)))
|
||||
uint8x16_t __arm_vreinterpretq_u8(float32x4_t);
|
||||
|
||||
#endif /* __ARM_FEATURE_MVE & 2 */
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* __ARM_CDE_H */
|
||||
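For orientation, a minimal usage sketch of the polymorphic CDE macros declared above. It is not part of this diff, the function name is hypothetical, and it assumes an M-profile target built with something like -march=armv8.1-m.main+mve+cdecp0 (a coprocessor-0 custom datapath):

#include <arm_cde.h>
#include <arm_mve.h>

/* Accumulating custom op on coprocessor 0: __arm_vcx2qa() reinterprets
   the vector operand as uint8x16_t via __arm_vreinterpretq_u8() before
   dispatching to the matching __arm_vcx2qa_impl() overload. */
uint32x4_t accumulate_custom(uint32x4_t acc, uint32x4_t n) {
  return __arm_vcx2qa(0, acc, n, 42); /* coprocessor 0, immediate 42 */
}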
217
third_party/aarch64/clang/arm_cmse.h
vendored
Normal file
@@ -0,0 +1,217 @@
//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __ARM_CMSE_H
#define __ARM_CMSE_H

#if (__ARM_FEATURE_CMSE & 0x1)
#include <stddef.h>
#include <stdint.h>

#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
#define CMSE_AU_NONSECURE 2  /* checks if permissions have secure field unset */
#define CMSE_MPU_UNPRIV 4    /* sets T flag on TT instruction */
#define CMSE_MPU_READ 8      /* checks if read_ok field is set */
#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)

#define cmse_check_pointed_object(p, f) \
  cmse_check_address_range((p), sizeof(*(p)), (f))

#if defined(__cplusplus)
extern "C" {
#endif

typedef union {
  struct cmse_address_info {
#ifdef __ARM_BIG_ENDIAN
    /* __ARM_BIG_ENDIAN */
#if (__ARM_CMSE_SECURE_MODE)
    unsigned idau_region : 8;
    unsigned idau_region_valid : 1;
    unsigned secure : 1;
    unsigned nonsecure_readwrite_ok : 1;
    unsigned nonsecure_read_ok : 1;
#else
    unsigned : 12;
#endif
    unsigned readwrite_ok : 1;
    unsigned read_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region_valid : 1;
#else
    unsigned : 1;
#endif
    unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region : 8;
#else
    unsigned : 8;
#endif
    unsigned mpu_region : 8;

#else /* __ARM_LITTLE_ENDIAN */
    unsigned mpu_region : 8;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region : 8;
#else
    unsigned : 8;
#endif
    unsigned mpu_region_valid : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned sau_region_valid : 1;
#else
    unsigned : 1;
#endif
    unsigned read_ok : 1;
    unsigned readwrite_ok : 1;
#if (__ARM_CMSE_SECURE_MODE)
    unsigned nonsecure_read_ok : 1;
    unsigned nonsecure_readwrite_ok : 1;
    unsigned secure : 1;
    unsigned idau_region_valid : 1;
    unsigned idau_region : 8;
#else
    unsigned : 12;
#endif
#endif /*__ARM_LITTLE_ENDIAN */
  } flags;
  unsigned value;
} cmse_address_info_t;

static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TT(__p);
  return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTT(__p);
  return __u;
}

#if __ARM_CMSE_SECURE_MODE
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTA(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTA(__p);
  return __u;
}
static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
cmse_TTAT(void *__p) {
  cmse_address_info_t __u;
  __u.value = __builtin_arm_cmse_TTAT(__p);
  return __u;
}
#endif

#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))

#if __ARM_CMSE_SECURE_MODE
#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
#endif

static void *__attribute__((__always_inline__))
cmse_check_address_range(void *__pb, size_t __s, int __flags) {
  uintptr_t __begin = (uintptr_t)__pb;
  uintptr_t __end = __begin + __s - 1;

  if (__end < __begin)
    return NULL; /* wrap around check */

  /* Check whether the range crosses a 32-byte aligned address */
  const int __single_check = (__begin ^ __end) < 0x20u;

  /* execute the right variant of the TT instructions */
  void *__pe = (void *)__end;
  cmse_address_info_t __permb, __perme;
  switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
  case 0:
    __permb = cmse_TT(__pb);
    __perme = __single_check ? __permb : cmse_TT(__pe);
    break;
  case CMSE_MPU_UNPRIV:
    __permb = cmse_TTT(__pb);
    __perme = __single_check ? __permb : cmse_TTT(__pe);
    break;
#if __ARM_CMSE_SECURE_MODE
  case CMSE_MPU_NONSECURE:
    __permb = cmse_TTA(__pb);
    __perme = __single_check ? __permb : cmse_TTA(__pe);
    break;
  case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
    __permb = cmse_TTAT(__pb);
    __perme = __single_check ? __permb : cmse_TTAT(__pe);
    break;
#endif
  /* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
  default:
    return NULL;
  }

  /* check that the range does not cross MPU, SAU, or IDAU region boundaries */
  if (__permb.value != __perme.value)
    return NULL;
#if !(__ARM_CMSE_SECURE_MODE)
  /* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
  if (__flags & CMSE_AU_NONSECURE)
    return NULL;
#endif

  /* check the permission on the range */
  switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
#if (__ARM_CMSE_SECURE_MODE)
  case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
  case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
    return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;

  case CMSE_MPU_READ | CMSE_AU_NONSECURE:
    return __permb.flags.nonsecure_read_ok ? __pb : NULL;

  case CMSE_AU_NONSECURE:
    return __permb.flags.secure ? NULL : __pb;
#endif
  case CMSE_MPU_READ | CMSE_MPU_READWRITE:
  case CMSE_MPU_READWRITE:
    return __permb.flags.readwrite_ok ? __pb : NULL;

  case CMSE_MPU_READ:
    return __permb.flags.read_ok ? __pb : NULL;

  default:
    return NULL;
  }
}

#if __ARM_CMSE_SECURE_MODE
static int __attribute__((__always_inline__, __nodebug__))
cmse_nonsecure_caller(void) {
  return !((uintptr_t)__builtin_return_address(0) & 1);
}

#define cmse_nsfptr_create(p) \
  __builtin_bit_cast(__typeof__(p), \
                     (__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))

#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)

#endif /* __ARM_CMSE_SECURE_MODE */

void __attribute__((__noreturn__)) cmse_abort(void);
#if defined(__cplusplus)
}
#endif

#endif /* (__ARM_FEATURE_CMSE & 0x1) */

#endif /* __ARM_CMSE_H */
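As a reading aid, a short usage sketch for cmse_check_address_range() as declared above. It is not part of this diff, the function name is hypothetical, and it assumes a CMSE-enabled M-profile toolchain (compiled with -mcmse):

#include <arm_cmse.h>
#include <string.h>

/* Copy n bytes out of a buffer handed over by non-secure code, but only
   after verifying the whole range is non-secure readable; both endpoints
   of the range are probed via the TT/TTA instruction variants above. */
int secure_copy_in(void *dst, void *src, size_t n) {
  if (cmse_check_address_range(src, n, CMSE_MPU_READ | CMSE_NONSECURE) == NULL)
    return -1; /* secure memory, bad permissions, or region boundary crossed */
  memcpy(dst, src, n);
  return 0;
}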
596
third_party/aarch64/clang/arm_fp16.h
vendored
Normal file
@@ -0,0 +1,596 @@
/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ARM_FP16_H
#define __ARM_FP16_H

#include <stdint.h>

typedef __fp16 float16_t;
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

#if defined(__aarch64__) || defined(__arm64ec__)
#define vabdh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \
  __ret; \
})
#define vabsh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \
  __ret; \
})
#define vaddh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \
  __ret; \
})
#define vcageh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \
  __ret; \
})
#define vcagth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \
  __ret; \
})
#define vcaleh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \
  __ret; \
})
#define vcalth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \
  __ret; \
})
#define vceqh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \
  __ret; \
})
#define vceqzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \
  __ret; \
})
#define vcgeh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \
  __ret; \
})
#define vcgezh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \
  __ret; \
})
#define vcgth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \
  __ret; \
})
#define vcgtzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \
  __ret; \
})
#define vcleh_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \
  __ret; \
})
#define vclezh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \
  __ret; \
})
#define vclth_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \
  __ret; \
})
#define vcltzh_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \
  __ret; \
})
#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \
  __ret; \
})
#define vcvth_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \
  __ret; \
})
#define vcvth_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \
  __ret; \
})
#define vcvth_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \
  __ret; \
})
#define vcvth_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \
  __ret; \
})
#define vcvth_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \
  __ret; \
})
#define vcvth_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \
  __ret; \
})
#define vcvtah_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \
  __ret; \
})
#define vcvtah_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \
  __ret; \
})
#define vcvtah_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \
  __ret; \
})
#define vcvtah_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \
  __ret; \
})
#define vcvtah_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \
  __ret; \
})
#define vcvtah_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \
  __ret; \
})
#define vcvth_f16_u16(__p0) __extension__ ({ \
  float16_t __ret; \
  uint16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__s0); \
  __ret; \
})
#define vcvth_f16_s16(__p0) __extension__ ({ \
  float16_t __ret; \
  int16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_s16(__s0); \
  __ret; \
})
#define vcvth_f16_u32(__p0) __extension__ ({ \
  float16_t __ret; \
  uint32_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__s0); \
  __ret; \
})
#define vcvth_f16_s32(__p0) __extension__ ({ \
  float16_t __ret; \
  int32_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__s0); \
  __ret; \
})
#define vcvth_f16_u64(__p0) __extension__ ({ \
  float16_t __ret; \
  uint64_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__s0); \
  __ret; \
})
#define vcvth_f16_s64(__p0) __extension__ ({ \
  float16_t __ret; \
  int64_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__s0); \
  __ret; \
})
#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint32_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \
  __ret; \
})
#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int32_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \
  __ret; \
})
#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint64_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \
  __ret; \
})
#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int64_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \
  __ret; \
})
#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  uint16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \
  __ret; \
})
#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  int16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \
  __ret; \
})
#define vcvtmh_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \
  __ret; \
})
#define vcvtmh_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \
  __ret; \
})
#define vcvtmh_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \
  __ret; \
})
#define vcvtmh_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \
  __ret; \
})
#define vcvtmh_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \
  __ret; \
})
#define vcvtmh_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \
  __ret; \
})
#define vcvtnh_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \
  __ret; \
})
#define vcvtnh_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \
  __ret; \
})
#define vcvtnh_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \
  __ret; \
})
#define vcvtnh_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \
  __ret; \
})
#define vcvtnh_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \
  __ret; \
})
#define vcvtnh_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \
  __ret; \
})
#define vcvtph_s16_f16(__p0) __extension__ ({ \
  int16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \
  __ret; \
})
#define vcvtph_s32_f16(__p0) __extension__ ({ \
  int32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \
  __ret; \
})
#define vcvtph_s64_f16(__p0) __extension__ ({ \
  int64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \
  __ret; \
})
#define vcvtph_u16_f16(__p0) __extension__ ({ \
  uint16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \
  __ret; \
})
#define vcvtph_u32_f16(__p0) __extension__ ({ \
  uint32_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \
  __ret; \
})
#define vcvtph_u64_f16(__p0) __extension__ ({ \
  uint64_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \
  __ret; \
})
#define vdivh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \
  __ret; \
})
#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  float16_t __s2 = __p2; \
  __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \
  __ret; \
})
#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  float16_t __s2 = __p2; \
  __ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \
  __ret; \
})
#define vmaxh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \
  __ret; \
})
#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \
  __ret; \
})
#define vminh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \
  __ret; \
})
#define vminnmh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \
  __ret; \
})
#define vmulh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \
  __ret; \
})
#define vmulxh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \
  __ret; \
})
#define vnegh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \
  __ret; \
})
#define vrecpeh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \
  __ret; \
})
#define vrecpsh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \
  __ret; \
})
#define vrecpxh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \
  __ret; \
})
#define vrndh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \
  __ret; \
})
#define vrndah_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \
  __ret; \
})
#define vrndih_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \
  __ret; \
})
#define vrndmh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \
  __ret; \
})
#define vrndnh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \
  __ret; \
})
#define vrndph_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \
  __ret; \
})
#define vrndxh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \
  __ret; \
})
#define vrsqrteh_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \
  __ret; \
})
#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \
  __ret; \
})
#define vsqrth_f16(__p0) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \
  __ret; \
})
#define vsubh_f16(__p0, __p1) __extension__ ({ \
  float16_t __ret; \
  float16_t __s0 = __p0; \
  float16_t __s1 = __p1; \
  __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \
  __ret; \
})
#endif

#undef __ai

#endif /* __ARM_FP16_H */
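A small illustrative snippet (not part of the diff; the helper name is hypothetical) showing the scalar half-precision intrinsics above on an AArch64 target with FP16 support:

#include <arm_fp16.h>

/* Fused multiply-add in half precision, then round to nearest with
   ties to even: vfmah_f16(a, b, c) computes a + b * c. */
float16_t half_fma_rounded(float16_t acc, float16_t x, float16_t y) {
  return vrndnh_f16(vfmah_f16(acc, x, y));
}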
19187
third_party/aarch64/clang/arm_mve.h
vendored
Normal file
File diff suppressed because it is too large
69638
third_party/aarch64/clang/arm_neon.h
vendored
Normal file
File diff suppressed because it is too large
182
third_party/aarch64/clang/arm_neon_sve_bridge.h
vendored
Normal file
@@ -0,0 +1,182 @@
/*===---- arm_neon_sve_bridge.h - ARM NEON SVE Bridge intrinsics -----------===
|
||||
*
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __ARM_NEON_SVE_BRIDGE_H
|
||||
#define __ARM_NEON_SVE_BRIDGE_H
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Function attributes */
|
||||
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
|
||||
#define __aio \
|
||||
static __inline__ \
|
||||
__attribute__((__always_inline__, __nodebug__, __overloadable__))
|
||||
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||
svint8_t svset_neonq(svint8_t, int8x16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||
svint16_t svset_neonq(svint16_t, int16x8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||
svint32_t svset_neonq(svint32_t, int32x4_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||
svint64_t svset_neonq(svint64_t, int64x2_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||
svuint8_t svset_neonq(svuint8_t, uint8x16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||
svuint16_t svset_neonq(svuint16_t, uint16x8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||
svuint32_t svset_neonq(svuint32_t, uint32x4_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||
svuint64_t svset_neonq(svuint64_t, uint64x2_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||
svfloat16_t svset_neonq(svfloat16_t, float16x8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||
svfloat32_t svset_neonq(svfloat32_t, float32x4_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||
svfloat64_t svset_neonq(svfloat64_t, float64x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s8)))
|
||||
svint8_t svset_neonq_s8(svint8_t, int8x16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s16)))
|
||||
svint16_t svset_neonq_s16(svint16_t, int16x8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s32)))
|
||||
svint32_t svset_neonq_s32(svint32_t, int32x4_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_s64)))
|
||||
svint64_t svset_neonq_s64(svint64_t, int64x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u8)))
|
||||
svuint8_t svset_neonq_u8(svuint8_t, uint8x16_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u16)))
|
||||
svuint16_t svset_neonq_u16(svuint16_t, uint16x8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u32)))
|
||||
svuint32_t svset_neonq_u32(svuint32_t, uint32x4_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_u64)))
|
||||
svuint64_t svset_neonq_u64(svuint64_t, uint64x2_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f16)))
|
||||
svfloat16_t svset_neonq_f16(svfloat16_t, float16x8_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f32)))
|
||||
svfloat32_t svset_neonq_f32(svfloat32_t, float32x4_t);
|
||||
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_f64)))
|
||||
svfloat64_t svset_neonq_f64(svfloat64_t, float64x2_t);
|
||||
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
|
||||
int8x16_t svget_neonq(svint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
|
||||
int16x8_t svget_neonq(svint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
|
||||
int32x4_t svget_neonq(svint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
|
||||
int64x2_t svget_neonq(svint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
|
||||
uint8x16_t svget_neonq(svuint8_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
|
||||
uint16x8_t svget_neonq(svuint16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
|
||||
uint32x4_t svget_neonq(svuint32_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
|
||||
uint64x2_t svget_neonq(svuint64_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
|
||||
float16x8_t svget_neonq(svfloat16_t);
|
||||
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq(svfloat32_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq(svfloat64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s8)))
int8x16_t svget_neonq_s8(svint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s16)))
int16x8_t svget_neonq_s16(svint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s32)))
int32x4_t svget_neonq_s32(svint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_s64)))
int64x2_t svget_neonq_s64(svint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u8)))
uint8x16_t svget_neonq_u8(svuint8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u16)))
uint16x8_t svget_neonq_u16(svuint16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u32)))
uint32x4_t svget_neonq_u32(svuint32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_u64)))
uint64x2_t svget_neonq_u64(svuint64_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f16)))
float16x8_t svget_neonq_f16(svfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f32)))
float32x4_t svget_neonq_f32(svfloat32_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_f64)))
float64x2_t svget_neonq_f64(svfloat64_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq(int8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq(int16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq(int32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq(int64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq(uint8x16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq(uint16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq(uint32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq(uint64x2_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq(float16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq(float32x4_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq(float64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s8)))
svint8_t svdup_neonq_s8(int8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s16)))
svint16_t svdup_neonq_s16(int16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s32)))
svint32_t svdup_neonq_s32(int32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_s64)))
svint64_t svdup_neonq_s64(int64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u8)))
svuint8_t svdup_neonq_u8(uint8x16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u16)))
svuint16_t svdup_neonq_u16(uint16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u32)))
svuint32_t svdup_neonq_u32(uint32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_u64)))
svuint64_t svdup_neonq_u64(uint64x2_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f16)))
svfloat16_t svdup_neonq_f16(float16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f32)))
svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);

__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq_bf16(svbfloat16_t, bfloat16x8_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq(svbfloat16_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svget_neonq_bf16)))
bfloat16x8_t svget_neonq_bf16(svbfloat16_t);
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);

#undef __ai
#undef __aio

#ifdef __cplusplus
} // extern "C"
#endif

#endif //__ARM_NEON_SVE_BRIDGE_H
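This bridge header is the glue that lets fixed-width NEON values flow into length-agnostic SVE code. A minimal sketch of typical usage, assuming compilation with SVE enabled (the function name and -march flag are illustrative, not part of this diff):

#include <arm_neon_sve_bridge.h>

/* Broadcast a 128-bit NEON constant across the whole SVE register, then
   operate on it with ordinary SVE arithmetic. */
svfloat32_t scale_by_neon_vector(svfloat32_t x, float32x4_t k) {
  svfloat32_t kk = svdup_neonq_f32(k);       /* NEON -> SVE broadcast */
  return svmul_f32_x(svptrue_b32(), x, kk);  /* plain SVE multiply */
}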
2819
third_party/aarch64/clang/arm_sme.h
vendored
Normal file
File diff suppressed because it is too large
30537
third_party/aarch64/clang/arm_sve.h
vendored
Normal file
File diff suppressed because it is too large
345
third_party/aarch64/clang/arm_vector_types.h
vendored
Normal file
@ -0,0 +1,345 @@
/*===---- arm_vector_types - ARM vector type ------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
#endif

#ifndef __ARM_NEON_TYPES_H
#define __ARM_NEON_TYPES_H
typedef float float32_t;
typedef __fp16 float16_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef double float64_t;
#endif

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
#endif

typedef struct int8x8x2_t { int8x8_t val[2]; } int8x8x2_t;
typedef struct int8x16x2_t { int8x16_t val[2]; } int8x16x2_t;
typedef struct int16x4x2_t { int16x4_t val[2]; } int16x4x2_t;
typedef struct int16x8x2_t { int16x8_t val[2]; } int16x8x2_t;
typedef struct int32x2x2_t { int32x2_t val[2]; } int32x2x2_t;
typedef struct int32x4x2_t { int32x4_t val[2]; } int32x4x2_t;
typedef struct int64x1x2_t { int64x1_t val[2]; } int64x1x2_t;
typedef struct int64x2x2_t { int64x2_t val[2]; } int64x2x2_t;
typedef struct uint8x8x2_t { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct uint8x16x2_t { uint8x16_t val[2]; } uint8x16x2_t;
typedef struct uint16x4x2_t { uint16x4_t val[2]; } uint16x4x2_t;
typedef struct uint16x8x2_t { uint16x8_t val[2]; } uint16x8x2_t;
typedef struct uint32x2x2_t { uint32x2_t val[2]; } uint32x2x2_t;
typedef struct uint32x4x2_t { uint32x4_t val[2]; } uint32x4x2_t;
typedef struct uint64x1x2_t { uint64x1_t val[2]; } uint64x1x2_t;
typedef struct uint64x2x2_t { uint64x2_t val[2]; } uint64x2x2_t;
typedef struct float16x4x2_t { float16x4_t val[2]; } float16x4x2_t;
typedef struct float16x8x2_t { float16x8_t val[2]; } float16x8x2_t;
typedef struct float32x2x2_t { float32x2_t val[2]; } float32x2x2_t;
typedef struct float32x4x2_t { float32x4_t val[2]; } float32x4x2_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x2_t { float64x1_t val[2]; } float64x1x2_t;
typedef struct float64x2x2_t { float64x2_t val[2]; } float64x2x2_t;
#endif

typedef struct int8x8x3_t { int8x8_t val[3]; } int8x8x3_t;
typedef struct int8x16x3_t { int8x16_t val[3]; } int8x16x3_t;
typedef struct int16x4x3_t { int16x4_t val[3]; } int16x4x3_t;
typedef struct int16x8x3_t { int16x8_t val[3]; } int16x8x3_t;
typedef struct int32x2x3_t { int32x2_t val[3]; } int32x2x3_t;
typedef struct int32x4x3_t { int32x4_t val[3]; } int32x4x3_t;
typedef struct int64x1x3_t { int64x1_t val[3]; } int64x1x3_t;
typedef struct int64x2x3_t { int64x2_t val[3]; } int64x2x3_t;
typedef struct uint8x8x3_t { uint8x8_t val[3]; } uint8x8x3_t;
typedef struct uint8x16x3_t { uint8x16_t val[3]; } uint8x16x3_t;
typedef struct uint16x4x3_t { uint16x4_t val[3]; } uint16x4x3_t;
typedef struct uint16x8x3_t { uint16x8_t val[3]; } uint16x8x3_t;
typedef struct uint32x2x3_t { uint32x2_t val[3]; } uint32x2x3_t;
typedef struct uint32x4x3_t { uint32x4_t val[3]; } uint32x4x3_t;
typedef struct uint64x1x3_t { uint64x1_t val[3]; } uint64x1x3_t;
typedef struct uint64x2x3_t { uint64x2_t val[3]; } uint64x2x3_t;
typedef struct float16x4x3_t { float16x4_t val[3]; } float16x4x3_t;
typedef struct float16x8x3_t { float16x8_t val[3]; } float16x8x3_t;
typedef struct float32x2x3_t { float32x2_t val[3]; } float32x2x3_t;
typedef struct float32x4x3_t { float32x4_t val[3]; } float32x4x3_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x3_t { float64x1_t val[3]; } float64x1x3_t;
typedef struct float64x2x3_t { float64x2_t val[3]; } float64x2x3_t;
#endif

typedef struct int8x8x4_t { int8x8_t val[4]; } int8x8x4_t;
typedef struct int8x16x4_t { int8x16_t val[4]; } int8x16x4_t;
typedef struct int16x4x4_t { int16x4_t val[4]; } int16x4x4_t;
typedef struct int16x8x4_t { int16x8_t val[4]; } int16x8x4_t;
typedef struct int32x2x4_t { int32x2_t val[4]; } int32x2x4_t;
typedef struct int32x4x4_t { int32x4_t val[4]; } int32x4x4_t;
typedef struct int64x1x4_t { int64x1_t val[4]; } int64x1x4_t;
typedef struct int64x2x4_t { int64x2_t val[4]; } int64x2x4_t;
typedef struct uint8x8x4_t { uint8x8_t val[4]; } uint8x8x4_t;
typedef struct uint8x16x4_t { uint8x16_t val[4]; } uint8x16x4_t;
typedef struct uint16x4x4_t { uint16x4_t val[4]; } uint16x4x4_t;
typedef struct uint16x8x4_t { uint16x8_t val[4]; } uint16x8x4_t;
typedef struct uint32x2x4_t { uint32x2_t val[4]; } uint32x2x4_t;
typedef struct uint32x4x4_t { uint32x4_t val[4]; } uint32x4x4_t;
typedef struct uint64x1x4_t { uint64x1_t val[4]; } uint64x1x4_t;
typedef struct uint64x2x4_t { uint64x2_t val[4]; } uint64x2x4_t;
typedef struct float16x4x4_t { float16x4_t val[4]; } float16x4x4_t;
typedef struct float16x8x4_t { float16x8_t val[4]; } float16x8x4_t;
typedef struct float32x2x4_t { float32x2_t val[4]; } float32x2x4_t;
typedef struct float32x4x4_t { float32x4_t val[4]; } float32x4x4_t;
#if defined(__aarch64__) || defined(__arm64ec__)
typedef struct float64x1x4_t { float64x1_t val[4]; } float64x1x4_t;
typedef struct float64x2x4_t { float64x2_t val[4]; } float64x2x4_t;
#endif

typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;

typedef struct bfloat16x4x2_t { bfloat16x4_t val[2]; } bfloat16x4x2_t;
typedef struct bfloat16x8x2_t { bfloat16x8_t val[2]; } bfloat16x8x2_t;
typedef struct bfloat16x4x3_t { bfloat16x4_t val[3]; } bfloat16x4x3_t;
typedef struct bfloat16x8x3_t { bfloat16x8_t val[3]; } bfloat16x8x3_t;
typedef struct bfloat16x4x4_t { bfloat16x4_t val[4]; } bfloat16x4x4_t;
typedef struct bfloat16x8x4_t { bfloat16x8_t val[4]; } bfloat16x8x4_t;

#endif // __ARM_NEON_TYPES_H
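The NxM "array of vectors" structs above exist mainly to carry the results of de-interleaving loads and stores. A small sketch, assuming an AArch64 target with <arm_neon.h> (the function name is illustrative):

#include <arm_neon.h>

/* vld2_s16 loads eight int16s and de-interleaves them: out->val[0] gets
   samples 0,2,4,6 and out->val[1] gets samples 1,3,5,7. */
void split_stereo(const int16_t *samples, int16x4x2_t *out) {
  *out = vld2_s16(samples);
}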
31
third_party/aarch64/clang/armintr.h
vendored
Normal file
@ -0,0 +1,31 @@
/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Only include this if we're compiling for the windows platform. */
#ifndef _MSC_VER
#include_next <armintr.h>
#else

#ifndef __ARMINTR_H
#define __ARMINTR_H

typedef enum
{
  _ARM_BARRIER_SY = 0xF,
  _ARM_BARRIER_ST = 0xE,
  _ARM_BARRIER_ISH = 0xB,
  _ARM_BARRIER_ISHST = 0xA,
  _ARM_BARRIER_NSH = 0x7,
  _ARM_BARRIER_NSHST = 0x6,
  _ARM_BARRIER_OSH = 0x3,
  _ARM_BARRIER_OSHST = 0x2
} _ARMINTR_BARRIER_TYPE;

#endif /* __ARMINTR_H */
#endif /* _MSC_VER */
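These enumerators are the argument values taken by MSVC's ARM barrier intrinsics such as __dmb (data memory barrier). A hedged sketch of the intended usage on an MSVC ARM target (names are illustrative):

#include <armintr.h>

/* Publish data before raising a flag: inner-shareable full barrier
   between the data store and the flag store. */
void publish(int *data, int *flag) {
  *data = 42;
  __dmb(_ARM_BARRIER_ISH);
  *flag = 1;
}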
2
third_party/awk/run.c
vendored
@ -495,7 +495,7 @@ makearraystring(Node *p, const char *func)

 	if (!adjbuf(&buf, &bufsz, tlen + 1, recsize, 0, func)) {
 		FATAL("%s: out of memory %s[%s...]",
-		      func, x->nval, buf);
+		      func ? func : "NULL", x->nval, buf);
 	}
 	memcpy(buf + blen, s, slen);
 	if (nsub) {
3
third_party/double-conversion/BUILD.mk
vendored
@ -34,7 +34,8 @@ THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS = \
 	LIBC_MEM \
 	LIBC_STR \
 	LIBC_TINYMATH \
-	THIRD_PARTY_LIBCXXABI
+	THIRD_PARTY_LIBCXXABI \
+	THIRD_PARTY_LIBUNWIND

 THIRD_PARTY_DOUBLECONVERSION_A_DEPS := \
 	$(call uniq,$(foreach x,$(THIRD_PARTY_DOUBLECONVERSION_A_DIRECTDEPS),$($(x))))
2
third_party/intel/BUILD.mk
vendored
@ -3,4 +3,4 @@

 PKGS += THIRD_PARTY_INTEL
 THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
-THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)
+THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*) $(wildcard third_party/intel/clang/*)
140
third_party/intel/clang/__wmmintrin_aes.h
vendored
Normal file
@ -0,0 +1,140 @@
/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif

#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))

/// Performs a single round of AES encryption using the Equivalent
///    Inverse Cipher, transforming the state value from the first source
///    operand using a 128-bit round key value contained in the second source
///    operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenc_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}

/// Performs the final round of AES encryption using the Equivalent
///    Inverse Cipher, transforming the state value from the first source
///    operand using a 128-bit round key value contained in the second source
///    operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the encrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesenclast_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}

/// Performs a single round of AES decryption using the Equivalent
///    Inverse Cipher, transforming the state value from the first source
///    operand using a 128-bit round key value contained in the second source
///    operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdec_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}

/// Performs the final round of AES decryption using the Equivalent
///    Inverse Cipher, transforming the state value from the first source
///    operand using a 128-bit round key value contained in the second source
///    operand, and writes the result to the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the state value.
/// \param __R
///    A 128-bit integer vector containing the round key value.
/// \returns A 128-bit integer vector containing the decrypted value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesdeclast_si128(__m128i __V, __m128i __R)
{
  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}

/// Applies the AES InvMixColumns() transformation to an expanded key
///    contained in the source operand, and writes the result to the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
///
/// \param __V
///    A 128-bit integer vector containing the expanded key.
/// \returns A 128-bit integer vector containing the transformed value.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_aesimc_si128(__m128i __V)
{
  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}

/// Generates a round key for AES encryption, operating on 128-bit data
///    specified in the first source operand and using an 8-bit round constant
///    specified by the second source operand, and writes the result to the
///    destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
/// \endcode
///
/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
///
/// \param C
///    A 128-bit integer vector that is used to generate the AES encryption key.
/// \param R
///    An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))

#undef __DEFAULT_FN_ATTRS

#endif /* __WMMINTRIN_AES_H */
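Each of these intrinsics performs one AES round; a full AES-128 block encryption chains ten of them. A minimal sketch, assuming a pre-expanded key schedule rk[0..10] and compilation with -maes (the function name is illustrative):

#include <wmmintrin.h>

__m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);         /* initial AddRoundKey */
  for (int i = 1; i < 10; ++i)
    block = _mm_aesenc_si128(block, rk[i]);    /* rounds 1..9 */
  return _mm_aesenclast_si128(block, rk[10]);  /* last round, no MixColumns */
}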
48
third_party/intel/clang/__wmmintrin_pclmul.h
vendored
Normal file
@ -0,0 +1,48 @@
/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif

#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H

/// Multiplies two 64-bit integer values, which are selected from source
///    operands using the immediate-value operand. The multiplication is a
///    carry-less multiplication, and the 128-bit integer product is stored in
///    the destination.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param Y
///    A 128-bit vector of [2 x i64] containing one of the source operands.
/// \param I
///    An immediate value specifying which 64-bit values to select from the
///    operands. Bit 0 is used to select a value from operand \a X, and bit
///    4 is used to select a value from operand \a Y: \n
///    Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
///    Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
///    Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
///    Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
///    multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(X, Y, I) \
  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
                                        (__v2di)(__m128i)(Y), (char)(I)))

#endif /* __WMMINTRIN_PCLMUL_H */
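A short sketch of the common case, multiplying the low 64-bit lanes of two operands (the building block of GHASH and CRC folding); assumes -mpclmul and illustrative names:

#include <wmmintrin.h>

/* Carry-less (GF(2)[x]) multiply of two 64-bit values; immediate 0x00
   selects the low qword of each operand. */
__m128i clmul_lo(unsigned long long a, unsigned long long b) {
  __m128i va = _mm_set_epi64x(0, (long long)a);
  __m128i vb = _mm_set_epi64x(0, (long long)b);
  return _mm_clmulepi64_si128(va, vb, 0x00);
}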
160
third_party/intel/clang/adcintrin.h
vendored
Normal file
@ -0,0 +1,160 @@
/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __ADCINTRIN_H
#define __ADCINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
///    at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 32-bit unsigned addend.
/// \param __y
///    A 32-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
                                                        unsigned int __x,
                                                        unsigned int __y,
                                                        unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}

/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
///    and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    The 32-bit unsigned minuend.
/// \param __y
///    The 32-bit unsigned subtrahend.
/// \param __p
///    Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
                                                         unsigned int __x,
                                                         unsigned int __y,
                                                         unsigned int *__p) {
  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
}

#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
///    at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADC instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 64-bit unsigned addend.
/// \param __y
///    A 64-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarry_u64(unsigned char __cf, unsigned long long __x,
              unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}

/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
///    and returns the 8-bit carry-out (carry or overflow flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x - (__y + temp))
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c SBB instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    The 64-bit unsigned minuend.
/// \param __y
///    The 64-bit unsigned subtrahend.
/// \param __p
///    Pointer to memory for storing the difference.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_subborrow_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
}
#endif

#if defined(__cplusplus)
}
#endif

#undef __INLINE
#undef __DEFAULT_FN_ATTRS

#endif /* __ADCINTRIN_H */
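The point of threading the carry flag through these intrinsics is multi-precision arithmetic. A minimal sketch of a 128-bit add built from two 64-bit limbs (x86-64 only; the function name is illustrative):

#include <immintrin.h>

void add128(const unsigned long long a[2], const unsigned long long b[2],
            unsigned long long out[2]) {
  unsigned char c = _addcarry_u64(0, a[0], b[0], &out[0]); /* low limb */
  _addcarry_u64(c, a[1], b[1], &out[1]);                   /* high limb + carry */
}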
102
third_party/intel/clang/adxintrin.h
vendored
Normal file
@ -0,0 +1,102 @@
/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __ADXINTRIN_H
#define __ADXINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("adx")))

/* Use C++ inline semantics in C++, GNU inline for C mode. */
#if defined(__cplusplus)
#define __INLINE __inline
#else
#define __INLINE static __inline
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/* Intrinsics that are available only if __ADX__ is defined. */

/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
///    at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store32(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 32-bit unsigned addend.
/// \param __y
///    A 32-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
                                                         unsigned int __x,
                                                         unsigned int __y,
                                                         unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
}

#ifdef __x86_64__
/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
///    at \a __p, and returns the 8-bit carry-out (carry flag).
///
/// \code{.operation}
/// temp := (__cf == 0) ? 0 : 1
/// Store64(__p, __x + __y + temp)
/// result := CF
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c ADCX instruction.
///
/// \param __cf
///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
/// \param __x
///    A 64-bit unsigned addend.
/// \param __y
///    A 64-bit unsigned addend.
/// \param __p
///    Pointer to memory for storing the sum.
/// \returns The 8-bit unsigned carry-out value.
__INLINE unsigned char __DEFAULT_FN_ATTRS
_addcarryx_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
}
#endif

#if defined(__cplusplus)
}
#endif

#undef __INLINE
#undef __DEFAULT_FN_ATTRS

#endif /* __ADXINTRIN_H */
183
third_party/intel/clang/ammintrin.h
vendored
Normal file
@ -0,0 +1,183 @@
/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include "pmmintrin.h"

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))

/// Extracts the specified bits from the lower 64 bits of the 128-bit
///    integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param x
///    The value from which bits are extracted.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a x are extracted. If the length is zero but the
///    index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
///    extracted from the source operand.
#define _mm_extracti_si64(x, len, idx) \
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                  (char)(len), (char)(idx)))

/// Extracts the specified bits from the lower 64 bits of the 128-bit
///    integer vector operand at the index and of the length specified by
///    \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
///
/// \param __x
///    The value from which bits are extracted.
/// \param __y
///    Specifies the index of the least significant bit at [13:8] and the
///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
///    length is interpreted as 64. If the sum of the index and length is
///    greater than 64, the result is undefined. If the length and index are
///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
///    from the source operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_extract_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}

/// Inserts bits of a specified length from the source integer vector
///    \a y into the lower 64 bits of the destination integer vector \a x at
///    the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
///                          const int idx);
/// \endcode
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length \a len and by the index \a idx specifying the
///    least significant bit.
/// \param y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand \a y of length \a len.
/// \param len
///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
///    are zero, the length is interpreted as 64.
/// \param idx
///    Bits [5:0] specify the index of the least significant bit; the other
///    bits are ignored. If the sum of the index and length is greater than 64,
///    the result is undefined. If the length and index are both zero, bits
///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
///    is zero but the index is non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
///    destination operand \a x with the specified bitfields replaced by the
///    lower bits of source operand \a y. The upper 64 bits of the return value
///    are undefined.
#define _mm_inserti_si64(x, y, len, idx) \
  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                    (__v2di)(__m128i)(y), \
                                    (char)(len), (char)(idx)))

/// Inserts bits of a specified length from the source integer vector
///    \a __y into the lower 64 bits of the destination integer vector \a __x
///    at the index and of the length specified by \a __y.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
///
/// \param __x
///    The destination operand where bits will be inserted. The inserted bits
///    are defined by the length and by the index of the least significant bit
///    specified by operand \a __y.
/// \param __y
///    The source operand containing the bits to be extracted. The extracted
///    bits are the least significant bits of operand \a __y with length
///    specified by bits [69:64]. These are inserted into the destination at the
///    index specified by bits [77:72]; all other bits are ignored. If bits
///    [69:64] are zero, the length is interpreted as 64. If the sum of the
///    index and length is greater than 64, the result is undefined. If the
///    length and index are both zero, bits [63:0] of parameter \a __y are
///    inserted into parameter \a __x. If the length is zero but the index is
///    non-zero, the result is undefined.
/// \returns A 128-bit integer vector containing the original lower 64-bits of
///    destination operand \a __x with the specified bitfields replaced by the
///    lower bits of source operand \a __y. The upper 64 bits of the return
///    value are undefined.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_si64(__m128i __x, __m128i __y)
{
  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}

/// Stores a 64-bit double-precision value in a 64-bit memory location.
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
///
/// \param __p
///    The 64-bit memory location used to store the register value.
/// \param __a
///    The 64-bit double-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_sd(void *__p, __m128d __a)
{
  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
}

/// Stores a 32-bit single-precision floating-point value in a 32-bit
///    memory location. To minimize caching, the data is flagged as
///    non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
///
/// \param __p
///    The 32-bit memory location used to store the register value.
/// \param __a
///    The 32-bit single-precision floating-point register value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ss(void *__p, __m128 __a)
{
  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __AMMINTRIN_H */
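A sketch of the immediate EXTRQ form pulling an 8-bit field that starts at bit 12 of the low qword; assumes an AMD target with -msse4a and illustrative names:

#include <ammintrin.h>

unsigned extract_field(__m128i v) {
  __m128i r = _mm_extracti_si64(v, /*len=*/8, /*idx=*/12);
  return (unsigned)_mm_cvtsi128_si32(r) & 0xFFu;  /* field lands in the low bits */
}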
169
third_party/intel/clang/amxcomplexintrin.h
vendored
Normal file
@ -0,0 +1,169 @@
/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AMX_COMPLEXINTRIN_H
#define __AMX_COMPLEXINTRIN_H
#ifdef __x86_64__

#define __DEFAULT_FN_ATTRS_COMPLEX \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))

/// Perform matrix multiplication of two tiles containing complex elements and
///    accumulate the results into a packed single precision tile. Each dword
///    element in input tiles \a a and \a b is interpreted as a complex number
///    with FP16 real part and FP16 imaginary part.
/// Calculates the imaginary part of the result. For each possible combination
///    of (row of \a a, column of \a b), it performs a set of multiplication
///    and accumulations on all corresponding complex numbers (one from \a a
///    and one from \a b). The imaginary part of the \a a element is multiplied
///    with the real part of the corresponding \a b element, and the real part
///    of the \a a element is multiplied with the imaginary part of the
///    corresponding \a b elements. The two accumulated results are added, and
///    then accumulated into the corresponding row and column of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)

/// Perform matrix multiplication of two tiles containing complex elements and
///    accumulate the results into a packed single precision tile. Each dword
///    element in input tiles \a a and \a b is interpreted as a complex number
///    with FP16 real part and FP16 imaginary part.
/// Calculates the real part of the result. For each possible combination
///    of (row of \a a, column of \a b), it performs a set of multiplication
///    and accumulations on all corresponding complex numbers (one from \a a
///    and one from \a b). The real part of the \a a element is multiplied
///    with the real part of the corresponding \a b element, and the negated
///    imaginary part of the \a a element is multiplied with the imaginary
///    part of the corresponding \a b elements. The two accumulated results
///    are added, and then accumulated into the corresponding row and column
///    of \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TCMMRLFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
}

static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the imaginary part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
                               __tile1024i src1) {
  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

/// Perform matrix multiplication of two tiles containing complex elements and
/// accumulate the results into a packed single precision tile. Each dword
/// element in input tiles src0 and src1 is interpreted as a complex number with
/// FP16 real part and FP16 imaginary part.
/// This function calculates the real part of the result.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_COMPLEX
static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
                               __tile1024i src1) {
  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
                                         dst->tile, src0.tile, src1.tile);
}

#endif // __x86_64__
#endif // __AMX_COMPLEXINTRIN_H
58
third_party/intel/clang/amxfp16intrin.h
vendored
Normal file
@ -0,0 +1,58 @@
/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMX_FP16INTRIN_H
#define __AMX_FP16INTRIN_H
#ifdef __x86_64__

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
///    and \a b, accumulating the intermediate single-precision (32-bit)
///    floating-point elements with elements in \a dst, and store the 32-bit
///    result back to tile \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
/// \endcode
///
/// \code{.operation}
/// FOR m := 0 TO dst.rows - 1
///   tmp := dst.row[m]
///   FOR k := 0 TO (a.colsb / 4) - 1
///     FOR n := 0 TO (dst.colsb / 4) - 1
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
///                      FP32(b.row[k].fp16[2*n+0])
///       tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
///                      FP32(b.row[k].fp16[2*n+1])
///     ENDFOR
///   ENDFOR
///   write_row_and_zero(dst, m, tmp, dst.colsb)
/// ENDFOR
/// zero_upper_rows(dst, dst.rows)
/// zero_tileconfig_start()
/// \endcode
///
/// This intrinsic corresponds to the \c TDPFP16PS instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param a
///    The 1st source tile. Max size is 1024 Bytes.
/// \param b
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpfp16ps(dst, a, b) \
  __builtin_ia32_tdpfp16ps(dst, a, b)

#endif /* __x86_64__ */
#endif /* __AMX_FP16INTRIN_H */
524
third_party/intel/clang/amxintrin.h
vendored
Normal file
@ -0,0 +1,524 @@
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===------------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */

#ifndef __AMXINTRIN_H
#define __AMXINTRIN_H
#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))

/// Load tile configuration from a 64-byte memory location specified by
///    "mem_addr". The tile configuration includes the tile type palette, the
///    number of bytes per row, and the number of rows. If the specified
///    palette_id is zero, that signifies the init state for both the tile
///    config and the tile data, and the tiles are zeroed. Any invalid
///    configurations will result in #GP fault.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
/// \param __config
///    A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void *__config) {
  __builtin_ia32_tile_loadconfig(__config);
}

/// Stores the current tile configuration to a 64-byte memory location
///    specified by "mem_addr". The tile configuration includes the tile type
///    palette, the number of bytes per row, and the number of rows. If tiles
///    are not configured, all zeroes will be stored to memory.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
/// \param __config
///    A pointer to 512-bits configuration
static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void *__config) {
  __builtin_ia32_tile_storeconfig(__config);
}

/// Release the tile configuration to return to the init state, which
///    releases all storage it currently holds.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  __builtin_ia32_tilerelease();
}

/// Load tile rows from memory specified by "base" address and "stride" into
///    destination tile "dst" using the tile configuration previously configured
///    via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_loadd(dst, base, stride) \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
                             (__SIZE_TYPE__)(stride))

/// Load tile rows from memory specified by "base" address and "stride" into
///    destination tile "dst" using the tile configuration previously configured
///    via "_tile_loadconfig". This intrinsic provides a hint to the implementation
///    that the data will likely not be reused in the near future and the data
///    caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
#define _tile_stream_loadd(dst, base, stride) \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
                               (__SIZE_TYPE__)(stride))

/// Store the tile specified by "src" to memory specified by "base" address and
///    "stride" using the tile configuration previously configured via
///    "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
#define _tile_stored(dst, base, stride) \
  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))

/// Zero the tile specified by "tdest".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zero. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbssd(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbssd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
|
||||
/// in "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbsud(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbsud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
|
||||
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
|
||||
/// and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbusd(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbusd((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
|
||||
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
|
||||
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
|
||||
/// "dst", and store the 32-bit result back to tile "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbuud(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbuud((dst), (src0), (src1))
|
||||
|
||||
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
|
||||
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
|
||||
/// elements with elements in "dst", and store the 32-bit result back to tile
|
||||
/// "dst".
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
|
||||
///
|
||||
/// \param dst
|
||||
/// The destination tile. Max size is 1024 Bytes.
|
||||
/// \param src0
|
||||
/// The 1st source tile. Max size is 1024 Bytes.
|
||||
/// \param src1
|
||||
/// The 2nd source tile. Max size is 1024 Bytes.
|
||||
#define _tile_dpbf16ps(dst, src0, src1) \
|
||||
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
|
||||
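/* Illustrative sketch (editor's note, not part of the upstream header): a
 * minimal use of the raw _tile_* API above. It assumes tiles 0-2 were shaped
 * by an earlier _tile_loadconfig() and that the OS has enabled AMX tile
 * state; buffer names and strides are hypothetical. Kept under #if 0 so it
 * is never compiled. */
#if 0
static void dot_s8_16x16(const void *a,  /* 16 rows x 64 signed bytes */
                         const void *b,  /* 16 rows x 64 signed bytes */
                         void *c) {      /* 16 rows x 16 int32        */
  _tile_zero(0);           /* clear the int32 accumulator tile */
  _tile_loadd(1, a, 64);   /* load A with a 64-byte row stride */
  _tile_loadd(2, b, 64);   /* load B with a 64-byte row stride */
  _tile_dpbssd(0, 1, 2);   /* tmm0 += s8 x s8 dot products     */
  _tile_stored(0, c, 64);  /* write the 16x16 int32 result     */
}
#endif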

/// AMX tile register size can be configured, the maximum size is 16x64=1024
/// bytes. Since there is no 2D type in LLVM IR, we use a vector type to
/// represent the 2D tile, and the fixed size is the maximum AMX tile register
/// size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                     __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloadd64_internal(m, n, base,
                                             (__SIZE_TYPE__)(stride));
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
                       __SIZE_TYPE__ stride) {
  return __builtin_ia32_tileloaddt164_internal(m, n, base,
                                               (__SIZE_TYPE__)(stride));
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
                      __SIZE_TYPE__ stride, _tile1024i tile) {
  return __builtin_ia32_tilestored64_internal(m, n, base,
                                              (__SIZE_TYPE__)(stride), tile);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}

/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
}

/// This struct packs the shape and tile data together for the user. We
/// suggest initializing the struct as early as possible, because the compiler
/// depends on the shape information to do the configuration. Constant values
/// are preferred for optimization by the compiler.
typedef struct __tile1024i_str {
  const unsigned short row;
  const unsigned short col;
  _tile1024i tile;
} __tile1024i;

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
                                    __SIZE_TYPE__ stride) {
  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}

/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
                                           __SIZE_TYPE__ stride) {
  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
                                     __tile1024i src1) {
  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
                                    src0.tile, src1.tile);
}

/// Store the tile specified by "src" to memory specified by "base" address
/// and "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param base
///    A pointer to the base address.
/// \param stride
///    The stride between the rows' data to be stored in memory.
/// \param src
///    The source tile to be stored. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
                                     __tile1024i src) {
  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
}

/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
///    The destination tile to be zeroed. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static __inline__ void __tile_zero(__tile1024i *dst) {
  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}

/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}

/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_FP16
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
                                       __tile1024i src1) {
  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                      src0.tile, src1.tile);
}

#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16

#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
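The __tile1024i wrappers above carry their shape with them, so a caller sets
row/col once and the compiler derives the tile configuration. A minimal
sketch of the struct-based API, assuming an AMX-enabled toolchain (e.g.
clang with -mamx-tile -mamx-int8) and hypothetical buffer names; this is
illustrative only, not part of the header:

    #include <immintrin.h>

    /* C[16x16 int32] += A[16x64 int8] . B[16x64 int8] */
    static void gemm_tile(const void *A, const void *B, void *C) {
      __tile1024i a = {16, 64};  /* rows, bytes per row     */
      __tile1024i b = {16, 64};
      __tile1024i c = {16, 64};  /* 16 int32 = 64 bytes/row */
      __tile_loadd(&a, A, 64);   /* stride in bytes         */
      __tile_loadd(&b, B, 64);
      __tile_zero(&c);           /* clear the accumulator   */
      __tile_dpbssd(&c, a, b);   /* c += dot(a, b) over s8  */
      __tile_stored(C, 64, c);   /* spill c back to memory  */
    }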
5284
third_party/intel/clang/avx2intrin.h
vendored
Normal file
File diff suppressed because it is too large
283
third_party/intel/clang/avx512bf16intrin.h
vendored
Normal file
@@ -0,0 +1,283 @@
/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H

typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

#define __DEFAULT_FN_ATTRS512 \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bf16,no-evex512")))

/// Convert One BF16 Data to One Single Float Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic does not correspond to a specific instruction.
///
/// \param __A
///    A bfloat data.
/// \returns A float data whose sign field and exponent field are kept
///    unchanged, and whose fraction field is extended to 23 bits.
static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
  return __builtin_ia32_cvtsbf162ss_32(__A);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \param __B
///    A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
                                                    (__v16sf) __B);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \param __B
///    A 512-bit vector of [16 x float].
/// \param __W
///    A 512-bit vector of [32 x bfloat].
/// \param __U
///    A 32-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
                                                (__v32bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \param __B
///    A 512-bit vector of [16 x float].
/// \param __U
///    A 32-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
///    conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
                                                (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
                                                (__v32bf)_mm512_setzero_si512());
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of
///    __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
                                                        (__v16bf)_mm256_undefined_si256(),
                                                        (__mmask16)-1);
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \param __W
///    A 256-bit vector of [16 x bfloat].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of
///    __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
                                                        (__v16bf)__W,
                                                        (__mmask16)__U);
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 512-bit vector of [16 x float].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] that comes from conversion of
///    __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
  return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
                                                        (__v16bf)_mm256_setzero_si256(),
                                                        (__mmask16)__U);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 512-bit vector of [32 x bfloat].
/// \param __B
///    A 512-bit vector of [32 x bfloat].
/// \param __D
///    A 512-bit vector of [16 x float].
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
///    of __A, __B and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
  return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
                                             (__v32bf) __A,
                                             (__v32bf) __B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 512-bit vector of [32 x bfloat].
/// \param __B
///    A 512-bit vector of [32 x bfloat].
/// \param __D
///    A 512-bit vector of [16 x float].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
///    of __A, __B and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
                                             (__v16sf)__D);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 512-bit vector of [32 x bfloat].
/// \param __B
///    A 512-bit vector of [32 x bfloat].
/// \param __D
///    A 512-bit vector of [16 x float].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 512-bit vector of [16 x float] that comes from the dot product
///    of __A, __B and __D.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
                                             (__v16sf)_mm512_setzero_si512());
}

/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of
///    __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
///    A 16-bit mask. Elements are zeroed out when the corresponding mask
///    bit is not set.
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of
///    __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
  return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
      (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
///    A 512-bit vector of [16 x float]. Elements are copied from __S when
///    the corresponding mask bit is not set.
/// \param __U
///    A 16-bit mask.
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] that comes from conversion of
///    __A.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
  return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
      (__m512i)__S, (__mmask16)__U,
      (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
}

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS512

#endif
#endif
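A minimal sketch of how these intrinsics compose, assuming a build with
-mavx512bf16 plus AVX-512F support; function and variable names are
hypothetical, not part of the header:

    #include <immintrin.h>

    /* acc += pairwise bf16 dot products of x and y (32 bf16 lanes each,
       produced here by narrowing the same 16 floats twice). */
    static __m512 bf16_dot(__m512 acc, __m512 x, __m512 y) {
      __m512bh xb = _mm512_cvtne2ps_pbh(x, x);
      __m512bh yb = _mm512_cvtne2ps_pbh(y, y);
      return _mm512_dpbf16_ps(acc, xb, yb);
    }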
86
third_party/intel/clang/avx512bitalgintrin.h
vendored
Normal file
@@ -0,0 +1,86 @@
/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512BITALGINTRIN_H
#define __AVX512BITALGINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512bitalg,evex512"), \
                 __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
                                              (__v32hi) _mm512_popcnt_epi16(__B),
                                              (__v32hi) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
                                  __U,
                                  __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi8(__m512i __A)
{
  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
                                              (__v64qi) _mm512_popcnt_epi8(__B),
                                              (__v64qi) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
                                 __U,
                                 __B);
}

static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
{
  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
                                                         (__v64qi) __B,
                                                         __U);
}

static __inline__ __mmask64 __DEFAULT_FN_ATTRS
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
{
  return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
                                           __A,
                                           __B);
}


#undef __DEFAULT_FN_ATTRS

#endif
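A short usage sketch, assuming a build with -mavx512bitalg; the function
name is hypothetical:

    #include <immintrin.h>

    /* Per-byte population count, zeroing bytes whose mask bit is clear. */
    static __m512i masked_popcnt(__mmask64 keep, __m512i v) {
      return _mm512_maskz_popcnt_epi8(keep, v);
    }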
2014
third_party/intel/clang/avx512bwintrin.h
vendored
Normal file
File diff suppressed because it is too large
125
third_party/intel/clang/avx512cdintrin.h
vendored
Normal file
@@ -0,0 +1,125 @@
/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512CDINTRIN_H
#define __AVX512CDINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512cd,evex512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_conflict_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_conflict_epi64(__A),
                                             (__v8di)_mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi32 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_conflict_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_conflict_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_lzcnt_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_lzcnt_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
  return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_lzcnt_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_lzcnt_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmb_epi64 (__mmask8 __A)
{
  return (__m512i) _mm512_set1_epi64((long long) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcastmw_epi32 (__mmask16 __A)
{
  return (__m512i) _mm512_set1_epi32((int) __A);
}

#undef __DEFAULT_FN_ATTRS

#endif
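A short usage sketch, assuming a build with -mavx512cd plus AVX-512F for the
arithmetic; the function name is hypothetical:

    #include <immintrin.h>

    /* floor(log2(v)) per 32-bit lane as 31 - lzcnt(v); a zero lane
       yields -1. */
    static __m512i ilog2_epi32(__m512i v) {
      return _mm512_sub_epi32(_mm512_set1_epi32(31),
                              _mm512_lzcnt_epi32(v));
    }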
1379
third_party/intel/clang/avx512dqintrin.h
vendored
Normal file
File diff suppressed because it is too large
271
third_party/intel/clang/avx512erintrin.h
vendored
Normal file
@@ -0,0 +1,271 @@
/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H

/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)-1, (int)(R)))

#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
                                       (int)(R)))

#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)(M), (int)(R)))

#define _mm512_exp2a23_pd(A) \
  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_pd(S, M, A) \
  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_pd(M, A) \
  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_exp2a23_round_ps(A, R) \
  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)-1, (int)(R)))

#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
                                      (int)(R)))

#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
  ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)(M), (int)(R)))

#define _mm512_exp2a23_ps(A) \
  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_ps(S, M, A) \
  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_ps(M, A) \
  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
                                          (__mmask8)-1, (int)(R)))

#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(S), (__mmask8)(M), \
                                          (int)(R)))

#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
  ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                          (__v8df)_mm512_setzero_pd(), \
                                          (__mmask8)(M), (int)(R)))

#define _mm512_rsqrt28_pd(A) \
  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_pd(S, M, A) \
  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_rsqrt28_round_ps(A, R) \
  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
                                         (__mmask16)-1, (int)(R)))

#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)(__m512)(S), (__mmask16)(M), \
                                         (int)(R)))

#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
  ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                         (__v16sf)_mm512_setzero_ps(), \
                                         (__mmask16)(M), (int)(R)))

#define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_ps(S, M, A) \
  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, (int)(R)))

#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)(__m128)(S), \
                                               (__mmask8)(M), (int)(R)))

#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
  ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(M), (int)(R)))

#define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_ss(S, M, A, B) \
  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_rsqrt28_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, (int)(R)))

#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)(__m128d)(S), \
                                                (__mmask8)(M), (int)(R)))

#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
  ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(M), (int)(R)))

#define _mm_rsqrt28_sd(A, B) \
  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_sd(S, M, A, B) \
  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_sd(M, A, B) \
  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
                                        (__mmask8)-1, (int)(R)))

#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)(__m512d)(S), (__mmask8)(M), \
                                        (int)(R)))

#define _mm512_maskz_rcp28_round_pd(M, A, R) \
  ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                        (__v8df)_mm512_setzero_pd(), \
                                        (__mmask8)(M), (int)(R)))

#define _mm512_rcp28_pd(A) \
  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_pd(S, M, A) \
  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_pd(M, A) \
  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_rcp28_round_ps(A, R) \
  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
                                       (__mmask16)-1, (int)(R)))

#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)(__m512)(S), (__mmask16)(M), \
                                       (int)(R)))

#define _mm512_maskz_rcp28_round_ps(M, A, R) \
  ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                       (__v16sf)_mm512_setzero_ps(), \
                                       (__mmask16)(M), (int)(R)))

#define _mm512_rcp28_ps(A) \
  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_ps(S, M, A) \
  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_ps(M, A) \
  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)(__m128)(S), \
                                             (__mmask8)(M), (int)(R)))

#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
  ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                             (__v4sf)(__m128)(B), \
                                             (__v4sf)_mm_setzero_ps(), \
                                             (__mmask8)(M), (int)(R)))

#define _mm_rcp28_ss(A, B) \
  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_ss(S, M, A, B) \
  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_ss(M, A, B) \
  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_rcp28_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)_mm_setzero_pd(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)(__m128d)(S), \
                                              (__mmask8)(M), (int)(R)))

#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
  ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v2df)_mm_setzero_pd(), \
                                              (__mmask8)(M), (int)(R)))

#define _mm_rcp28_sd(A, B) \
  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_sd(S, M, A, B) \
  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_sd(M, A, B) \
  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#endif /* __AVX512ERINTRIN_H */
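A short usage sketch, assuming a build with -mavx512er (Xeon Phi class
hardware); function names are hypothetical:

    #include <immintrin.h>

    /* 28-bit-accurate reciprocal square root; the masked form leaves
       lanes with a clear mask bit equal to the passthrough s. */
    static __m512 fast_rsqrt(__m512 v) { return _mm512_rsqrt28_ps(v); }
    static __m512 fast_rsqrt_sel(__m512 s, __mmask16 m, __m512 v) {
      return _mm512_mask_rsqrt28_ps(s, m, v);
    }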
9779
third_party/intel/clang/avx512fintrin.h
vendored
Normal file
File diff suppressed because it is too large
3352
third_party/intel/clang/avx512fp16intrin.h
vendored
Normal file
File diff suppressed because it is too large
70
third_party/intel/clang/avx512ifmaintrin.h
vendored
Normal file
@@ -0,0 +1,70 @@
/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __IFMAINTRIN_H
#define __IFMAINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512ifma,evex512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
                                                (__v8di) __Z);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
                                                (__v8di) __Z);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
                                             (__v8di)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif
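A short usage sketch, assuming a build with -mavx512ifma; names are
hypothetical:

    #include <immintrin.h>

    /* One step of 52-bit multiword multiply-accumulate: the low and high
       52-bit halves of x*y are added into separate accumulators. */
    static void madd52_step(__m512i *lo, __m512i *hi, __m512i x, __m512i y) {
      *lo = _mm512_madd52lo_epu64(*lo, x, y);
      *hi = _mm512_madd52hi_epu64(*hi, x, y);
    }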
111
third_party/intel/clang/avx512ifmavlintrin.h
vendored
Normal file
@@ -0,0 +1,111 @@
/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
|
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __IFMAVLINTRIN_H
#define __IFMAVLINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512ifma,avx512vl,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512ifma,avx512vl,no-evex512"), \
                 __min_vector_width__(256)))

#define _mm_madd52hi_epu64(X, Y, Z) \
  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
                                          (__v2di)(Z)))

#define _mm256_madd52hi_epu64(X, Y, Z) \
  ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
                                          (__v4di)(Z)))

#define _mm_madd52lo_epu64(X, Y, Z) \
  ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
                                          (__v2di)(Z)))

#define _mm256_madd52lo_epu64(X, Y, Z) \
  ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
                                          (__v4di)(Z)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__m128i)__builtin_ia32_selectq_128(__M,
                                    (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
                                    (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
  return (__m128i)__builtin_ia32_selectq_128(__M,
                                    (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
                                    (__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
  return (__m256i)__builtin_ia32_selectq_256(__M,
                                 (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
                                 (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
  return (__m256i)__builtin_ia32_selectq_256(__M,
                                 (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
                                 (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
  return (__m128i)__builtin_ia32_selectq_128(__M,
                                    (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
                                    (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
  return (__m128i)__builtin_ia32_selectq_128(__M,
                                    (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
                                    (__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
  return (__m256i)__builtin_ia32_selectq_256(__M,
                                 (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
                                 (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
  return (__m256i)__builtin_ia32_selectq_256(__M,
                                 (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
                                 (__v4di)_mm256_setzero_si256());
}


#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
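Usage note (not part of the diff): the IFMA intrinsics above compute 52-bit multiply-accumulates per 64-bit lane, which is handy for bignum limb arithmetic. A minimal sketch, assuming a CPU with AVX512IFMA+AVX512VL and a build with -mavx512ifma -mavx512vl; the function name is illustrative:

#include <immintrin.h>

/* acc + (low 52 bits of a*b), independently in each 64-bit lane */
static inline __m128i fma52lo_demo(__m128i acc, __m128i a, __m128i b) {
  return _mm_madd52lo_epu64(acc, a, b);
}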
92
third_party/intel/clang/avx512pfintrin.h
vendored
Normal file
@ -0,0 +1,92 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512PFINTRIN_H
#define __AVX512PFINTRIN_H

#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                             (void const *)(addr), (int)(scale), \
                             (int)(hint))

#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
                             (void const *)(addr), (int)(scale), \
                             (int)(hint))

#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
                             (__v16si)(__m512i)(index), (void const *)(addr), \
                             (int)(scale), (int)(hint))

#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
  __builtin_ia32_gatherpfdps((__mmask16) -1, \
                             (__v16si)(__m512i)(index), (void const *)(addr), \
                             (int)(scale), (int)(hint))

#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                             (void const *)(addr), (int)(scale), \
                             (int)(hint))

#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
                             (void const *)(addr), (int)(scale), \
                             (int)(hint))

#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
                             (void const *)(addr), (int)(scale), (int)(hint))

#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
                             (void const *)(addr), (int)(scale), (int)(hint))

#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
                              (void *)(addr), (int)(scale), \
                              (int)(hint))

#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                              (void *)(addr), (int)(scale), \
                              (int)(hint))

#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
                              (void *)(addr), (int)(scale), (int)(hint))

#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
                              (__v16si)(__m512i)(index), (void *)(addr), \
                              (int)(scale), (int)(hint))

#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), \
                              (int)(hint))

#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), \
                              (int)(hint))

#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), (int)(hint))

#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), (int)(hint))

#endif
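Usage note (not part of the diff): these prefetch gather/scatter hints only execute on AVX512PF hardware (the Xeon Phi line); elsewhere the macros compile but the instructions are unavailable. A minimal sketch under that assumption, built with -mavx512pf; the function name is illustrative:

#include <immintrin.h>

/* hint the gather sources for eight doubles into L1 before a real gather */
static inline void prefetch_gather_rows(const double *base, __m256i idx) {
  _mm512_prefetch_i32gather_pd(idx, base, 8, _MM_HINT_T0);
}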
357
third_party/intel/clang/avx512vbmi2intrin.h
vendored
Normal file
@ -0,0 +1,357 @@
/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VBMI2INTRIN_H
#define __AVX512VBMI2INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))


static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
              (__v32hi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
              (__v32hi) _mm512_setzero_si512(),
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
              (__v64qi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
              (__v64qi) _mm512_setzero_si512(),
              __U);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
{
  __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
              __U);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
{
  __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
              (__v32hi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
              (__v32hi) _mm512_setzero_si512(),
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
              (__v64qi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
              (__v64qi) _mm512_setzero_si512(),
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
              (__v32hi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
              (__v32hi) _mm512_setzero_si512(),
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
              (__v64qi) __S,
              __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
              (__v64qi) _mm512_setzero_si512(),
              __U);
}

#define _mm512_shldi_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
                                      (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
                                    (__v8di)(__m512i)(S)))

#define _mm512_maskz_shldi_epi64(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
                                    (__v8di)_mm512_setzero_si512()))

#define _mm512_shldi_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
                                      (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
                                   (__v16si)(__m512i)(S)))

#define _mm512_maskz_shldi_epi32(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
                                   (__v16si)_mm512_setzero_si512()))

#define _mm512_shldi_epi16(A, B, I) \
  ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
                                      (__v32hi)(__m512i)(B), (int)(I)))

#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
                                   (__v32hi)(__m512i)(S)))

#define _mm512_maskz_shldi_epi16(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
                                   (__v32hi)_mm512_setzero_si512()))

#define _mm512_shrdi_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
                                      (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
                                    (__v8di)(__m512i)(S)))

#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
                                    (__v8di)_mm512_setzero_si512()))

#define _mm512_shrdi_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
                                      (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
                                   (__v16si)(__m512i)(S)))

#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
                                   (__v16si)_mm512_setzero_si512()))

#define _mm512_shrdi_epi16(A, B, I) \
  ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
                                      (__v32hi)(__m512i)(B), (int)(I)))

#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
                                   (__v32hi)(__m512i)(S)))

#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
  ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
                                   (__v32hi)_mm512_setzero_si512()))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
                                             (__v8di)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                     (__v8di)_mm512_shldv_epi64(__A, __B, __C),
                                     (__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                     (__v8di)_mm512_shldv_epi64(__A, __B, __C),
                                     (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
                                             (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                    (__v16si)_mm512_shldv_epi32(__A, __B, __C),
                                    (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                    (__v16si)_mm512_shldv_epi32(__A, __B, __C),
                                    (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
                                             (__v32hi)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectw_512(__U,
                                    (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
                                    (__v32hi)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectw_512(__U,
                                    (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
                                    (__v32hi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
                                             (__v8di)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                     (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
                                     (__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                     (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
                                     (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
                                             (__v16si)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
{
  return (__m512i) __builtin_ia32_selectd_512(__U,
                                    (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
                                    (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i) __builtin_ia32_selectd_512(__U,
                                    (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
                                    (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
                                             (__v32hi)__C);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectw_512(__U,
                                    (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
                                    (__v32hi)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
{
  return (__m512i)__builtin_ia32_selectw_512(__U,
                                    (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
                                    (__v32hi)_mm512_setzero_si512());
}


#undef __DEFAULT_FN_ATTRS

#endif
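Usage note (not part of the diff): the VBMI2 compress/expand intrinsics generalize left-packing of selected lanes. A minimal sketch, assuming AVX512VBMI2 hardware and -mavx512vbmi2; the function name is illustrative:

#include <immintrin.h>

/* pack the 16-bit lanes selected by `keep` toward element 0, zero the rest */
static inline __m512i left_pack_epi16(__m512i v, __mmask32 keep) {
  return _mm512_maskz_compress_epi16(keep, v);
}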
106
third_party/intel/clang/avx512vbmiintrin.h
vendored
Normal file
@ -0,0 +1,106 @@
/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __VBMIINTRIN_H
#define __VBMIINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vbmi,evex512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
                                                 (__v64qi) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
                              __m512i __B)
{
  return (__m512i)__builtin_ia32_selectb_512(__U,
                              (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                              (__v64qi)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectb_512(__U,
                              (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                              (__v64qi)__I);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectb_512(__U,
                              (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
                              (__v64qi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                  (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                  (__v64qi)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
                              __m512i __B)
{
  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                                  (__v64qi)_mm512_permutexvar_epi8(__A, __B),
                                  (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
                                  __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                              (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                              (__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
                              (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
                              (__v64qi)_mm512_setzero_si512());
}


#undef __DEFAULT_FN_ATTRS

#endif
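Usage note (not part of the diff): _mm512_permutexvar_epi8 is effectively a 64-entry byte lookup table in one instruction. A minimal sketch, assuming AVX512VBMI hardware and -mavx512vbmi; the function name is illustrative:

#include <immintrin.h>

/* each byte of idx (taken mod 64) selects a byte of table */
static inline __m512i byte_lut64(__m512i table, __m512i idx) {
  return _mm512_permutexvar_epi8(idx, table);
}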
193
third_party/intel/clang/avx512vbmivlintrin.h
vendored
Normal file
@ -0,0 +1,193 @@
/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __VBMIVLINTRIN_H
#define __VBMIVLINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vbmi,avx512vl,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vbmi,avx512vl,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
{
  return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
                                                 (__v16qi)__I,
                                                 (__v16qi)__B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
                           __m128i __B)
{
  return (__m128i)__builtin_ia32_selectb_128(__U,
                                 (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                 (__v16qi)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
                            __m128i __B)
{
  return (__m128i)__builtin_ia32_selectb_128(__U,
                                 (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                 (__v16qi)__I);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
                            __m128i __B)
{
  return (__m128i)__builtin_ia32_selectb_128(__U,
                                 (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
                                 (__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
{
  return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
                                                 (__v32qi)__B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
                              __m256i __B)
{
  return (__m256i)__builtin_ia32_selectb_256(__U,
                              (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                              (__v32qi)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
                               __m256i __B)
{
  return (__m256i)__builtin_ia32_selectb_256(__U,
                              (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                              (__v32qi)__I);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
                               __m256i __B)
{
  return (__m256i)__builtin_ia32_selectb_256(__U,
                              (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
                              (__v32qi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                     (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                     (__v16qi)_mm_setzero_si128());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
                           __m128i __B)
{
  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                     (__v16qi)_mm_permutexvar_epi8(__A, __B),
                                     (__v16qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
                               __m256i __B)
{
  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                  (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                  (__v32qi)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
                              __m256i __B)
{
  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                                  (__v32qi)_mm256_permutexvar_epi8(__A, __B),
                                  (__v32qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
{
  return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
                               __m128i __Y)
{
  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                 (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
                                 (__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
{
  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
                                 (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
                                 (__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
{
  return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
                                  __m256i __Y)
{
  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                              (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
                              (__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
{
  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
                              (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
                              (__v32qi)_mm256_setzero_si256());
}


#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
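Usage note (not part of the diff): the two-source permute doubles the table to 32 bytes at 128-bit width. A minimal sketch, assuming AVX512VBMI+AVX512VL; the function name is illustrative:

#include <immintrin.h>

/* bytes of idx select from the 32-byte table formed by lo (0-15) and hi (16-31) */
static inline __m128i byte_lut32(__m128i lo, __m128i hi, __m128i idx) {
  return _mm_permutex2var_epi8(lo, idx, hi);
}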
517
third_party/intel/clang/avx512vlbf16intrin.h
vendored
Normal file
@ -0,0 +1,517 @@
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bf16,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bf16,no-evex512"), \
                 __min_vector_width__(256)))

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \param __B
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
                                                    (__v4sf) __B);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \param __B
///    A 128-bit vector of [4 x float].
/// \param __W
///    A 128-bit vector of [8 x bfloat].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                            (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                            (__v8bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \param __B
///    A 128-bit vector of [4 x float].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
                                            (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
                                            (__v8bf)_mm_setzero_si128());
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \param __B
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
                                                    (__v8sf) __B);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \param __B
///    A 256-bit vector of [8 x float].
/// \param __W
///    A 256-bit vector of [16 x bfloat].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element from __W.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                         (__v16bf)__W);
}

/// Convert Two Packed Single Data to One Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \param __B
///    A 256-bit vector of [8 x float].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A or __B. A 0 means element is zero.
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
///    conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
                                         (__v16bf)_mm256_setzero_si256());
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __A, and higher 64 bits are 0.
#define _mm_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \param __W
///    A 128-bit vector of [8 x bfloat].
/// \param __U
///    A 4-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \param __U
///    A 4-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    conversion of __A, and higher 64 bits are 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
                                                    (__v8bf)_mm_setzero_si128(),
                                                    (__mmask8)__U);
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
#define _mm256_cvtneps_pbh(A) \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \param __W
///    A 256-bit vector of [8 x bfloat].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element from __W.
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                        (__v8bf)__W,
                                                        (__mmask8)__U);
}

/// Convert Packed Single Data to Packed BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means conversion of __A. A 0 means element is zero.
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
                                                    (__v8bf)_mm_setzero_si128(),
                                                    (__mmask8)__U);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \param __B
///    A 128-bit vector of [8 x bfloat].
/// \param __D
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
                                             (__v8bf)__A,
                                             (__v8bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \param __B
///    A 128-bit vector of [8 x bfloat].
/// \param __D
///    A 128-bit vector of [4 x float].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                       (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
                                       (__v4sf)__D);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \param __B
///    A 128-bit vector of [8 x bfloat].
/// \param __D
///    A 128-bit vector of [4 x float].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                       (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
                                       (__v4sf)_mm_setzero_si128());
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \param __B
///    A 256-bit vector of [16 x bfloat].
/// \param __D
///    A 256-bit vector of [8 x float].
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
                                             (__v16bf)__A,
                                             (__v16bf)__B);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \param __B
///    A 256-bit vector of [16 x bfloat].
/// \param __D
///    A 256-bit vector of [8 x float].
/// \param __U
///    A 16-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                    (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
                                    (__v8sf)__D);
}

/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
///
/// \param __A
///    A 256-bit vector of [16 x bfloat].
/// \param __B
///    A 256-bit vector of [16 x bfloat].
/// \param __D
///    A 256-bit vector of [8 x float].
/// \param __U
///    A 8-bit mask value specifying what is chosen for each element.
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
///  __A, __B and __D
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
                                    (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
                                    (__v8sf)_mm256_setzero_si256());
}

/// Convert One Single float Data to One BF16 Data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param __A
///    A float data.
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
///    and fraction field is truncated to 7 bits.
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
  __v4sf __V = {__A, 0, 0, 0};
  __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
      (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
  return (__bf16)__R[0];
}

/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
///    A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
  return _mm_castsi128_ps(
      (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
      (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
///    A 4-bit mask. Elements are zeroed out when the corresponding mask
///    bit is not set.
/// \param __A
///    A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
  return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
      (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
///    A 8-bit mask. Elements are zeroed out when the corresponding mask
///    bit is not set.
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
      (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}

/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
///    A 128-bit vector of [4 x float]. Elements are copied from __S when
///    the corresponding mask bit is not set.
/// \param __U
///    A 4-bit mask. Elements are zeroed out when the corresponding mask
///    bit is not set.
/// \param __A
///    A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
  return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
      (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
      16));
}

/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
///    A 256-bit vector of [8 x float]. Elements are copied from __S when
///    the corresponding mask bit is not set.
/// \param __U
///    A 8-bit mask. Elements are zeroed out when the corresponding mask
///    bit is not set.
/// \param __A
///    A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
  return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
      (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
      16));
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
#endif
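Usage note (not part of the diff): VDPBF16PS accumulates dot products of adjacent bf16 pairs into packed floats. A minimal sketch, assuming AVX512BF16+AVX512VL hardware and -mavx512bf16 -mavx512vl; the function name is illustrative:

#include <immintrin.h>

/* round x/y down to bf16 pairs, then acc[i] += pairwise dot of those pairs */
static inline __m128 bf16_dot_step(__m128 acc, __m128 x_hi, __m128 x_lo,
                                   __m128 y_hi, __m128 y_lo) {
  __m128bh x = _mm_cvtne2ps_pbh(x_hi, x_lo);
  __m128bh y = _mm_cvtne2ps_pbh(y_hi, y_lo);
  return _mm_dpbf16_ps(acc, x, y);
}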
151
third_party/intel/clang/avx512vlbitalgintrin.h
vendored
Normal file
@ -0,0 +1,151 @@
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLBITALGINTRIN_H
#define __AVX512VLBITALGINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bitalg,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512bitalg,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
              (__v16hi) _mm256_popcnt_epi16(__B),
              (__v16hi) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
              __U,
              __B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
              (__v8hi) _mm_popcnt_epi16(__B),
              (__v8hi) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
  return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
              __U,
              __B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
              (__v32qi) _mm256_popcnt_epi8(__B),
              (__v32qi) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
              __U,
              __B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
              (__v16qi) _mm_popcnt_epi8(__B),
              (__v16qi) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
  return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
              __U,
              __B);
}

static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
              (__v32qi) __B,
              __U);
}

static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
{
  return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
              __A,
              __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
              (__v16qi) __B,
              __U);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
{
  return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
              __A,
              __B);
}


#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
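Usage note (not part of the diff): per-element popcounts make Hamming distances one xor away. A minimal sketch, assuming AVX512BITALG+AVX512VL hardware; the function name is illustrative:

#include <immintrin.h>

/* per-byte Hamming distance between a and b */
static inline __m256i hamming_bytes(__m256i a, __m256i b) {
  return _mm256_popcnt_epi8(_mm256_xor_si256(a, b));
}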
3167
third_party/intel/clang/avx512vlbwintrin.h
vendored
Normal file
File diff suppressed because it is too large
230
third_party/intel/clang/avx512vlcdintrin.h
vendored
Normal file
@ -0,0 +1,230 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLCDINTRIN_H
#define __AVX512VLCDINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512cd,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512cd,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmb_epi64 (__mmask8 __A)
{
  return (__m128i) _mm_set1_epi64x((long long) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
  return (__m256i) _mm256_set1_epi64x((long long)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmw_epi32 (__mmask16 __A)
{
  return (__m128i) _mm_set1_epi32((int)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
  return (__m256i) _mm256_set1_epi32((int)__A);
}


static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
  return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_conflict_epi64(__A),
                                             (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
                                             (__v2di)_mm_conflict_epi64(__A),
                                             (__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
  return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                            (__v4di)_mm256_conflict_epi64(__A),
                                            (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
                                            (__v4di)_mm256_conflict_epi64(__A),
                                            (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
  return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
                                             (__v4si)_mm_conflict_epi32(__A),
                                             (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_conflict_epi32(__A),
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_conflict_epi32 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_conflict_epi32(__A),
|
||||
(__v8si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_conflict_epi32(__A),
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_lzcnt_epi32 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_lzcnt_epi32(__A),
|
||||
(__v4si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
|
||||
(__v4si)_mm_lzcnt_epi32(__A),
|
||||
(__v4si)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_lzcnt_epi32 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_lzcnt_epi32(__A),
|
||||
(__v8si)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
|
||||
(__v8si)_mm256_lzcnt_epi32(__A),
|
||||
(__v8si)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_lzcnt_epi64 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_lzcnt_epi64(__A),
|
||||
(__v2di)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
|
||||
(__v2di)_mm_lzcnt_epi64(__A),
|
||||
(__v2di)_mm_setzero_si128());
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_lzcnt_epi64 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_lzcnt_epi64(__A),
|
||||
(__v4di)__W);
|
||||
}
|
||||
|
||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
|
||||
(__v4di)_mm256_lzcnt_epi64(__A),
|
||||
(__v4di)_mm256_setzero_si256());
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS128
|
||||
#undef __DEFAULT_FN_ATTRS256
|
||||
|
||||
#endif /* __AVX512VLCDINTRIN_H */
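As a hedged illustration of the conflict-detection intrinsics defined in this header (sample values invented here; compile with -mavx512vl -mavx512cd), each result lane holds a bitmask of the earlier lanes that contain the same value:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi32(7, 1, 7, 3);          /* lanes 0 and 2 collide */
  __m128i conflicts = _mm_conflict_epi32(v);
  printf("%d\n", _mm_extract_epi32(conflicts, 2)); /* prints 1: bit 0 set, lane 2 repeats lane 0 */
  return 0;
}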
1173
third_party/intel/clang/avx512vldqintrin.h
vendored
Normal file
File diff suppressed because it is too large
2071
third_party/intel/clang/avx512vlfp16intrin.h
vendored
Normal file
File diff suppressed because it is too large
8437
third_party/intel/clang/avx512vlintrin.h
vendored
Normal file
File diff suppressed because it is too large
695
third_party/intel/clang/avx512vlvbmi2intrin.h
vendored
Normal file
@ -0,0 +1,695 @@
/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLVBMI2INTRIN_H
#define __AVX512VLVBMI2INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vbmi2,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vbmi2,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
                                                      (__v8hi) __S,
                                                      __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
                                                      (__v8hi) _mm_setzero_si128(),
                                                      __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
                                                      (__v16qi) __S,
                                                      __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
                                                      (__v16qi) _mm_setzero_si128(),
                                                      __U);
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
                                          __U);
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
                                          __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
                                                    (__v8hi) __S,
                                                    __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
                                                    (__v8hi) _mm_setzero_si128(),
                                                    __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
                                                    (__v16qi) __S,
                                                    __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
                                                    (__v16qi) _mm_setzero_si128(),
                                                    __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
                                                        (__v8hi) __S,
                                                        __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
                                                        (__v8hi) _mm_setzero_si128(),
                                                        __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
                                                        (__v16qi) __S,
                                                        __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
                                                        (__v16qi) _mm_setzero_si128(),
                                                        __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
                                                      (__v16hi) __S,
                                                      __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
                                                      (__v16hi) _mm256_setzero_si256(),
                                                      __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
                                                      (__v32qi) __S,
                                                      __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
                                                      (__v32qi) _mm256_setzero_si256(),
                                                      __U);
}

static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
                                          __U);
}

static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
                                          __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
                                                    (__v16hi) __S,
                                                    __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
                                                    (__v16hi) _mm256_setzero_si256(),
                                                    __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
                                                    (__v32qi) __S,
                                                    __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
                                                    (__v32qi) _mm256_setzero_si256(),
                                                    __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
                                                        (__v16hi) __S,
                                                        __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
                                                        (__v16hi) _mm256_setzero_si256(),
                                                        __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
                                                        (__v32qi) __S,
                                                        __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
                                                        (__v32qi) _mm256_setzero_si256(),
                                                        __U);
}

#define _mm256_shldi_epi64(A, B, I) \
  ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
                                      (__v4di)(__m256i)(B), (int)(I)))

#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
                                       (__v4di)(__m256i)(S)))

#define _mm256_maskz_shldi_epi64(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
                                       (__v4di)_mm256_setzero_si256()))

#define _mm_shldi_epi64(A, B, I) \
  ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
                                      (__v2di)(__m128i)(B), (int)(I)))

#define _mm_mask_shldi_epi64(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                       (__v2di)_mm_shldi_epi64((A), (B), (I)), \
                                       (__v2di)(__m128i)(S)))

#define _mm_maskz_shldi_epi64(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                       (__v2di)_mm_shldi_epi64((A), (B), (I)), \
                                       (__v2di)_mm_setzero_si128()))

#define _mm256_shldi_epi32(A, B, I) \
  ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
                                      (__v8si)(__m256i)(B), (int)(I)))

#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
                                       (__v8si)(__m256i)(S)))

#define _mm256_maskz_shldi_epi32(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
                                       (__v8si)_mm256_setzero_si256()))

#define _mm_shldi_epi32(A, B, I) \
  ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
                                      (__v4si)(__m128i)(B), (int)(I)))

#define _mm_mask_shldi_epi32(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shldi_epi32((A), (B), (I)), \
                                       (__v4si)(__m128i)(S)))

#define _mm_maskz_shldi_epi32(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shldi_epi32((A), (B), (I)), \
                                       (__v4si)_mm_setzero_si128()))

#define _mm256_shldi_epi16(A, B, I) \
  ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
                                      (__v16hi)(__m256i)(B), (int)(I)))

#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
                                       (__v16hi)(__m256i)(S)))

#define _mm256_maskz_shldi_epi16(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
                                       (__v16hi)_mm256_setzero_si256()))

#define _mm_shldi_epi16(A, B, I) \
  ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
                                      (__v8hi)(__m128i)(B), (int)(I)))

#define _mm_mask_shldi_epi16(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
                                       (__v8hi)(__m128i)(S)))

#define _mm_maskz_shldi_epi16(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
                                       (__v8hi)_mm_setzero_si128()))

#define _mm256_shrdi_epi64(A, B, I) \
  ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
                                      (__v4di)(__m256i)(B), (int)(I)))

#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
                                       (__v4di)(__m256i)(S)))

#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                       (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
                                       (__v4di)_mm256_setzero_si256()))

#define _mm_shrdi_epi64(A, B, I) \
  ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
                                      (__v2di)(__m128i)(B), (int)(I)))

#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                       (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
                                       (__v2di)(__m128i)(S)))

#define _mm_maskz_shrdi_epi64(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
                                       (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
                                       (__v2di)_mm_setzero_si128()))

#define _mm256_shrdi_epi32(A, B, I) \
  ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
                                      (__v8si)(__m256i)(B), (int)(I)))

#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
                                       (__v8si)(__m256i)(S)))

#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
                                       (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
                                       (__v8si)_mm256_setzero_si256()))

#define _mm_shrdi_epi32(A, B, I) \
  ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
                                      (__v4si)(__m128i)(B), (int)(I)))

#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
                                       (__v4si)(__m128i)(S)))

#define _mm_maskz_shrdi_epi32(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                       (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
                                       (__v4si)_mm_setzero_si128()))

#define _mm256_shrdi_epi16(A, B, I) \
  ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
                                      (__v16hi)(__m256i)(B), (int)(I)))

#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
                                       (__v16hi)(__m256i)(S)))

#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
                                       (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
                                       (__v16hi)_mm256_setzero_si256()))

#define _mm_shrdi_epi16(A, B, I) \
  ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
                                      (__v8hi)(__m128i)(B), (int)(I)))

#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
                                       (__v8hi)(__m128i)(S)))

#define _mm_maskz_shrdi_epi16(U, A, B, I) \
  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
                                       (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
                                       (__v8hi)_mm_setzero_si128()))

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
                                             (__v4di)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectq_256(__U,
                                             (__v4di)_mm256_shldv_epi64(__A, __B, __C),
                                             (__v4di)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectq_256(__U,
                                             (__v4di)_mm256_shldv_epi64(__A, __B, __C),
                                             (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
                                             (__v2di)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectq_128(__U,
                                             (__v2di)_mm_shldv_epi64(__A, __B, __C),
                                             (__v2di)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectq_128(__U,
                                             (__v2di)_mm_shldv_epi64(__A, __B, __C),
                                             (__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
                                             (__v8si)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_shldv_epi32(__A, __B, __C),
                                             (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_shldv_epi32(__A, __B, __C),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
                                             (__v4si)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_shldv_epi32(__A, __B, __C),
                                             (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_shldv_epi32(__A, __B, __C),
                                             (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
                                             (__v16hi)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectw_256(__U,
                                             (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
                                             (__v16hi)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectw_256(__U,
                                             (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
                                             (__v16hi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
                                             (__v8hi)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectw_128(__U,
                                             (__v8hi)_mm_shldv_epi16(__A, __B, __C),
                                             (__v8hi)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectw_128(__U,
                                             (__v8hi)_mm_shldv_epi16(__A, __B, __C),
                                             (__v8hi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
                                             (__v4di)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectq_256(__U,
                                             (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
                                             (__v4di)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectq_256(__U,
                                             (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
                                             (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
                                             (__v2di)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectq_128(__U,
                                             (__v2di)_mm_shrdv_epi64(__A, __B, __C),
                                             (__v2di)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectq_128(__U,
                                             (__v2di)_mm_shrdv_epi64(__A, __B, __C),
                                             (__v2di)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
                                             (__v8si)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
                                             (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
                                             (__v4si)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_shrdv_epi32(__A, __B, __C),
                                             (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_shrdv_epi32(__A, __B, __C),
                                             (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
                                             (__v16hi)__C);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectw_256(__U,
                                             (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
                                             (__v16hi)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
{
  return (__m256i)__builtin_ia32_selectw_256(__U,
                                             (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
                                             (__v16hi)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
                                             (__v8hi)__C);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectw_128(__U,
                                             (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
                                             (__v8hi)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
{
  return (__m128i)__builtin_ia32_selectw_128(__U,
                                             (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
                                             (__v8hi)_mm_setzero_si128());
}


#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
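A hedged sketch of the concat-and-shift ("funnel shift") intrinsics defined above, with made-up values (compile with -mavx512vl -mavx512vbmi2): _mm_shldi_epi64(A, B, I) shifts each 128-bit A:B lane pair left by I and keeps the upper 64 bits.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i hi = _mm_set1_epi64x(0x1);
  __m128i lo = _mm_set1_epi64x((long long)0x8000000000000000ULL);
  __m128i r = _mm_shldi_epi64(hi, lo, 1);   /* upper half of (hi:lo << 1) == 0x3 */
  printf("%llx\n", (unsigned long long)_mm_extract_epi64(r, 0));  /* prints 3 */
  return 0;
}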
310
third_party/intel/clang/avx512vlvnniintrin.h
vendored
Normal file
@ -0,0 +1,310 @@
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLVNNIINTRIN_H
#define __AVX512VLVNNIINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vnni,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vnni,no-evex512"), \
                 __min_vector_width__(256)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///   tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///   tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///   tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///   tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///   DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
                                             (__v8si)__S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
                                             (__v8si)__S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
                                             (__v8si)__S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
                                             (__v8si)__S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_selectd_256(__U,
                                             (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
                                             (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
                                             (__v4si)__S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
                                             (__v4si)_mm_setzero_si128());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
                                             (__v4si)__S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
                                             (__v4si)_mm_setzero_si128());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
                                             (__v4si)__S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
                                             (__v4si)_mm_setzero_si128());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
                                             (__v4si)__S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_selectd_128(__U,
                                             (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
                                             (__v4si)_mm_setzero_si128());
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
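As a sketch (data invented; compile with -mavx512vl -mavx512vnni), the u8-by-s8 dot-product intrinsics above accumulate four byte products into each 32-bit lane, which is the building block for int8 matrix kernels:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i acc = _mm_setzero_si128();
  __m128i a = _mm_set1_epi8(2);              /* read as unsigned bytes */
  __m128i b = _mm_set1_epi8(3);              /* read as signed bytes */
  __m128i r = _mm_dpbusd_epi32(acc, a, b);   /* each dword: 4 * (2*3) = 24 */
  printf("%d\n", _mm_extract_epi32(r, 0));   /* prints 24 */
  return 0;
}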
123
third_party/intel/clang/avx512vlvp2intersectintrin.h
vendored
Normal file
@ -0,0 +1,123 @@
/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VLVP2INTERSECT_H
#define _AVX512VLVP2INTERSECT_H

#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vp2intersect,no-evex512"), \
                 __min_vector_width__(128)))

#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vp2intersect,no-evex512"), \
                 __min_vector_width__(256)))
/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
///    A 256-bit vector of [8 x i32].
/// \param __b
///    A 256-bit vector of [8 x i32].
/// \param __m0
///    A pointer to an 8-bit mask.
/// \param __m1
///    A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
  __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
}

/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
///    A 256-bit vector of [4 x i64].
/// \param __b
///    A 256-bit vector of [4 x i64].
/// \param __m0
///    A pointer to an 8-bit mask.
/// \param __m1
///    A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
  __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
}

/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x i32].
/// \param __b
///    A 128-bit vector of [4 x i32].
/// \param __m0
///    A pointer to an 8-bit mask.
/// \param __m1
///    A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
  __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
}

/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x i64].
/// \param __b
///    A 128-bit vector of [2 x i64].
/// \param __m0
///    A pointer to an 8-bit mask.
/// \param __m1
///    A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
  __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
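A hedged usage sketch (values invented; compile with -mavx512vl -mavx512vp2intersect): the intrinsics above write two masks marking, per lane of __a and __b respectively, whether that lane's value occurs anywhere in the other operand.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi32(1, 2, 3, 4);
  __m128i b = _mm_setr_epi32(9, 3, 8, 1);
  __mmask8 ma, mb;
  _mm_2intersect_epi32(a, b, &ma, &mb);
  printf("ma=%#x mb=%#x\n", ma, mb);  /* ma=0x5 (a lanes 0,2), mb=0xa (b lanes 1,3) */
  return 0;
}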
116
third_party/intel/clang/avx512vnniintrin.h
vendored
Normal file
@ -0,0 +1,116 @@
/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VNNIINTRIN_H
#define __AVX512VNNIINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vnni,evex512"), __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
                                             (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
                                             (__v16si)__S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
                                              (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
                                             (__v16si)__S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
                                             (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
                                             (__v16si)__S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
                                              (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
                                             (__v16si)__S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

#undef __DEFAULT_FN_ATTRS

#endif
|
||||
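A minimal usage sketch for the VNNI header above (not part of the diff), assuming a compiler that accepts -mavx512vnni; the demo values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavx512vnni demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Accumulate dot products of groups of 4 unsigned bytes (from a)
  // with 4 signed bytes (from b) into 32-bit accumulators.
  __m512i acc = _mm512_setzero_si512();
  __m512i a = _mm512_set1_epi8(2);          // unsigned operand
  __m512i b = _mm512_set1_epi8(-3);         // signed operand
  acc = _mm512_dpbusd_epi32(acc, a, b);     // each lane: 4 * (2 * -3) = -24
  int out[16];
  _mm512_storeu_si512(out, acc);
  printf("%d\n", out[0]);                   // expected: -24
  return 0;
}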
78
third_party/intel/clang/avx512vp2intersectintrin.h
vendored
Normal file
@@ -0,0 +1,78 @@
/*===------- avx512vp2intersectintrin.h - VP2INTERSECT intrinsics ----------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVX512VP2INTERSECT_H
#define _AVX512VP2INTERSECT_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vp2intersect,evex512"), \
                 __min_vector_width__(512)))

/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between dwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
///
/// \param __a
///    A 512-bit vector of [16 x i32].
/// \param __b
///    A 512-bit vector of [16 x i32].
/// \param __m0
///    A pointer to a 16-bit mask.
/// \param __m1
///    A pointer to a 16-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
  __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
}

/// Store, in an even/odd pair of mask registers, the indicators of the
/// locations of value matches between quadwords in operands __a and __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
///
/// \param __a
///    A 512-bit vector of [8 x i64].
/// \param __b
///    A 512-bit vector of [8 x i64].
/// \param __m0
///    A pointer to an 8-bit mask.
/// \param __m1
///    A pointer to an 8-bit mask.
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
  __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
}

#undef __DEFAULT_FN_ATTRS

#endif
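A minimal usage sketch for the 512-bit intersect intrinsics above (not part of the diff), assuming a compiler that accepts -mavx512vp2intersect; values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavx512vp2intersect demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Which 32-bit lanes of a appear anywhere in b, and vice versa?
  __m512i a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15);
  __m512i b = _mm512_set1_epi32(5);
  __mmask16 ka, kb;
  _mm512_2intersect_epi32(a, b, &ka, &kb);
  printf("ka=%#x kb=%#x\n", ka, kb);  // expected: ka=0x20, kb=0xffff
  return 0;
}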
56
third_party/intel/clang/avx512vpopcntdqintrin.h
vendored
Normal file
@@ -0,0 +1,56 @@
/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VPOPCNTDQINTRIN_H
#define __AVX512VPOPCNTDQINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vpopcntdq,evex512"), \
                 __min_vector_width__(512)))

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
  return (__m512i)__builtin_ia32_selectq_512(
      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
  return (__m512i)__builtin_ia32_selectd_512(
      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
}

#undef __DEFAULT_FN_ATTRS

#endif
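A minimal usage sketch for the 512-bit population-count intrinsics above (not part of the diff), assuming a compiler that accepts -mavx512vpopcntdq; values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavx512vpopcntdq demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i v = _mm512_set1_epi64(0xFF00FF00FF00FF00ll);  // 32 set bits per lane
  __m512i c = _mm512_popcnt_epi64(v);                   // per-lane popcount
  long long out[8];
  _mm512_storeu_si512(out, c);
  printf("%lld\n", out[0]);  // expected: 32
  return 0;
}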
95
third_party/intel/clang/avx512vpopcntdqvlintrin.h
vendored
Normal file
@@ -0,0 +1,95 @@
/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics -----------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error \
    "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VPOPCNTDQVLINTRIN_H
#define __AVX512VPOPCNTDQVLINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vpopcntdq,avx512vl,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vpopcntdq,avx512vl,no-evex512"), \
                 __min_vector_width__(256)))

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi64(__m128i __A) {
  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  return (__m128i)__builtin_ia32_selectq_128(
      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi32(__m128i __A) {
  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi64(__m256i __A) {
  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi32(__m256i __A) {
  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif
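A minimal usage sketch for the VL (128/256-bit) population-count forms above, including zero-masking (not part of the diff), assuming -mavx512vpopcntdq -mavx512vl support; values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavx512vpopcntdq -mavx512vl demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi32(1, 3, 7, 15);   // popcounts 1, 2, 3, 4
  // Zero-masking: only lanes 0 and 2 are computed, the rest become 0.
  __m128i c = _mm_maskz_popcnt_epi32(0x5, v);
  int out[4];
  _mm_storeu_si128((__m128i *)out, c);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 0 3 0
  return 0;
}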
177
third_party/intel/clang/avxifmaintrin.h
vendored
Normal file
@@ -0,0 +1,177 @@
/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXIFMAINTRIN_H
#define __AVXIFMAINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
                 __min_vector_width__(256)))

// These intrinsics must be VEX-encoded.

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result to the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
///    A 128-bit vector of [2 x i64] containing the result.
/// \param __X
///    A 128-bit vector of [2 x i64].
/// \param __Y
///    A 128-bit vector of [2 x i64].
/// \param __Z
///    A 128-bit vector of [2 x i64].
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
/// unsigned integer from the intermediate result to the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
///
/// \returns
///    A 256-bit vector of [4 x i64] containing the result.
/// \param __X
///    A 256-bit vector of [4 x i64].
/// \param __Y
///    A 256-bit vector of [4 x i64].
/// \param __Z
///    A 256-bit vector of [4 x i64].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result to the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i
/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
///    A 128-bit vector of [2 x i64] containing the result.
/// \param __X
///    A 128-bit vector of [2 x i64].
/// \param __Y
///    A 128-bit vector of [2 x i64].
/// \param __Z
///    A 128-bit vector of [2 x i64].
///
/// \code{.operation}
/// FOR j := 0 to 1
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
                                                (__v2di)__Z);
}

/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
/// unsigned integer from the intermediate result to the corresponding
/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i
/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
/// \endcode
///
/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
///
/// \returns
///    A 256-bit vector of [4 x i64] containing the result.
/// \param __X
///    A 256-bit vector of [4 x i64].
/// \param __Y
///    A 256-bit vector of [4 x i64].
/// \param __Z
///    A 256-bit vector of [4 x i64].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   i := j*64
///   tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
///   dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
                                                (__v4di)__Z);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXIFMAINTRIN_H
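A minimal usage sketch for the 52-bit multiply-add intrinsics above, in the style of a big-integer limb product (not part of the diff), assuming a compiler that accepts -mavxifma; values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavxifma demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // One step of a 52-bit-limb big-integer product: the low and high halves
  // of y*z are accumulated into separate 64-bit accumulators.
  __m128i y = _mm_set1_epi64x(1ll << 40);
  __m128i z = _mm_set1_epi64x(1ll << 20);   // y*z = 2^60 in the 104-bit domain
  __m128i lo = _mm_madd52lo_avx_epu64(_mm_setzero_si128(), y, z);
  __m128i hi = _mm_madd52hi_avx_epu64(_mm_setzero_si128(), y, z);
  long long l[2], h[2];
  _mm_storeu_si128((__m128i *)l, lo);
  _mm_storeu_si128((__m128i *)h, hi);
  // 2^60 splits as: low 52 bits = 0, bits 103:52 = 2^8 = 256.
  printf("lo=%lld hi=%lld\n", l[0], h[0]);  // expected: lo=0 hi=256
  return 0;
}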
5126
third_party/intel/clang/avxintrin.h
vendored
Normal file
File diff suppressed because it is too large
484
third_party/intel/clang/avxneconvertintrin.h
vendored
Normal file
@@ -0,0 +1,484 @@
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifdef __SSE2__

#ifndef __AVXNECONVERTINTRIN_H
#define __AVXNECONVERTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
                 __min_vector_width__(256)))

/// Convert the scalar BF16 (16-bit) floating-point element
///    stored at the memory location \a __A to a single-precision (32-bit)
///    floating-point element, broadcast it to packed single-precision (32-bit)
///    floating-point elements, and store the results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnebf16_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
}

/// Convert the scalar BF16 (16-bit) floating-point element
///    stored at the memory location \a __A to a single-precision (32-bit)
///    floating-point element, broadcast it to packed single-precision (32-bit)
///    floating-point elements, and store the results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnebf16_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnebf16_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
}

/// Convert the scalar half-precision (16-bit) floating-point element
///    stored at the memory location \a __A to a single-precision (32-bit)
///    floating-point element, broadcast it to packed single-precision (32-bit)
///    floating-point elements, and store the results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 3
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_bcstnesh_ps(const void *__A) {
  return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
}

/// Convert the scalar half-precision (16-bit) floating-point element
///    stored at the memory location \a __A to a single-precision (32-bit)
///    floating-point element, broadcast it to packed single-precision (32-bit)
///    floating-point elements, and store the results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_bcstnesh_ps(const void *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
///
/// \param __A
///    A pointer to a 16-bit memory location. The address of the memory
///    location does not have to be aligned.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
/// FOR j := 0 to 7
///   m := j*32
///   dst[m+31:m] := b
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_bcstnesh_ps(const void *__A) {
  return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneebf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneebf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point even-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneebf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneeph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneeph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point even-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneeph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneeph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneobf16_ps(const __m128bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneobf16_ps(const __m128bh *__A) {
  return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
}

/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    BF16 (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneobf16_ps(const __m256bh *__A) {
  return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneoph_ps(const __m128h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 128-bit memory location containing 8 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 128-bit vector of [4 x float].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtneoph_ps(const __m128h *__A) {
  return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
}

/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
///    stored at memory locations starting at location \a __A to packed
///    single-precision (32-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneoph_ps(const __m256h *__A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
///
/// \param __A
///    A pointer to a 256-bit memory location containing 16 consecutive
///    half-precision (16-bit) floating-point values.
/// \returns
///    A 256-bit vector of [8 x float].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   k := j*2+1
///   i := k*16
///   m := j*32
///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtneoph_ps(const __m256h *__A) {
  return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
///    to packed BF16 (16-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_cvtneps_avx_pbh(__m128 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_cvtneps_avx_pbh(__m128 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
}

/// Convert packed single-precision (32-bit) floating-point elements in \a __A
///    to packed BF16 (16-bit) floating-point elements, and store the results
///    in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_cvtneps_avx_pbh(__m256 __A);
/// \endcode
///
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float].
/// \returns
///    A 128-bit vector of [8 x bfloat].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_cvtneps_avx_pbh(__m256 __A) {
  return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXNECONVERTINTRIN_H
#endif // __SSE2__
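A minimal usage sketch for the even-index BF16 widening conversion above (not part of the diff), assuming a compiler that accepts -mavxneconvert; the raw bit patterns are illustrative:

// Hypothetical demo; build with e.g.: cc -mavxneconvert demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Widen the even-indexed bf16 values of a 128-bit block to floats.
  // The bf16 bit patterns below encode 1.0f, 2.0f, 3.0f, 4.0f at even indices.
  _Alignas(16) unsigned short raw[8] = {0x3F80, 0, 0x4000, 0,
                                        0x4040, 0, 0x4080, 0};
  __m128 f = _mm_cvtneebf16_ps((const __m128bh *)raw);
  float out[4];
  _mm_storeu_ps(out, f);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
  return 0;
}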
473
third_party/intel/clang/avxvnniint16intrin.h
vendored
Normal file
@@ -0,0 +1,473 @@
/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error \
    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint16"), \
                 __min_vector_width__(256)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusd_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusd128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwusds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwusds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate
/// signed 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwusds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwuuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpwuuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
/// unsigned 32-bit results. Sum these 2 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpwuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXVNNIINT16INTRIN_H
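A minimal usage sketch for the mixed-sign 16-bit dot-product intrinsics above (not part of the diff), assuming a compiler that accepts -mavxvnniint16; values are illustrative:

// Hypothetical demo; build with e.g.: cc -mavxvnniint16 demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Signed-times-unsigned 16-bit dot products accumulated into 32-bit lanes.
  __m128i w = _mm_setzero_si128();
  __m128i a = _mm_set1_epi16(-2);          // signed words
  __m128i b = _mm_set1_epi16(10);          // unsigned words
  __m128i d = _mm_dpwsud_epi32(w, a, b);   // each lane: (-2*10) + (-2*10) = -40
  int out[4];
  _mm_storeu_si128((__m128i *)out, d);
  printf("%d\n", out[0]);                  // expected: -40
  return 0;
}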
471
third_party/intel/clang/avxvnniint8intrin.h
vendored
Normal file
@@ -0,0 +1,471 @@
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error \
|
||||
"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __AVXVNNIINT8INTRIN_H
|
||||
#define __AVXVNNIINT8INTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS256 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(256)))
|
||||
#define __DEFAULT_FN_ATTRS128 \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
|
||||
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
|
||||
/// signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [16 x char].
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [16 x char].
|
||||
/// \returns
|
||||
/// A 128-bit vector of [4 x int].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// FOR j := 0 to 3
|
||||
/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
|
||||
/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
|
||||
/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
|
||||
/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
|
||||
/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
|
||||
/// ENDFOR
|
||||
/// dst[MAX:128] := 0
|
||||
/// \endcode
|
||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
|
||||
__m128i __A,
|
||||
__m128i __B) {
|
||||
return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
|
||||
(__v4si)__B);
|
||||
}
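
/* Worked example (editorial note, not part of the upstream header): with
   __A.byte[0..3] = {1, 2, 3, 4}, __B.byte[0..3] = {5, 6, 7, 8}, and
   __W.dword[0] = 10, the first lane is 10 + 1*5 + 2*6 + 3*7 + 4*8 = 80. */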

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///    tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///    tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///    tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///    dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///    tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///    tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///    tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///    dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}
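
/* Worked example (editorial note): the saturating form differs only at the
   extremes. With __W.dword[0] = 0x7FFFFFF0 and lane products summing to 100,
   the result clamps to 0x7FFFFFFF instead of wrapping negative. */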

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///    tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///    tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///    tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///    dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///    dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
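
/* Worked example (editorial note): __A is signed, __B unsigned. With
   __A.byte[0..3] = {-1, 2, 0, 0} and __B.byte[0..3] = {200, 3, 0, 0},
   the first lane accumulates (-1)*200 + 2*3 = -194 into __W.dword[0]. */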

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///    dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///    dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// signed 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with signed saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///    dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///    tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///    tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///    tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///    dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
                                                                 __m128i __A,
                                                                 __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
                                             (__v4si)__B);
}
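
/* Worked example (editorial note): both inputs are unsigned, so byte value
   0xFF counts as 255. With __A.byte[0..3] = {255, 255, 0, 0} and
   __B.byte[0..3] = {255, 255, 0, 0}, lane 0 gains 2 * 255 * 255 = 130050. */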

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///    tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///    tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///    tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///    dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
                                             (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuuds_epi32( __m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///    tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///    tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///    tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///    dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
                                                                  __m128i __A,
                                                                  __m128i __B) {
  return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
                                              (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
/// unsigned 16-bit results. Sum these 4 results with the corresponding
/// 32-bit integer in \a __W with unsigned saturation, and store the packed
/// 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///    tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///    tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///    tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///    dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
                                              (__v8si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXVNNIINT8INTRIN_H
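
A usage sketch for the byte dot-product family above (illustrative only, not
from the upstream header; assumes AVX-VNNI-INT8 support, e.g. compile with
-mavxvnniint8). A 16-element signed dot product reduces to four dword partial
sums with a single instruction:

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int8_t a[16], b[16];
  for (int i = 0; i < 16; ++i) {
    a[i] = (int8_t)(i - 8);  // signed bytes, some negative
    b[i] = (int8_t)(i + 1);
  }
  __m128i va = _mm_loadu_si128((const __m128i *)a);
  __m128i vb = _mm_loadu_si128((const __m128i *)b);
  // Lane j accumulates a[4j..4j+3] . b[4j..4j+3] on top of the zero vector.
  __m128i vacc = _mm_dpbssd_epi32(_mm_setzero_si128(), va, vb);
  int32_t acc[4];
  _mm_storeu_si128((__m128i *)acc, vacc);
  printf("%d\n", acc[0] + acc[1] + acc[2] + acc[3]);  // the full dot product
  return 0;
}
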
225
third_party/intel/clang/avxvnniintrin.h
vendored
Normal file
225
third_party/intel/clang/avxvnniintrin.h
vendored
Normal file
@ -0,0 +1,225 @@
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H

/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)

/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
///    DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
///    DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///    tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///    DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///    tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///    tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///    DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endcode
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
///    DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
///    tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
///    tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
///    tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
///    DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///    tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///    DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 3
///    tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///    tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///    DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endcode
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXVNNIINTRIN_H
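
An illustrative sketch of the AVX-VNNI kernel shape (not from the upstream
header; assumes -mavxvnni, and dot_u8s8 is a made-up helper name). The
u8 x s8 form is the one quantized neural-network inner loops typically use:

#include <immintrin.h>
#include <stdint.h>

// Accumulate n bytes (n a multiple of 32) of u8 activations against s8
// weights into 8 dword partial sums; the caller reduces the lanes afterwards.
static inline __m256i dot_u8s8(const uint8_t *act, const int8_t *wgt, int n,
                               __m256i acc) {
  for (int i = 0; i < n; i += 32) {
    __m256i va = _mm256_loadu_si256((const __m256i *)(act + i));
    __m256i vw = _mm256_loadu_si256((const __m256i *)(wgt + i));
    acc = _mm256_dpbusd_avx_epi32(acc, va, vw);  // acc[j] += va . vw, per lane
  }
  return acc;
}
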
255
third_party/intel/clang/bmi2intrin.h
vendored
Normal file
255
third_party/intel/clang/bmi2intrin.h
vendored
Normal file
@ -0,0 +1,255 @@
/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <bmi2intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __BMI2INTRIN_H
#define __BMI2INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))

/// Copies the unsigned 32-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 32
///    result[31:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
///    The 32-bit source value to copy.
/// \param __Y
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 32-bit value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bzhi_u32(unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_bzhi_si(__X, __Y);
}
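
/* Worked example (editorial note): _bzhi_u32(0xFFFFFFFF, 8) keeps only the
   low 8 bits, giving 0xFF; a start index of 32 or more returns __X as-is. */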

/// Deposit (scatter) low-order bits from the unsigned 32-bit integer \a __X
/// into the 32-bit result, according to the mask in the unsigned 32-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
///    IF __Y[m] == 1
///      result[m] := __X[i]
///      i := i + 1
///    ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
///    The 32-bit source value to copy.
/// \param __Y
///    The 32-bit mask specifying where to deposit source bits.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pdep_u32(unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_pdep_si(__X, __Y);
}
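
/* Worked example (editorial note): _pdep_u32(0x5, 0x1A) scatters the low
   source bits 1, 0, 1 into mask bit positions 1, 3 and 4, giving 0x12. */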

/// Extract (gather) bits from the unsigned 32-bit integer \a __X into the
/// low-order bits of the 32-bit result, according to the mask in the
/// unsigned 32-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 31
///    IF __Y[m] == 1
///      result[i] := __X[m]
///      i := i + 1
///    ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
///    The 32-bit source value to copy.
/// \param __Y
///    The 32-bit mask specifying which source bits to extract.
/// \returns The 32-bit result.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_pext_u32(unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_pext_si(__X, __Y);
}
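
/* Worked example (editorial note): _pext_u32 inverts the deposit above:
   _pext_u32(0x12, 0x1A) gathers mask positions 1, 3 and 4 back to 0x5. */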

/// Multiplies the unsigned 32-bit integers \a __X and \a __Y to form a
/// 64-bit product. Stores the upper 32 bits of the product in the
/// memory at \a __P and returns the lower 32 bits.
///
/// \code{.operation}
/// Store32(__P, (__X * __Y)[63:32])
/// result := (__X * __Y)[31:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
///    An unsigned 32-bit multiplicand.
/// \param __Y
///    An unsigned 32-bit multiplicand.
/// \param __P
///    A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int)(__res >> 32);
  return (unsigned int)__res;
}
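
/* Worked example (editorial note): _mulx_u32(0x80000000, 4, &hi) computes
   the 64-bit product 0x200000000, stores hi = 2, and returns low = 0. */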

#ifdef __x86_64__

/// Copies the unsigned 64-bit integer \a __X and zeroes the upper bits
/// starting at bit number \a __Y.
///
/// \code{.operation}
/// i := __Y[7:0]
/// result := __X
/// IF i < 64
///    result[63:i] := 0
/// FI
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c BZHI instruction.
///
/// \param __X
///    The 64-bit source value to copy.
/// \param __Y
///    The lower 8 bits specify the bit number of the lowest bit to zero.
/// \returns The partially zeroed 64-bit value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bzhi_u64(unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_bzhi_di(__X, __Y);
}

/// Deposit (scatter) low-order bits from the unsigned 64-bit integer \a __X
/// into the 64-bit result, according to the mask in the unsigned 64-bit
/// integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
///    IF __Y[m] == 1
///      result[m] := __X[i]
///      i := i + 1
///    ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PDEP instruction.
///
/// \param __X
///    The 64-bit source value to copy.
/// \param __Y
///    The 64-bit mask specifying where to deposit source bits.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pdep_u64(unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_pdep_di(__X, __Y);
}

/// Extract (gather) bits from the unsigned 64-bit integer \a __X into the
/// low-order bits of the 64-bit result, according to the mask in the
/// unsigned 64-bit integer \a __Y. All other bits of the result are zero.
///
/// \code{.operation}
/// i := 0
/// result := 0
/// FOR m := 0 TO 63
///    IF __Y[m] == 1
///      result[i] := __X[m]
///      i := i + 1
///    ENDIF
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c PEXT instruction.
///
/// \param __X
///    The 64-bit source value to copy.
/// \param __Y
///    The 64-bit mask specifying which source bits to extract.
/// \returns The 64-bit result.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_pext_u64(unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_pext_di(__X, __Y);
}

/// Multiplies the unsigned 64-bit integers \a __X and \a __Y to form a
/// 128-bit product. Stores the upper 64 bits of the product to the
/// memory addressed by \a __P and returns the lower 64 bits.
///
/// \code{.operation}
/// Store64(__P, (__X * __Y)[127:64])
/// result := (__X * __Y)[63:0]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c MULX instruction.
///
/// \param __X
///    An unsigned 64-bit multiplicand.
/// \param __Y
///    An unsigned 64-bit multiplicand.
/// \param __P
///    A pointer to memory for storing the upper half of the product.
/// \returns The lower half of the product.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS

#endif /* __BMI2INTRIN_H */
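
A short sketch tying the BMI2 primitives together (illustrative only, not
from the upstream header; assumes a BMI2-capable target, e.g. -mbmi2):
PDEP/PEXT make 2-D Morton (Z-order) encode and decode one instruction per
coordinate.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t x = 5, y = 3;
  // Encode: x takes the even bit positions, y the odd ones.
  uint32_t morton = _pdep_u32(x, 0x55555555u) | _pdep_u32(y, 0xAAAAAAAAu);
  // Decode: gather the even/odd bits back out.
  uint32_t dx = _pext_u32(morton, 0x55555555u);
  uint32_t dy = _pext_u32(morton, 0xAAAAAAAAu);
  printf("morton=%u x=%u y=%u\n", morton, dx, dy);  // morton=27 x=5 y=3
  return 0;
}
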
614
third_party/intel/clang/bmiintrin.h
vendored
Normal file
614
third_party/intel/clang/bmiintrin.h
vendored
Normal file
@ -0,0 +1,614 @@
/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __BMIINTRIN_H
#define __BMIINTRIN_H

/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
   instruction behaves as BSF on non-BMI targets, there is code that expects
   to use it as a potentially faster version of BSF. */
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see _tzcnt_u16
static __inline__ unsigned short __RELAXED_FN_ATTRS
__tzcnt_u16(unsigned short __X)
{
  return __builtin_ia32_tzcnt_u16(__X);
}

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _tzcnt_u16(unsigned short __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 16-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see __tzcnt_u16
#define _tzcnt_u16 __tzcnt_u16

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see { _mm_tzcnt_32 _tzcnt_u32 }
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
  return __builtin_ia32_tzcnt_u32(__X);
}

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns A 32-bit integer containing the number of trailing zero bits in
///    the operand.
/// \see { __tzcnt_u32 _tzcnt_u32 }
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
  return (int)__builtin_ia32_tzcnt_u32(__X);
}
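
/* Worked example (editorial note): __tzcnt_u32(0x10) is 4. Unlike BSF, the
   count is defined for a zero input: __tzcnt_u32(0) is 32. */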

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _tzcnt_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see { _mm_tzcnt_32 __tzcnt_u32 }
#define _tzcnt_u32 __tzcnt_u32

#ifdef __x86_64__

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see { _mm_tzcnt_64 _tzcnt_u64 }
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
  return __builtin_ia32_tzcnt_u64(__X);
}

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns A 64-bit integer containing the number of trailing zero bits in
///    the operand.
/// \see { __tzcnt_u64 _tzcnt_u64 }
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
  return (long long)__builtin_ia32_tzcnt_u64(__X);
}

/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _tzcnt_u64(unsigned long long __X);
/// \endcode
///
/// This intrinsic corresponds to the \c TZCNT instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
///    bits in the operand.
/// \see { _mm_tzcnt_64 __tzcnt_u64 }
#define _tzcnt_u64 __tzcnt_u64

#endif /* __x86_64__ */

#undef __RELAXED_FN_ATTRS

#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI__)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))

/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
///    An unsigned integer containing one of the operands.
/// \param __Y
///    An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
///    operand with the one's complement of the first operand.
/// \see _andn_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__andn_u32(unsigned int __X, unsigned int __Y)
{
  return ~__X & __Y;
}

/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y);
/// \endcode
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
///    An unsigned integer containing one of the operands.
/// \param __Y
///    An unsigned integer containing one of the operands.
/// \returns An unsigned integer containing the bitwise AND of the second
///    operand with the one's complement of the first operand.
/// \see __andn_u32
#define _andn_u32 __andn_u32

/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned integer whose bits are to be extracted.
/// \param __Y
///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
///    specify the index of the least significant bit. Bits [15:8] specify the
///    number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
///    extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
  return __builtin_ia32_bextr_u32(__X, __Y);
}
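
/* Worked example (editorial note): __bextr_u32(0xABCD, 0x0804) uses start 4
   (bits [7:0]) and length 8 (bits [15:8]), extracting 0xBC. */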

/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned integer whose bits are to be extracted.
/// \param __Y
///    An unsigned integer used to specify the index of the least significant
///    bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
///    An unsigned integer used to specify the number of bits to be extracted.
///    Bits [7:0] specify the number of bits.
/// \returns An unsigned integer whose least significant bits contain the
///    extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned integer whose bits are to be extracted.
/// \param __Y
///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
///    specify the index of the least significant bit. Bits [15:8] specify the
///    number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
///    extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr2_u32(unsigned int __X, unsigned int __Y) {
  return __builtin_ia32_bextr_u32(__X, __Y);
}

/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
///    An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
///    the source operand.
/// \see _blsi_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsi_u32(unsigned int __X)
{
  return __X & -__X;
}

/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsi_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
///    An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from
///    the source operand.
/// \see __blsi_u32
#define _blsi_u32 __blsi_u32

/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
///    An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
/// \see _blsmsk_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsmsk_u32(unsigned int __X)
{
  return __X ^ (__X - 1);
}

/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsmsk_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSMSK instruction.
///
/// \param __X
///    An unsigned integer used to create the mask.
/// \returns An unsigned integer containing the newly created mask.
/// \see __blsmsk_u32
#define _blsmsk_u32 __blsmsk_u32

/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
///    An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
///    operand.
/// \see _blsr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__blsr_u32(unsigned int __X)
{
  return __X & (__X - 1);
}
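
/* Worked example (editorial note): for __X = 0xB4 (0b10110100),
   __blsi_u32 isolates the lowest set bit (0x04), __blsmsk_u32 masks through
   it (0x07), and __blsr_u32 clears it (0xB0). */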

/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _blsr_u32(unsigned int __X);
/// \endcode
///
/// This intrinsic corresponds to the \c BLSR instruction.
///
/// \param __X
///    An unsigned integer containing the operand to be cleared.
/// \returns An unsigned integer containing the result of clearing the source
///    operand.
/// \see __blsr_u32
#define _blsr_u32 __blsr_u32
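
/* Usage sketch (editorial note, not part of the upstream header; assumes
   -mbmi): parsing a packed flag word with the 32-bit helpers above.

       unsigned int field = _bextr_u32(flags, 4, 8);  // bits [11:4]
       while (flags) {
         unsigned int bit = __blsi_u32(flags);        // visit lowest set bit
         flags = __blsr_u32(flags);                   // then clear it
       }
*/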

#ifdef __x86_64__

/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
///    An unsigned 64-bit integer containing one of the operands.
/// \param __Y
///    An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
///    operand with the one's complement of the first operand.
/// \see _andn_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__andn_u64 (unsigned long long __X, unsigned long long __Y)
{
  return ~__X & __Y;
}

/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _andn_u64(unsigned long long __X,
///                              unsigned long long __Y);
/// \endcode
///
/// This intrinsic corresponds to the \c ANDN instruction.
///
/// \param __X
///    An unsigned 64-bit integer containing one of the operands.
/// \param __Y
///    An unsigned 64-bit integer containing one of the operands.
/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
///    operand with the one's complement of the first operand.
/// \see __andn_u64
#define _andn_u64 __andn_u64

/* AMD-specified, double-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
///    the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
///    extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
  return __builtin_ia32_bextr_u64(__X, __Y);
}

/* Intel-specified, single-leading-underscore version of BEXTR */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
///    An unsigned integer used to specify the index of the least significant
///    bit for the bits to be extracted. Bits [7:0] specify the index.
/// \param __Z
///    An unsigned integer used to specify the number of bits to be extracted.
///    Bits [7:0] specify the number of bits.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
///    extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}

/* Intel-specified, single-leading-underscore version of BEXTR2 */
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BEXTR instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose bits are to be extracted.
/// \param __Y
///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
///    the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
///    extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
  return __builtin_ia32_bextr_u64(__X, __Y);
}

/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BLSI instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose bits are to be cleared.
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// bits from the source operand.
|
||||
/// \see _blsi_u64
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsi_u64(unsigned long long __X)
|
||||
{
|
||||
return __X & -__X;
|
||||
}
|
||||
|
||||
/// Clears all bits in the source except for the least significant bit
|
||||
/// containing a value of 1 and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsi_u64(unsigned long long __X);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c BLSI instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer whose bits are to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// bits from the source operand.
|
||||
/// \see __blsi_u64
|
||||
#define _blsi_u64 __blsi_u64
|
||||
|
||||
/// Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c BLSMSK instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns An unsigned 64-bit integer containing the newly created mask.
|
||||
/// \see _blsmsk_u64
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsmsk_u64(unsigned long long __X)
|
||||
{
|
||||
return __X ^ (__X - 1);
|
||||
}
|
||||
|
||||
/// Creates a mask whose bits are set to 1, using bit 0 up to and
|
||||
/// including the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsmsk_u64(unsigned long long __X);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c BLSMSK instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer used to create the mask.
|
||||
/// \returns An unsigned 64-bit integer containing the newly created mask.
|
||||
/// \see __blsmsk_u64
|
||||
#define _blsmsk_u64 __blsmsk_u64
|
||||
|
||||
/// Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c BLSR instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer containing the operand to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// source operand.
|
||||
/// \see _blsr_u64
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
|
||||
__blsr_u64(unsigned long long __X)
|
||||
{
|
||||
return __X & (__X - 1);
|
||||
}
|
||||
|
||||
/// Clears the least significant bit that is set to 1 in the source
|
||||
/// operand and returns the result.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// \code
|
||||
/// unsigned long long _blsr_u64(unsigned long long __X);
|
||||
/// \endcode
|
||||
///
|
||||
/// This intrinsic corresponds to the \c BLSR instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// An unsigned 64-bit integer containing the operand to be cleared.
|
||||
/// \returns An unsigned 64-bit integer containing the result of clearing the
|
||||
/// source operand.
|
||||
/// \see __blsr_u64
|
||||
#define _blsr_u64 __blsr_u64
|
||||
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__BMI__) */
|
||||
|
||||
#endif /* __BMIINTRIN_H */
|
||||
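A minimal usage sketch (not part of the commit) showing what these BMI bit-manipulation intrinsics compute; it assumes an x86-64 target compiled with -mbmi:

```c
/* Sketch only: assumes x86-64 compiled with -mbmi. */
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned int x = 44;                          /* 0b101100 */
  printf("%u\n", _blsr_u32(x));                 /* 40   -> lowest set bit cleared   */
  printf("%u\n", _blsmsk_u32(x));               /* 7    -> mask through lowest set bit */
  printf("%llu\n", _blsi_u64(x));               /* 4    -> lowest set bit isolated  */
  printf("%llu\n", _andn_u64(0xF0, 0xFF));      /* 15   -> ~0xF0 & 0xFF             */
  printf("%#llx\n", _bextr_u64(0xABCD, 4, 8));  /* 0xbc -> 8 bits starting at bit 4 */
  return 0;
}
```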
115
third_party/intel/clang/cetintrin.h
vendored
Normal file
@@ -0,0 +1,115 @@
/*===---- cetintrin.h - CET intrinsic --------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __CETINTRIN_H
#define __CETINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("shstk")))

static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
  __builtin_ia32_incsspd((unsigned int)__a);
}

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
  __builtin_ia32_incsspq(__a);
}
#endif /* __x86_64__ */

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
  __builtin_ia32_incsspq(__a);
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
  __builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */

static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
  return __builtin_ia32_rdsspd(__a);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
  unsigned int t;
  return __builtin_ia32_rdsspd(t);
#pragma clang diagnostic pop
}

#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
  return __builtin_ia32_rdsspq(__a);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wuninitialized"
  unsigned long long t;
  return __builtin_ia32_rdsspq(t);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */

#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
  return __builtin_ia32_rdsspq(0);
}
#else /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
  return __builtin_ia32_rdsspd(0);
}
#endif /* __x86_64__ */

static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
  __builtin_ia32_saveprevssp();
}

static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
  __builtin_ia32_rstorssp(__p);
}

static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
  __builtin_ia32_wrssd(__a, __p);
}

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
  __builtin_ia32_wrssq(__a, __p);
}
#endif /* __x86_64__ */

static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
  __builtin_ia32_wrussd(__a, __p);
}

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
  __builtin_ia32_wrussq(__a, __p);
}
#endif /* __x86_64__ */

static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
  __builtin_ia32_setssbsy();
}

static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
  __builtin_ia32_clrssbsy(__p);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __CETINTRIN_H */
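A small sketch (not part of the commit) of reading the shadow-stack pointer; it assumes x86-64 with -mshstk. On hardware where CET shadow stacks are not enabled, RDSSP behaves as a no-op and leaves its operand unchanged, so the 0 seed comes back:

```c
/* Sketch only: assumes x86-64 compiled with -mshstk. */
#include <stdio.h>
#include <immintrin.h>

int main(void) {
  /* With shadow stacks inactive, RDSSPQ is a no-op, so _get_ssp()
     returns the 0 it seeds the destination register with. */
  unsigned long long ssp = _get_ssp();
  printf("shadow stack pointer: %#llx\n", ssp);
  return 0;
}
```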
36
third_party/intel/clang/cldemoteintrin.h
vendored
Normal file
@@ -0,0 +1,36 @@
/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __CLDEMOTEINTRIN_H
#define __CLDEMOTEINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("cldemote")))

/// Hint to hardware that the cache line that contains \p __P should be demoted
/// from the cache closest to the processor core to a level more distant from
/// the processor core.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS
_cldemote(const void * __P) {
  __builtin_ia32_cldemote(__P);
}

#define _mm_cldemote(p) _cldemote(p)
#undef __DEFAULT_FN_ATTRS

#endif
36
third_party/intel/clang/clflushoptintrin.h
vendored
Normal file
@@ -0,0 +1,36 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __CLFLUSHOPTINTRIN_H
#define __CLFLUSHOPTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))

/// Invalidates all levels of the cache hierarchy and flushes modified data to
/// memory for the cache line specified by the address \a __m.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CLFLUSHOPT instruction.
///
/// \param __m
///    An address within the cache line to flush and invalidate.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflushopt(void const * __m) {
  __builtin_ia32_clflushopt(__m);
}

#undef __DEFAULT_FN_ATTRS

#endif
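A hedged sketch (not part of the commit) of flushing a specific cache line after a store, assuming -mclflushopt; CLFLUSHOPT is weakly ordered, so a fence follows it:

```c
/* Sketch only: assumes x86-64 compiled with -mclflushopt. */
#include <immintrin.h>

static int shared_flag;

void publish_flag(void) {
  shared_flag = 1;
  _mm_clflushopt(&shared_flag); /* push the line out of every cache level */
  _mm_sfence();                 /* order the flush against later stores   */
}
```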
38
third_party/intel/clang/clwbintrin.h
vendored
Normal file
@@ -0,0 +1,38 @@
/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __CLWBINTRIN_H
#define __CLWBINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))

/// Writes back to memory the cache line (if modified) that contains the
/// linear address specified in \a __p from any level of the cache hierarchy in
/// the cache coherence domain.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> CLWB </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    written back.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clwb(void const *__p) {
  __builtin_ia32_clwb(__p);
}

#undef __DEFAULT_FN_ATTRS

#endif
38
third_party/intel/clang/clzerointrin.h
vendored
Normal file
@@ -0,0 +1,38 @@
/*===----------------------- clzerointrin.h - CLZERO ----------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __X86INTRIN_H
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __CLZEROINTRIN_H
#define __CLZEROINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("clzero")))

/// Zeroes out the cache line for the address \a __line. This uses a
/// non-temporal store. Calling \c _mm_sfence() afterward might be needed
/// to enforce ordering.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CLZERO instruction.
///
/// \param __line
///    An address within the cache line to zero out.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_clzero (void * __line)
{
  __builtin_ia32_clzero ((void *)__line);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __CLZEROINTRIN_H */
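A brief sketch (not part of the commit): because CLZERO zeroes the entire line containing the address, the pointer is typically aligned down first. This assumes an AMD x86-64 target with -mclzero and a 64-byte line size:

```c
/* Sketch only: assumes an AMD x86-64 target compiled with -mclzero. */
#include <stdint.h>
#include <x86intrin.h>

#define LINE 64 /* assumed cache-line size */

void zero_line(void *p) {
  /* Align down to the start of the cache line containing p. */
  void *line = (void *)((uintptr_t)p & ~(uintptr_t)(LINE - 1));
  _mm_clzero(line);
  _mm_sfence(); /* the store is non-temporal; fence to enforce ordering */
}
```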
70
third_party/intel/clang/cmpccxaddintrin.h
vendored
Normal file
@@ -0,0 +1,70 @@
/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __X86GPRINTRIN_H
#error \
    "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __CMPCCXADDINTRIN_H
#define __CMPCCXADDINTRIN_H
#ifdef __x86_64__

typedef enum {
  _CMPCCX_O,   /* Overflow. */
  _CMPCCX_NO,  /* No overflow. */
  _CMPCCX_B,   /* Below. */
  _CMPCCX_NB,  /* Not below. */
  _CMPCCX_Z,   /* Zero. */
  _CMPCCX_NZ,  /* Not zero. */
  _CMPCCX_BE,  /* Below or equal. */
  _CMPCCX_NBE, /* Neither below nor equal. */
  _CMPCCX_S,   /* Sign. */
  _CMPCCX_NS,  /* No sign. */
  _CMPCCX_P,   /* Parity. */
  _CMPCCX_NP,  /* No parity. */
  _CMPCCX_L,   /* Less. */
  _CMPCCX_NL,  /* Not less. */
  _CMPCCX_LE,  /* Less or equal. */
  _CMPCCX_NLE, /* Neither less nor equal. */
} _CMPCCX_ENUM;

/// Compares the value from the memory __A with the value of __B. If the
/// specified condition __D is met, then add the third operand __C to the
/// __A and write it into __A, else the value of __A is unchanged. The return
/// value is the original value of __A.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c CMPCCXADD instructions.
///
/// \param __A
///    A pointer specifying the memory address.
///
/// \param __B
///    An integer operand.
///
/// \param __C
///    An integer operand.
///
/// \param __D
///    The specified condition.
///
/// \returns An integer containing the original value of the first operand.

#define _cmpccxadd_epi32(__A, __B, __C, __D)                                  \
  ((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C),    \
                                    (int)(__D))))

#define _cmpccxadd_epi64(__A, __B, __C, __D)                                  \
  ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B),    \
                                          (long long)(__C), (int)(__D))))

#endif // __x86_64__
#endif // __CMPCCXADDINTRIN_H
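A hedged sketch (not part of the commit) of the compare-and-conditionally-add pattern these macros expose: atomically add 1 to a counter only while it is below a cap. Assumes a CPU that reports CMPCCXADD and -mcmpccxadd:

```c
/* Sketch only: assumes x86-64 compiled with -mcmpccxadd. */
#include <stdio.h>
#include <x86gprintrin.h>

int bounded_increment(int *counter, int cap) {
  /* Compares *counter with cap; if *counter < cap (signed "less"),
     atomically adds 1. Either way the original value is returned. */
  return _cmpccxadd_epi32(counter, cap, 1, _CMPCCX_L);
}

int main(void) {
  int c = 9;
  int old = bounded_increment(&c, 10); /* old == 9,  c becomes 10 */
  old = bounded_increment(&c, 10);     /* old == 10, c stays 10   */
  printf("%d %d\n", old, c);
  return 0;
}
```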
100
third_party/intel/clang/crc32intrin.h
vendored
Normal file
@@ -0,0 +1,100 @@
/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CRC32INTRIN_H
#define __CRC32INTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("crc32")))

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

/// Adds the first unsigned integer operand to the CRC-32C checksum of
///    the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS

#endif /* __CRC32INTRIN_H */
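A small sketch (not part of the commit) of accumulating a CRC-32C over a byte buffer, assuming -msse4.2 or -mcrc32; note this is the Castagnoli polynomial, not zlib's CRC-32:

```c
/* Sketch only: assumes x86-64 compiled with -msse4.2 (or -mcrc32). */
#include <stddef.h>
#include <x86intrin.h>

unsigned int crc32c(const unsigned char *p, size_t n) {
  unsigned int crc = 0xFFFFFFFFu; /* conventional initial value */
  while (n--)
    crc = _mm_crc32_u8(crc, *p++);
  return ~crc; /* conventional final inversion */
}
```

Processing eight bytes at a time with _mm_crc32_u64 and falling back to the byte form for the tail would be the usual throughput optimization.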
4906
third_party/intel/clang/emmintrin.h
vendored
Normal file
File diff suppressed because it is too large
63
third_party/intel/clang/enqcmdintrin.h
vendored
Normal file
@@ -0,0 +1,63 @@
/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __ENQCMDINTRIN_H
#define __ENQCMDINTRIN_H

/* Define the default attributes for the functions in this file */
#define _DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))

/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in user mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
///
/// \param __dst
///    Pointer to the destination of the enqueue store.
/// \param __src
///    Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
///    returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmd (void *__dst, const void *__src)
{
  return __builtin_ia32_enqcmd(__dst, __src);
}

/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
/// store data, and performs a 64-byte enqueue store to the memory pointed to
/// by \a __dst. This intrinsic may only be used in privileged mode.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
///
/// \param __dst
///    Pointer to the destination of the enqueue store.
/// \param __src
///    Pointer to 64-byte command data.
/// \returns If the command data is successfully written to \a __dst then 0 is
///    returned. Otherwise 1 is returned.
static __inline__ int _DEFAULT_FN_ATTRS
_enqcmds (void *__dst, const void *__src)
{
  return __builtin_ia32_enqcmds(__dst, __src);
}

#undef _DEFAULT_FN_ATTRS

#endif /* __ENQCMDINTRIN_H */
162
third_party/intel/clang/f16cintrin.h
vendored
Normal file
@@ -0,0 +1,162 @@
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))

/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
 * but that's because icc can emulate these without f16c using a library call.
 * Since we don't do that let's leave these in f16cintrin.h.
 */

/// Converts a 16-bit half-precision float value into a 32-bit float
///    value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
///    A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
  __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
  __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
  return __r[0];
}

/// Converts a 32-bit single-precision float value to a 16-bit
///    half-precision float value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 32-bit single-precision float value to be converted to a 16-bit
///    half-precision float value.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \
  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
                                                     (imm)))[0]); })

/// Converts a 128-bit vector containing 32-bit float values into a
///    128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 128-bit vector containing 32-bit float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
///    values. The lower 64 bits are used to store the converted 16-bit
///    half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))

/// Converts a 128-bit vector containing 16-bit half-precision float
///    values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector containing 16-bit half-precision float values. The lower
///    64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}

/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
///    containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 256-bit vector containing 32-bit single-precision float values to be
///    converted to 16-bit half-precision float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
///    float values.
#define _mm256_cvtps_ph(a, imm) \
  ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))

/// Converts a 128-bit vector containing 16-bit half-precision float
///    values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
///    A 128-bit vector containing 16-bit half-precision float values to be
///    converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
///    single-precision float values.
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __F16CINTRIN_H */
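A quick round-trip sketch (not part of the commit), assuming -mf16c; 0.5f survives exactly because it is representable in half precision:

```c
/* Sketch only: assumes x86-64 compiled with -mf16c. */
#include <stdio.h>
#include <immintrin.h>

int main(void) {
  unsigned short h = _cvtss_sh(0.5f, 0); /* imm 0 = round to nearest */
  float f = _cvtsh_ss(h);
  printf("%#06x -> %f\n", h, f);         /* 0x3800 -> 0.500000 */
  return 0;
}
```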
218
third_party/intel/clang/fma4intrin.h
vendored
Normal file
@@ -0,0 +1,218 @@
/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86INTRIN_H
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __FMA4INTRIN_H
#define __FMA4INTRIN_H

#include "pmmintrin.h"

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __FMA4INTRIN_H */
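A tiny sketch (not part of the commit): _mm_macc_ps computes a*b+c per lane. FMA4 is specific to older AMD parts, so this assumes a target compiled with -mfma4:

```c
/* Sketch only: assumes an AMD x86-64 target compiled with -mfma4. */
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  __m128 a = _mm_set1_ps(2.0f);
  __m128 b = _mm_set1_ps(3.0f);
  __m128 c = _mm_set1_ps(1.0f);
  __m128 r = _mm_macc_ps(a, b, c); /* each lane: 2*3 + 1 = 7 */
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%f\n", out[0]);          /* 7.000000 */
  return 0;
}
```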
796
third_party/intel/clang/fmaintrin.h
vendored
Normal file
796
third_party/intel/clang/fmaintrin.h
vendored
Normal file
|
|
@ -0,0 +1,796 @@
|
|||
/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
|
||||
*
|
||||
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
* See https://llvm.org/LICENSE.txt for license information.
|
||||
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
*
|
||||
*===-----------------------------------------------------------------------===
|
||||
*/
|
||||
|
||||
#ifndef __IMMINTRIN_H
|
||||
#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
|
||||
#ifndef __FMAINTRIN_H
|
||||
#define __FMAINTRIN_H
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
|
||||
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
|
||||
|
||||
/// Computes a multiply-add of 128-bit vectors of [4 x float].
|
||||
/// For each element, computes <c> (__A * __B) + __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the addend.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the result.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a multiply-add of 128-bit vectors of [2 x double].
|
||||
/// For each element, computes <c> (__A * __B) + __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the addend.
|
||||
/// \returns A 128-bit [2 x double] vector containing the result.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar multiply-add of the single-precision values in the
|
||||
/// low 32 bits of 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
|
||||
/// result[127:32] = __A[127:32]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMADD213SS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
|
||||
/// 32 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier in the low
|
||||
/// 32 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the addend in the low
|
||||
/// 32 bits.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the result in the low
|
||||
/// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar multiply-add of the double-precision values in the
|
||||
/// low 64 bits of 128-bit vectors of [2 x double].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
|
||||
/// result[127:64] = __A[127:64]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMADD213SD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
|
||||
/// 64 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier in the low
|
||||
/// 64 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the addend in the low
|
||||
/// 64 bits.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the result in the low
|
||||
/// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
|
||||
/// For each element, computes <c> (__A * __B) - __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the subtrahend.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the result.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
|
||||
/// For each element, computes <c> (__A * __B) - __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the addend.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the result.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar multiply-subtract of the single-precision values in
|
||||
/// the low 32 bits of 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
|
||||
/// result[127:32] = __A[127:32]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
|
||||
/// 32 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier in the low
|
||||
/// 32 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the subtrahend in the low
|
||||
/// 32 bits.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the result in the low
|
||||
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar multiply-subtract of the double-precision values in
|
||||
/// the low 64 bits of 128-bit vectors of [2 x double].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
|
||||
/// result[127:64] = __A[127:64]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
|
||||
/// 64 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier in the low
|
||||
/// 64 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the subtrahend in the low
|
||||
/// 64 bits.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the result in the low
|
||||
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
|
||||
}
|
||||
|
||||
/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
|
||||
/// For each element, computes <c> -(__A * __B) + __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the addend.
|
||||
/// \returns A 128-bit [4 x float] vector containing the result.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
|
||||
/// For each element, computes <c> -(__A * __B) + __C </c>.
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the addend.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the result.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar negated multiply-add of the single-precision values in
|
||||
/// the low 32 bits of 128-bit vectors of [4 x float].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
|
||||
/// result[127:32] = __A[127:32]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [4 x float] containing the multiplicand in the low
|
||||
/// 32 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [4 x float] containing the multiplier in the low
|
||||
/// 32 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [4 x float] containing the addend in the low
|
||||
/// 32 bits.
|
||||
/// \returns A 128-bit vector of [4 x float] containing the result in the low
|
||||
/// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
|
||||
static __inline__ __m128 __DEFAULT_FN_ATTRS128
|
||||
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
|
||||
}
|
||||
|
||||
/// Computes a scalar negated multiply-add of the double-precision values
|
||||
/// in the low 64 bits of 128-bit vectors of [2 x double].
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
|
||||
/// result[127:64] = __A[127:64]
|
||||
/// \endcode
|
||||
///
|
||||
/// \headerfile <immintrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
|
||||
///
|
||||
/// \param __A
|
||||
/// A 128-bit vector of [2 x double] containing the multiplicand in the low
|
||||
/// 64 bits.
|
||||
/// \param __B
|
||||
/// A 128-bit vector of [2 x double] containing the multiplier in the low
|
||||
/// 64 bits.
|
||||
/// \param __C
|
||||
/// A 128-bit vector of [2 x double] containing the addend in the low
|
||||
/// 64 bits.
|
||||
/// \returns A 128-bit vector of [2 x double] containing the result in the low
|
||||
/// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
|
||||
static __inline__ __m128d __DEFAULT_FN_ATTRS128
|
||||
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
|
||||
{
|
||||
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
|
||||
}

/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
///    For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
///    A 128-bit vector of [4 x float] containing the subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
///    For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
///    A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

/// Computes a scalar negated multiply-subtract of the single-precision
///    values in the low 32 bits of 128-bit vectors of [4 x float].
///
/// \code{.operation}
/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
/// result[127:32] = __A[127:32]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the multiplicand in the low
///    32 bits.
/// \param __B
///    A 128-bit vector of [4 x float] containing the multiplier in the low
///    32 bits.
/// \param __C
///    A 128-bit vector of [4 x float] containing the subtrahend in the low
///    32 bits.
/// \returns A 128-bit vector of [4 x float] containing the result in the low
///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}

/// Computes a scalar negated multiply-subtract of the double-precision
///    values in the low 64 bits of 128-bit vectors of [2 x double].
///
/// \code{.operation}
/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = __A[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the multiplicand in the low
///    64 bits.
/// \param __B
///    A 128-bit vector of [2 x double] containing the multiplier in the low
///    64 bits.
/// \param __C
///    A 128-bit vector of [2 x double] containing the subtrahend in the low
///    64 bits.
/// \returns A 128-bit vector of [2 x double] containing the result in the low
///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}

/// Computes a multiply with alternating add/subtract of 128-bit vectors of
///    [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}

/// Computes a multiply with alternating add/subtract of 128-bit vectors of
///    [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
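
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): the alternating pattern subtracts the third operand in
   even lanes and adds it in odd lanes:

     __m128d a = _mm_set_pd(3.0, 2.0);      // high = 3.0, low = 2.0
     __m128d b = _mm_set1_pd(10.0);
     __m128d c = _mm_set1_pd(1.0);
     __m128d r = _mm_fmaddsub_pd(a, b, c);  // {2*10 - 1, 3*10 + 1} = {19.0, 31.0}
*/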

/// Computes a multiply with alternating add/subtract of 128-bit vectors of
///    [4 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
///    A 128-bit vector of [4 x float] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [4 x float] containing the multiplier.
/// \param __C
///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
/// \returns A 128-bit vector of [4 x float] containing the result.
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}

/// Computes a multiply with alternating add/subtract of 128-bit vectors of
///    [2 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
///    A 128-bit vector of [2 x double] containing the multiplicand.
/// \param __B
///    A 128-bit vector of [2 x double] containing the multiplier.
/// \param __C
///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the result.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}

/// Computes a multiply-add of 256-bit vectors of [8 x float].
///    For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
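
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): a typical use is fusing y[i] = a[i]*b[i] + y[i] over
   float arrays, assuming n is a multiple of 8 and the pointers are 32-byte
   aligned:

     void fma_arrays(float *y, const float *a, const float *b, int n) {
       for (int i = 0; i < n; i += 8) {
         __m256 va = _mm256_load_ps(a + i);
         __m256 vb = _mm256_load_ps(b + i);
         __m256 vy = _mm256_load_ps(y + i);
         _mm256_store_ps(y + i, _mm256_fmadd_ps(va, vb, vy));
       }
     }
*/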

/// Computes a multiply-add of 256-bit vectors of [4 x double].
///    For each element, computes <c> (__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADD213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
///    For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
///    For each element, computes <c> (__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
///    For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
///    For each element, computes <c> -(__A * __B) + __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}

/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
///    For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
///    For each element, computes <c> -(__A * __B) - __C </c>.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

/// Computes a multiply with alternating add/subtract of 256-bit vectors of
///    [8 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}

/// Computes a multiply with alternating add/subtract of 256-bit vectors of
///    [4 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}

/// Computes a vector multiply with alternating add/subtract of 256-bit
///    vectors of [8 x float].
///
/// \code{.operation}
/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
///
/// \param __A
///    A 256-bit vector of [8 x float] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [8 x float] containing the multiplier.
/// \param __C
///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
/// \returns A 256-bit vector of [8 x float] containing the result.
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}

/// Computes a vector multiply with alternating add/subtract of 256-bit
///    vectors of [4 x double].
///
/// \code{.operation}
/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
///
/// \param __A
///    A 256-bit vector of [4 x double] containing the multiplicand.
/// \param __B
///    A 256-bit vector of [4 x double] containing the multiplier.
/// \param __C
///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
/// \returns A 256-bit vector of [4 x double] containing the result.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}

#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif /* __FMAINTRIN_H */
91
third_party/intel/clang/fxsrintrin.h
vendored
Normal file
@ -0,0 +1,91 @@
/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __FXSRINTRIN_H
#define __FXSRINTRIN_H

#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))

/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
///    memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
///
/// \param __p
///    A pointer to a 512-byte memory region. The beginning of this memory
///    region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave(void *__p)
{
  __builtin_ia32_fxsave(__p);
}

/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
///    memory region pointed to by the input parameter \a __p. The contents of
///    this memory region should have been written to by a previous \c _fxsave
///    or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
///
/// \param __p
///    A pointer to a 512-byte memory region. The beginning of this memory
///    region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor(void *__p)
{
  __builtin_ia32_fxrstor(__p);
}
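
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): saving and restoring the FP/SSE state around code that
   may clobber it. The 512-byte save area must be 16-byte aligned, which
   alignas (from <stdalign.h>, C11) guarantees here:

     alignas(16) static char fpu_state[512];

     void with_saved_fpu(void (*fn)(void)) {
       _fxsave(fpu_state);   // snapshot XMM/MMX/MXCSR/x87 state
       fn();
       _fxrstor(fpu_state);  // restore it exactly as saved
     }
*/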

#ifdef __x86_64__
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
///    memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
///
/// \param __p
///    A pointer to a 512-byte memory region. The beginning of this memory
///    region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave64(void *__p)
{
  __builtin_ia32_fxsave64(__p);
}

/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
///    memory region pointed to by the input parameter \a __p. The contents of
///    this memory region should have been written to by a previous \c _fxsave
///    or \c _fxsave64 intrinsic.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
///
/// \param __p
///    A pointer to a 512-byte memory region. The beginning of this memory
///    region should be aligned on a 16-byte boundary.
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p)
{
  __builtin_ia32_fxrstor64(__p);
}
#endif

#undef __DEFAULT_FN_ATTRS

#endif
211
third_party/intel/clang/gfniintrin.h
vendored
Normal file
@ -0,0 +1,211 @@
/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
 *
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __GFNIINTRIN_H
#define __GFNIINTRIN_H

/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("gfni,no-evex512"), __min_vector_width__(128)))

/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y                                                   \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx,gfni,no-evex512"),                           \
                 __min_vector_width__(256)))

/* Default attributes for ZMM unmasked forms. */
#define __DEFAULT_FN_ATTRS_Z                                                   \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512f,evex512,gfni"),                          \
                 __min_vector_width__(512)))
/* Default attributes for ZMM masked forms. */
#define __DEFAULT_FN_ATTRS_Z_MASK                                              \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512bw,evex512,gfni"),                         \
                 __min_vector_width__(512)))

/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128                                               \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"),             \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256                                               \
  __attribute__((__always_inline__, __nodebug__,                              \
                 __target__("avx512bw,avx512vl,gfni,no-evex512"),             \
                 __min_vector_width__(256)))

#define _mm_gf2p8affineinv_epi64_epi8(A, B, I)                                 \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),     \
                                                   (__v16qi)(__m128i)(B),     \
                                                   (char)(I)))

#define _mm_gf2p8affine_epi64_epi8(A, B, I)                                    \
  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),        \
                                                (__v16qi)(__m128i)(B),        \
                                                (char)(I)))

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
                                                   (__v16qi) __B);
}
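
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): GF2P8MULB multiplies each byte pair in GF(2^8) reduced
   by the AES polynomial x^8 + x^4 + x^3 + x + 1. For example, 0x53 and 0xCA
   are multiplicative inverses in that field:

     __m128i a = _mm_set1_epi8(0x53);
     __m128i b = _mm_set1_epi8((char)0xCA);
     __m128i r = _mm_gf2p8mul_epi8(a, b);  // every byte of r is 0x01
*/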

#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I)                              \
  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),     \
                                                   (__v32qi)(__m256i)(B),     \
                                                   (char)(I)))

#define _mm256_gf2p8affine_epi64_epi8(A, B, I)                                 \
  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),        \
                                                (__v32qi)(__m256i)(B),        \
                                                (char)(I)))

static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
                                                   (__v32qi) __B);
}
#endif /* __AVXINTRIN_H */

#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I)                              \
  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),     \
                                                   (__v64qi)(__m512i)(B),     \
                                                   (char)(I)))

#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I)                   \
  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                        \
         (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I),                  \
         (__v64qi)(__m512i)(S)))

#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I)                     \
  _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(),      \
         U, A, B, I)

#define _mm512_gf2p8affine_epi64_epi8(A, B, I)                                 \
  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),        \
                                                (__v64qi)(__m512i)(B),        \
                                                (char)(I)))

#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I)                      \
  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                        \
         (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)),               \
         (__v64qi)(__m512i)(S)))

#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I)                        \
  _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(),         \
         U, A, B, I)

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
                                                   (__v64qi) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
  return (__m512i) __builtin_ia32_selectb_512(__U,
        (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
        (__v64qi) __S);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
        __U, __A, __B);
}
#endif /* __AVX512BWINTRIN_H */

#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I)                      \
  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                        \
         (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                     \
         (__v16qi)(__m128i)(S)))

#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I)                        \
  _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),            \
         U, A, B, I)

#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I)                   \
  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                        \
         (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                  \
         (__v32qi)(__m256i)(S)))

#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I)                     \
  _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(),      \
         U, A, B, I)

#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I)                         \
  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                        \
         (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                        \
         (__v16qi)(__m128i)(S)))

#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I)                           \
  _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)

#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I)                      \
  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                        \
         (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                     \
         (__v32qi)(__m256i)(S)))

#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I)                        \
  _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),         \
         U, A, B, I)

static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
  return (__m128i) __builtin_ia32_selectb_128(__U,
        (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
        (__v16qi) __S);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
        __U, __A, __B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_selectb_256(__U,
        (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
        (__v32qi) __S);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
        __U, __A, __B);
}
#endif /* __AVX512VLBWINTRIN_H */

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_Y
#undef __DEFAULT_FN_ATTRS_Z
#undef __DEFAULT_FN_ATTRS_VL128
#undef __DEFAULT_FN_ATTRS_VL256

#endif /* __GFNIINTRIN_H */
49
third_party/intel/clang/hresetintrin.h
vendored
Normal file
@ -0,0 +1,49 @@
/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __X86GPRINTRIN_H
#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef __HRESETINTRIN_H
#define __HRESETINTRIN_H

#if __has_extension(gnu_asm)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("hreset")))

/// Provides a hint to the processor to selectively reset the prediction
///    history of the current logical processor specified by a 32-bit integer
///    value \a __eax.
///
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
///
/// \code{.operation}
/// IF __eax == 0
///   // nop
/// ELSE
///   FOR i := 0 to 31
///     IF __eax[i]
///       ResetPredictionFeature(i)
///     FI
///   ENDFOR
/// FI
/// \endcode
static __inline void __DEFAULT_FN_ATTRS
_hreset(int __eax)
{
  __asm__ ("hreset $0" :: "a"(__eax));
}

#undef __DEFAULT_FN_ATTRS

#endif /* __has_extension(gnu_asm) */

#endif /* __HRESETINTRIN_H */
863
third_party/intel/clang/ia32intrin.h
vendored
Normal file
@ -0,0 +1,863 @@
/* ===-------- ia32intrin.h ---------------------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86INTRIN_H
#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __IA32INTRIN_H
#define __IA32INTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/// Finds the first set bit starting from the least significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSF instruction or the
///    \c TZCNT instruction.
///
/// \param __A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see _bit_scan_forward
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfd(int __A) {
  return __builtin_ctz((unsigned int)__A);
}

/// Finds the first set bit starting from the most significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSR instruction or the
///    \c LZCNT instruction and an \c XOR.
///
/// \param __A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see _bit_scan_reverse
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrd(int __A) {
  return 31 - __builtin_clz((unsigned int)__A);
}

/// Swaps the bytes in the input, converting little endian to big endian or
///    vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the swapped bytes.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapd(int __A) {
  return (int)__builtin_bswap32((unsigned int)__A);
}

/// Swaps the bytes in the input, converting little endian to big endian or
///    vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the swapped bytes.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_bswap(int __A) {
  return (int)__builtin_bswap32((unsigned int)__A);
}

/// Finds the first set bit starting from the least significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _bit_scan_forward(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSF instruction or the
///    \c TZCNT instruction.
///
/// \param A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see __bsfd
#define _bit_scan_forward(A) __bsfd((A))

/// Finds the first set bit starting from the most significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _bit_scan_reverse(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSR instruction or the
///    \c LZCNT instruction and an \c XOR.
///
/// \param A
///    A 32-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
/// \see __bsrd
#define _bit_scan_reverse(A) __bsrd((A))
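
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): for 0x18 (binary 11000), the lowest set bit is bit 3
   and the highest is bit 4:

     int lo = _bit_scan_forward(0x18);  // 3
     int hi = _bit_scan_reverse(0x18);  // 4
*/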

#ifdef __x86_64__
/// Finds the first set bit starting from the least significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSF instruction or the
///    \c TZCNT instruction.
///
/// \param __A
///    A 64-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfq(long long __A) {
  return (long long)__builtin_ctzll((unsigned long long)__A);
}

/// Finds the first set bit starting from the most significant bit. The result
///    is undefined if the input is 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSR instruction or the
///    \c LZCNT instruction and an \c XOR.
///
/// \param __A
///    A 64-bit integer operand.
/// \returns A 32-bit integer containing the bit number.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrq(long long __A) {
  return 63 - __builtin_clzll((unsigned long long)__A);
}

/// Swaps the bytes in the input, converting little endian to big endian or
///    vice versa.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param __A
///    A 64-bit integer operand.
/// \returns A 64-bit integer containing the swapped bytes.
/// \see _bswap64
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapq(long long __A) {
  return (long long)__builtin_bswap64((unsigned long long)__A);
}

/// Swaps the bytes in the input, converting little endian to big endian or
///    vice versa.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _bswap64(long long A);
/// \endcode
///
/// This intrinsic corresponds to the \c BSWAP instruction.
///
/// \param A
///    A 64-bit integer operand.
/// \returns A 64-bit integer containing the swapped bytes.
/// \see __bswapq
#define _bswap64(A) __bswapq((A))
#endif /* __x86_64__ */
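
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): byte swaps mirror a value's memory layout between
   little and big endian (the 64-bit form is x86-64 only):

     int       x = _bswap(0x12345678);              // 0x78563412
     long long y = _bswap64(0x1122334455667788LL);  // 0x8877665544332211
*/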

/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
///    sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
///    An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
///    source operand.
/// \see _popcnt32
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__popcntd(unsigned int __A)
{
  return __builtin_popcount(__A);
}

/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _popcnt32(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
///    sequence of arithmetic and logic operations to calculate it.
///
/// \param A
///    An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
///    source operand.
/// \see __popcntd
#define _popcnt32(A) __popcntd((A))

#ifdef __x86_64__
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
///    sequence of arithmetic and logic operations to calculate it.
///
/// \param __A
///    An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
///    source operand.
/// \see _popcnt64
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__popcntq(unsigned long long __A)
{
  return __builtin_popcountll(__A);
}

/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _popcnt64(unsigned long long A);
/// \endcode
///
/// This intrinsic corresponds to the \c POPCNT instruction or a
///    sequence of arithmetic and logic operations to calculate it.
///
/// \param A
///    An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
///    source operand.
/// \see __popcntq
#define _popcnt64(A) __popcntq((A))
#endif /* __x86_64__ */
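
/* Usage sketch (an illustration added for this document, not part of the
   upstream header):

     int       n = _popcnt32(0xF0F0);  // 8 bits set
     long long m = _popcnt64(~0ULL);   // 64 (x86-64 only)
*/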

#ifdef __x86_64__
/// Returns the program status-and-control \c RFLAGS register with the \c VM
///    and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSHFQ + \c POP instruction sequence.
///
/// \returns The 64-bit value of the RFLAGS register.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__readeflags(void)
{
  return __builtin_ia32_readeflags_u64();
}

/// Writes the specified value to the program status-and-control \c RFLAGS
///    register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSH + \c POPFQ instruction sequence.
///
/// \param __f
///    The 64-bit value to write to \c RFLAGS.
static __inline__ void __DEFAULT_FN_ATTRS
__writeeflags(unsigned long long __f)
{
  __builtin_ia32_writeeflags_u64(__f);
}

#else /* !__x86_64__ */
/// Returns the program status-and-control \c EFLAGS register with the \c VM
///    and \c RF flags cleared.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSHFD + \c POP instruction sequence.
///
/// \returns The 32-bit value of the EFLAGS register.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__readeflags(void)
{
  return __builtin_ia32_readeflags_u32();
}

/// Writes the specified value to the program status-and-control \c EFLAGS
///    register. Reserved bits are not affected.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PUSH + \c POPFD instruction sequence.
///
/// \param __f
///    The 32-bit value to write to \c EFLAGS.
static __inline__ void __DEFAULT_FN_ATTRS
__writeeflags(unsigned int __f)
{
  __builtin_ia32_writeeflags_u32(__f);
}
#endif /* !__x86_64__ */

/// Casts a 32-bit float value to a 32-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVD / \c MOVD instruction in x86_64,
///    and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
///
/// \param __A
///    A 32-bit float value.
/// \returns A 32-bit unsigned integer containing the converted value.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
_castf32_u32(float __A) {
  return __builtin_bit_cast(unsigned int, __A);
}

/// Casts a 64-bit float value to a 64-bit unsigned integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
///    and corresponds to the \c VMOVL / \c MOVL instruction in ia32.
///
/// \param __A
///    A 64-bit float value.
/// \returns A 64-bit unsigned integer containing the converted value.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
_castf64_u64(double __A) {
  return __builtin_bit_cast(unsigned long long, __A);
}

/// Casts a 32-bit unsigned integer value to a 32-bit float value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
///    and corresponds to the \c FLDS instruction in ia32.
///
/// \param __A
///    A 32-bit unsigned integer value.
/// \returns A 32-bit float value containing the converted value.
static __inline__ float __DEFAULT_FN_ATTRS_CAST
_castu32_f32(unsigned int __A) {
  return __builtin_bit_cast(float, __A);
}

/// Casts a 64-bit unsigned integer value to a 64-bit float value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VMOVQ / \c MOVQ instruction in x86_64,
///    and corresponds to the \c FLDL instruction in ia32.
///
/// \param __A
///    A 64-bit unsigned integer value.
/// \returns A 64-bit float value containing the converted value.
static __inline__ double __DEFAULT_FN_ATTRS_CAST
_castu64_f64(unsigned long long __A) {
  return __builtin_bit_cast(double, __A);
}
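
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): these casts reinterpret bits rather than converting
   values, e.g. the IEEE-754 encoding of 1.0f:

     unsigned int bits = _castf32_u32(1.0f);        // 0x3F800000
     float        one  = _castu32_f32(0x3F800000u); // 1.0f
*/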

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32B instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32b(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32W instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32w(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32D instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32d(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c CRC32Q instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
__crc32q(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
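
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): one common CRC-32C convention seeds with all ones and
   complements the final value; the byte form processes a buffer like so:

     #include <stddef.h>
     unsigned int crc32c(const unsigned char *p, size_t n) {
       unsigned int crc = 0xFFFFFFFFu;
       for (size_t i = 0; i < n; ++i)
         crc = __crc32b(crc, p[i]);
       return ~crc;
     }
*/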

/// Reads the specified performance-monitoring counter. Refer to your
///    processor's documentation to determine which performance counters are
///    supported.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c RDPMC instruction.
///
/// \param __A
///    The performance counter to read.
/// \returns The 64-bit value read from the performance counter.
/// \see _rdpmc
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpmc(int __A) {
  return __builtin_ia32_rdpmc(__A);
}

/// Reads the processor's time-stamp counter and the \c IA32_TSC_AUX MSR
///    \c (0xc0000103).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c RDTSCP instruction.
///
/// \param __A
///    The address of where to store the 32-bit \c IA32_TSC_AUX value.
/// \returns The 64-bit value of the time-stamp counter.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdtscp(unsigned int *__A) {
  return __builtin_ia32_rdtscp(__A);
}

/// Reads the processor's time-stamp counter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _rdtsc();
/// \endcode
///
/// This intrinsic corresponds to the \c RDTSC instruction.
///
/// \returns The 64-bit value of the time-stamp counter.
#define _rdtsc() __rdtsc()
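
/* Usage sketch (an illustration added for this document, not part of the
   upstream header): a coarse cycle counter. RDTSC is not a serializing
   instruction, so careful measurements usually add fencing or use
   __rdtscp; do_work() is a hypothetical workload:

     unsigned long long t0 = _rdtsc();
     do_work();
     unsigned long long cycles = _rdtsc() - t0;
*/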

/// Reads the specified performance monitoring counter. Refer to your
///    processor's documentation to determine which performance counters are
///    supported.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _rdpmc(int A);
/// \endcode
///
/// This intrinsic corresponds to the \c RDPMC instruction.
///
/// \param A
///    The performance counter to read.
/// \returns The 64-bit value read from the performance counter.
/// \see __rdpmc
#define _rdpmc(A) __rdpmc(A)

static __inline__ void __DEFAULT_FN_ATTRS
_wbinvd(void) {
  __builtin_ia32_wbinvd();
}

/// Rotates an 8-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
///    The unsigned 8-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
__rolb(unsigned char __X, int __C) {
  return __builtin_rotateleft8(__X, __C);
}

/// Rotates an 8-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
///    The unsigned 8-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
__rorb(unsigned char __X, int __C) {
  return __builtin_rotateright8(__X, __C);
}

/// Rotates a 16-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
///    The unsigned 16-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotwl
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
__rolw(unsigned short __X, int __C) {
  return __builtin_rotateleft16(__X, __C);
}

/// Rotates a 16-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
///    The unsigned 16-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotwr
static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
__rorw(unsigned short __X, int __C) {
  return __builtin_rotateright16(__X, __C);
}

/// Rotates a 32-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param __X
///    The unsigned 32-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotl
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rold(unsigned int __X, int __C) {
  return __builtin_rotateleft32(__X, (unsigned int)__C);
}

/// Rotates a 32-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param __X
///    The unsigned 32-bit value to be rotated.
/// \param __C
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see _rotr
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rord(unsigned int __X, int __C) {
  return __builtin_rotateright32(__X, (unsigned int)__C);
}
|
||||
|
||||
#ifdef __x86_64__
|
||||
/// Rotates a 64-bit value to the left by the specified number of bits.
|
||||
/// This operation is undefined if the number of bits exceeds the size of
|
||||
/// the value.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c ROL instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// The unsigned 64-bit value to be rotated.
|
||||
/// \param __C
|
||||
/// The number of bits to rotate the value.
|
||||
/// \returns The rotated value.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rolq(unsigned long long __X, int __C) {
|
||||
return __builtin_rotateleft64(__X, (unsigned long long)__C);
|
||||
}
|
||||
|
||||
/// Rotates a 64-bit value to the right by the specified number of bits.
|
||||
/// This operation is undefined if the number of bits exceeds the size of
|
||||
/// the value.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the \c ROR instruction.
|
||||
///
|
||||
/// \param __X
|
||||
/// The unsigned 64-bit value to be rotated.
|
||||
/// \param __C
|
||||
/// The number of bits to rotate the value.
|
||||
/// \returns The rotated value.
|
||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
|
||||
__rorq(unsigned long long __X, int __C) {
|
||||
return __builtin_rotateright64(__X, (unsigned long long)__C);
|
||||
}
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
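A quick runtime sanity check of the rotate helpers above (an illustration,
not part of the vendored file):

  #include <x86intrin.h>
  #include <assert.h>

  int main(void) {
    assert(__rolb(0x81, 1) == 0x03);       /* 10000001 -> 00000011 */
    assert(__rorw(0x0001, 4) == 0x1000);   /* bit 0 wraps to bit 12 */
    assert(__rold(0x80000000u, 1) == 1u);  /* high bit wraps to low */
    return 0;
  }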
#ifndef _MSC_VER
/* These are already provided as builtins for MSVC. */
/* Select the correct function based on the size of long. */
#ifdef __LP64__
/// Rotates a 64-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _lrotl(unsigned long long a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
///    The unsigned 64-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rolq
#define _lrotl(a,b) __rolq((a), (b))

/// Rotates a 64-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned long long _lrotr(unsigned long long a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
///    The unsigned 64-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rorq
#define _lrotr(a,b) __rorq((a), (b))
#else // __LP64__
/// Rotates a 32-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _lrotl(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
///    The unsigned 32-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rold
#define _lrotl(a,b) __rold((a), (b))

/// Rotates a 32-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _lrotr(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
///    The unsigned 32-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rord
#define _lrotr(a,b) __rord((a), (b))
#endif // __LP64__

/// Rotates a 32-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _rotl(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
///    The unsigned 32-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rold
#define _rotl(a,b) __rold((a), (b))

/// Rotates a 32-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned int _rotr(unsigned int a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
///    The unsigned 32-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rord
#define _rotr(a,b) __rord((a), (b))
#endif // _MSC_VER

/* These are not builtins so need to be provided in all modes. */
/// Rotates a 16-bit value to the left by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _rotwl(unsigned short a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROL instruction.
///
/// \param a
///    The unsigned 16-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rolw
#define _rotwl(a,b) __rolw((a), (b))

/// Rotates a 16-bit value to the right by the specified number of bits.
///    This operation is undefined if the number of bits exceeds the size of
///    the value.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// unsigned short _rotwr(unsigned short a, int b);
/// \endcode
///
/// This intrinsic corresponds to the \c ROR instruction.
///
/// \param a
///    The unsigned 16-bit value to be rotated.
/// \param b
///    The number of bits to rotate the value.
/// \returns The rotated value.
/// \see __rorw
#define _rotwr(a,b) __rorw((a), (b))

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CAST
#undef __DEFAULT_FN_ATTRS_CRC32
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __IA32INTRIN_H */
747
third_party/intel/clang/immintrin.h
vendored
Normal file

@ -0,0 +1,747 @@
/*===---- immintrin.h - Intel intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#define __IMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include "x86gprintrin.h"

#if !defined(__SCE__) || __has_feature(modules) || defined(__MMX__)
#include "mmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE__)
#include "xmmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE2__)
#include "emmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SSE3__)
#include "pmmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SSSE3__)
#include "tmmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__SSE4_2__) || defined(__SSE4_1__))
#include "smmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AES__) || defined(__PCLMUL__))
#include "wmmintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__CLFLUSHOPT__)
#include "clflushoptintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__CLWB__)
#include "clwbintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX__)
#include "avxintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX2__)
#include "avx2intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__F16C__)
#include "f16cintrin.h"
#endif

/* No feature check desired due to internal checks */
#include "bmiintrin.h"

#if !defined(__SCE__) || __has_feature(modules) || defined(__BMI2__)
#include "bmi2intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__LZCNT__)
#include "lzcntintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__POPCNT__)
#include "popcntintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__FMA__)
#include "fmaintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512F__)
#include "avx512fintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VL__)
#include "avx512vlintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BW__)
#include "avx512bwintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BITALG__)
#include "avx512bitalgintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512CD__)
#include "avx512cdintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
#include "avx512vpopcntdqintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
#include "avx512vpopcntdqvlintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VNNI__)
#include "avx512vnniintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
#include "avx512vlvnniintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNI__)
#include "avxvnniintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512DQ__)
#include "avx512dqintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
#include "avx512vlbitalgintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BW__))
#include "avx512vlbwintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512CD__))
#include "avx512vlcdintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512DQ__))
#include "avx512vldqintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
#include "avx512ifmaintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
#include "avx512ifmavlintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXIFMA__)
#include "avxifmaintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI__)
#include "avx512vbmiintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
#include "avx512vbmivlintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512VBMI2__)
#include "avx512vbmi2intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
#include "avx512vlvbmi2intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
#include "avx512fp16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512FP16__))
#include "avx512vlfp16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512BF16__)
#include "avx512bf16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512BF16__))
#include "avx512vlbf16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__PKU__)
#include "pkuintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#include "vpclmulqdqintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__VAES__)
#include "vaesintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__GFNI__)
#include "gfniintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT8__)
#include "avxvnniint8intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXNECONVERT__)
#include "avxneconvertintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA512__)
#include "sha512intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SM3__)
#include "sm3intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SM4__)
#include "sm4intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AVXVNNIINT16__)
#include "avxvnniint16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__RDPID__)
/// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDPID </c> instruction.
///
/// \returns The 32-bit contents of the MSR.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
_rdpid_u32(void) {
  return __builtin_ia32_rdpid();
}
#endif // __RDPID__
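A sketch of one common use (not in the header): on Linux, IA32_TSC_AUX
conventionally packs the CPU number into the low 12 bits and the NUMA node
above it, so _rdpid_u32 doubles as a cheap processor-id read. That layout is
an assumption of this example, not something the intrinsic guarantees.

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {                        /* build with -mrdpid */
    unsigned v = _rdpid_u32();
    printf("cpu=%u node=%u\n", v & 0xfff, (v >> 12) & 0xfff);
    return 0;
  }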
#if !defined(__SCE__) || __has_feature(modules) || defined(__RDRND__)
/// Returns a 16-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    A pointer to a 16-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
  return (int)__builtin_ia32_rdrand16_step(__p);
}

/// Returns a 32-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    A pointer to a 32-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
  return (int)__builtin_ia32_rdrand32_step(__p);
}

/// Returns a 64-bit hardware-generated random value.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
///
/// \param __p
///    A pointer to a 64-bit memory location to place the random value.
/// \returns 1 if the value was successfully generated, 0 otherwise.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
#ifdef __x86_64__
  return (int)__builtin_ia32_rdrand64_step(__p);
#else
  // We need to emulate the functionality of 64-bit rdrand with 2 32-bit
  // rdrand instructions.
  unsigned int __lo, __hi;
  unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
  unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
  if (__res_lo && __res_hi) {
    *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
    return 1;
  } else {
    *__p = 0;
    return 0;
  }
#endif
}
#endif /* __RDRND__ */
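A usage sketch (not in the header): RDRAND can transiently fail when the
on-chip DRNG is drained, so callers check the status return and retry; the
bound of ten attempts follows Intel's DRNG software implementation guide.

  #include <immintrin.h>

  /* Returns 1 and stores a random 64-bit value on success, 0 on failure.
     Build with -mrdrnd. */
  static int rand_u64(unsigned long long *out) {
    for (int i = 0; i < 10; ++i)
      if (_rdrand64_step(out))
        return 1;
    return 0;
  }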
#if !defined(__SCE__) || __has_feature(modules) || defined(__FSGSBASE__)
#ifdef __x86_64__
/// Reads the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The lower 32 bits of the FS base register.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u32(void)
{
  return __builtin_ia32_rdfsbase32();
}

/// Reads the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
///
/// \returns The contents of the FS base register.
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readfsbase_u64(void)
{
  return __builtin_ia32_rdfsbase64();
}

/// Reads the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The lower 32 bits of the GS base register.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u32(void)
{
  return __builtin_ia32_rdgsbase32();
}

/// Reads the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
///
/// \returns The contents of the GS base register.
static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_readgsbase_u64(void)
{
  return __builtin_ia32_rdgsbase64();
}

/// Modifies the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
///    Value to use for the lower 32 bits of the FS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V)
{
  __builtin_ia32_wrfsbase32(__V);
}

/// Modifies the FS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
///
/// \param __V
///    Value to use for the FS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V)
{
  __builtin_ia32_wrfsbase64(__V);
}

/// Modifies the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
///    Value to use for the lower 32 bits of the GS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V)
{
  __builtin_ia32_wrgsbase32(__V);
}

/// Modifies the GS base register.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
///
/// \param __V
///    Value to use for the GS base register.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
  __builtin_ia32_wrgsbase64(__V);
}

#endif
#endif /* __FSGSBASE__ */
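A hedged sketch (not in the header): these are user-mode instructions, but
they fault with #UD unless the CPU supports FSGSBASE and the OS has set
CR4.FSGSBASE (Linux enables this from kernel 5.9 on supporting hardware).

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {                        /* build with -mfsgsbase */
    unsigned long long gs = _readgsbase_u64();
    printf("gsbase = %#llx\n", gs);
    return 0;
  }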
#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVBE__)

/* The structs used below are to force the load/store to be unaligned. This
 * is accomplished with the __packed__ attribute. The __may_alias__ prevents
 * tbaa metadata from being generated based on the struct and the type of the
 * field inside of it.
 */

/// Load a 16-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 16-bit value to load.
/// \returns The byte-swapped value.
static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
  struct __loadu_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}

/// Swap the bytes of a 16-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 16-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
  struct __storeu_i16 {
    unsigned short __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}

/// Load a 32-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 32-bit value to load.
/// \returns The byte-swapped value.
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
  struct __loadu_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}

/// Swap the bytes of a 32-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 32-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
  struct __storeu_i32 {
    unsigned int __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}

#ifdef __x86_64__
/// Load a 64-bit value from memory and swap its bytes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the 64-bit value to load.
/// \returns The byte-swapped value.
static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
  struct __loadu_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}

/// Swap the bytes of a 64-bit value and store it to memory.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the MOVBE instruction.
///
/// \param __P
///    A pointer to the memory for storing the swapped value.
/// \param __D
///    The 64-bit value to be byte-swapped.
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
  struct __storeu_i64 {
    unsigned long long __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
#endif
#endif /* __MOVBE__ */
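A short sketch (not in the header): the MOVBE helpers combine an unaligned
access with a byte swap, which maps directly onto parsing big-endian fields
out of a network packet buffer.

  #include <immintrin.h>

  /* Read a big-endian 32-bit length field at an arbitrary, possibly
     unaligned offset inside a packet. Build with -mmovbe. */
  static unsigned parse_be32(const unsigned char *pkt, int off) {
    return (unsigned)_loadbe_i32(pkt + off);
  }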
#if !defined(__SCE__) || __has_feature(modules) || defined(__RTM__)
#include "rtmintrin.h"
#include "xtestintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SHA__)
#include "shaintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__FXSR__)
#include "fxsrintrin.h"
#endif

/* No feature check desired due to internal MSC_VER checks */
#include "xsaveintrin.h"

#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEOPT__)
#include "xsaveoptintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVEC__)
#include "xsavecintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__XSAVES__)
#include "xsavesintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SHSTK__)
#include "cetintrin.h"
#endif

/* Intrinsics inside adcintrin.h are available at all times. */
#include "adcintrin.h"

#if !defined(__SCE__) || __has_feature(modules) || defined(__ADX__)
#include "adxintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__RDSEED__)
#include "rdseedintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__WBNOINVD__)
#include "wbnoinvdintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__CLDEMOTE__)
#include "cldemoteintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__WAITPKG__)
#include "waitpkgintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__MOVDIRI__) || \
    defined(__MOVDIR64B__)
#include "movdirintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__PCONFIG__)
#include "pconfigintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SGX__)
#include "sgxintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__PTWRITE__)
#include "ptwriteintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__INVPCID__)
#include "invpcidintrin.h"
#endif
#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP16__)
#include "amxfp16intrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__) || \
    defined(__WIDEKL__)
#include "keylockerintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TILE__) || \
    defined(__AMX_INT8__) || defined(__AMX_BF16__)
#include "amxintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_COMPLEX__)
#include "amxcomplexintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    defined(__AVX512VP2INTERSECT__)
#include "avx512vp2intersectintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || \
    (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
#include "avx512vlvp2intersectintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__ENQCMD__)
#include "enqcmdintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__SERIALIZE__)
#include "serializeintrin.h"
#endif

#if !defined(__SCE__) || __has_feature(modules) || defined(__TSXLDTRK__)
#include "tsxldtrkintrin.h"
#endif

#if defined(_MSC_VER) && __has_extension(gnu_asm)
/* Define the default attributes for these intrinsics */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                       : "+r" (_Value), "+m" (*_Target) :: "memory");
  return _Value;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
                                       long _Exchange, long _Comparand) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
                                       long _Exchange, long _Comparand) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
                                         __int64 _Exchange, __int64 _Comparand) {
  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
                                         __int64 _Exchange, __int64 _Comparand) {
  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                       : "+a" (_Comparand), "+m" (*_Destination)
                       : "r" (_Exchange) : "memory");
  return _Comparand;
}
#endif
#ifdef __cplusplus
}
#endif

#undef __DEFAULT_FN_ATTRS

#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
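An illustrative sketch (not in the header): in MSVC mode these HLE variants
prefix XCHG/CMPXCHG with the XACQUIRE/XRELEASE hints (the 0xf2/0xf3 bytes
above), which lets a conventional spinlock elide the lock on TSX hardware.
The _mm_pause call is an assumption here, coming from the SSE2 headers
included earlier in this file.

  static volatile long lock_word;        /* 0 = free, 1 = held */

  static void lock_acquire(void) {
    while (_InterlockedExchange_HLEAcquire(&lock_word, 1) != 0)
      _mm_pause();                       /* spin until the lock looks free */
  }

  static void lock_release(void) {
    /* release the same elided location with the XRELEASE-hinted store */
    _InterlockedExchange_HLERelease(&lock_word, 0);
  }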

#endif /* __IMMINTRIN_H */
23
third_party/intel/clang/invpcidintrin.h
vendored
Normal file

@ -0,0 +1,23 @@
/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __INVPCIDINTRIN_H
#define __INVPCIDINTRIN_H

static __inline__ void
  __attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
_invpcid(unsigned int __type, void *__descriptor) {
  __builtin_ia32_invpcid(__type, __descriptor);
}

#endif /* __INVPCIDINTRIN_H */
527
third_party/intel/clang/keylockerintrin.h
vendored
Normal file

@ -0,0 +1,527 @@
/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _KEYLOCKERINTRIN_H
#define _KEYLOCKERINTRIN_H

#if !defined(__SCE__) || __has_feature(modules) || defined(__KL__)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
                 __min_vector_width__(128)))

/// Load the internal wrapping key from __intkey, __enkey_lo and __enkey_hi.
/// __ctl is assigned to EAX, which specifies the KeySource and whether backing
/// up the key is permitted. The 256-bit encryption key is loaded from the two
/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is
/// loaded from the implicit operand XMM0, which is assigned by __intkey.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
///
/// \code{.operation}
/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
///   GP (0)
/// FI
/// IF “LOADIWKEY exiting” VM execution control set
///   VMexit
/// FI
/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
///   GP (0)
/// FI
/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
///   GP (0)
/// FI
/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
///   GP (0)
/// FI
/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
///   GP (0)
/// FI
/// IF (__ctl[4:1] == 0) // KeySource of 0.
///   IWKey.Encryption Key[127:0] := __enkey_hi[127:0]:
///   IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
///   IWKey.IntegrityKey[127:0] := __intkey[127:0]
///   IWKey.NoBackup := __ctl[0]
///   IWKey.KeySource := __ctl[4:1]
///   ZF := 0
/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
///   IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
///     IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
///     IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
///     IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
///     IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
///     IWKey.NoBackup := __ctl[0]
///     IWKey.KeySource := __ctl[4:1]
///     ZF := 0
///   ELSE // Random data was not returned from RDSEED. IWKey was not loaded
///     ZF := 1
///   FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
               __m128i __enkey_lo, __m128i __enkey_hi) {
  __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
}

/// Wrap a 128-bit AES key from __key into a key handle, output the handle in
/// ((__m128i*)__h) to ((__m128i*)__h) + 2, and return a 32-bit value.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
///
/// \code{.operation}
/// InputKey[127:0] := __key[127:0]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
/// KeyMetadata[127:28] := 0 // Reserved for future usage
/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
/// dst[0] := IWKey.NoBackup
/// dst[4:1] := IWKey.KeySource[3:0]
/// dst[31:5] := 0
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
/// OF := 0
/// SF := 0
/// ZF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
  return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
}

/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, output
/// the handle in ((__m128i*)__h) to ((__m128i*)__h) + 3, and return a 32-bit
/// value.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
///
/// \code{.operation}
/// InputKey[127:0] := __key_lo[127:0]
/// InputKey[255:128] := __key_hi[255:128]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
/// KeyMetadata[127:28] := 0 // Reserved for future usage
/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
///                  IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
/// dst[0] := IWKey.NoBackup
/// dst[4:1] := IWKey.KeySource[3:0]
/// dst[31:5] := 0
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
/// OF := 0
/// SF := 0
/// ZF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
                     void *__h) {
  return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
                                         (__v2di)__key_hi, __h);
}

/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using
/// the 128-bit key in the handle from the __h. It stores the result in the
/// __odata and returns the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
///
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
///                    (Handle[127:0] AND (CPL > 0)) ||
///                    Handle[383:256] ||
///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
///   ZF := 1
/// ELSE
///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
///   IF (Authentic == 0)
///     ZF := 1
///   ELSE
///     MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
///     ZF := 0
///   FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
  return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
}
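A hedged usage sketch (not part of the header): the operating system loads
the wrapping key once via _mm_loadiwkey at ring 0; user code then converts a
raw AES-128 key into a 384-bit handle and encrypts through the handle, after
which the raw key can be scrubbed from memory. The 48-byte buffer and the
htype value of 0 (no restrictions) follow the pseudocode above.

  #include <immintrin.h>
  #include <string.h>

  /* Build with -mkl. Returns 1 on success, 0 if the handle was rejected. */
  static int kl_encrypt_block(__m128i raw_key, __m128i *block) {
    unsigned char handle[48];                  /* 384-bit key handle */
    _mm_encodekey128_u32(0, raw_key, handle);
    memset(&raw_key, 0, sizeof raw_key);       /* raw key no longer needed */
    return _mm_aesenc128kl_u8(block, *block, handle) == 0;  /* ZF==0: ok */
  }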
/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using
|
||||
/// the 256-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endcode
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using
|
||||
/// the 128-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
|
||||
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[383:256] ||
|
||||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endcode
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using
|
||||
/// the 256-bit key in the handle from the __h. It stores the result in the
|
||||
/// __odata. And return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// Handle[511:0] := MEM[__h+511:__h]
|
||||
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[383:256] ||
|
||||
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
|
||||
/// IF (Authentic == 0)
|
||||
/// ZF := 1
|
||||
/// MEM[__odata+127:__odata] := 0
|
||||
/// ELSE
|
||||
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endcode
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
|
||||
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
|
||||
}
|
||||
|
||||
#undef __DEFAULT_FN_ATTRS
|
||||
|
||||
#endif /* !defined(__SCE__ || __has_feature(modules) || defined(__KL__) */
|
||||
|
||||
#if !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__)
|
||||
|
||||
/* Define the default attributes for the functions in this file. */
|
||||
#define __DEFAULT_FN_ATTRS \
|
||||
__attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
|
||||
__min_vector_width__(128)))
|
||||
|
||||
/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle
|
||||
/// at __h and store each resultant block back from __odata to __odata+7. And
|
||||
/// return the affected ZF flag status.
|
||||
///
|
||||
/// \headerfile <x86intrin.h>
|
||||
///
|
||||
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
|
||||
///
|
||||
/// \code{.operation}
|
||||
/// Handle := MEM[__h+383:__h]
|
||||
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
|
||||
/// (Handle[127:0] AND (CPL > 0)) ||
|
||||
/// Handle[255:128] ||
|
||||
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
|
||||
/// IF (IllegalHandle)
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
|
||||
/// IF Authentic == 0
|
||||
/// ZF := 1
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := 0
|
||||
/// ENDFOR
|
||||
/// ELSE
|
||||
/// FOR i := 0 to 7
|
||||
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
|
||||
/// ENDFOR
|
||||
/// ZF := 0
|
||||
/// FI
|
||||
/// FI
|
||||
/// dst := ZF
|
||||
/// OF := 0
|
||||
/// SF := 0
|
||||
/// AF := 0
|
||||
/// PF := 0
|
||||
/// CF := 0
|
||||
/// \endcode
|
||||
static __inline__ unsigned char __DEFAULT_FN_ATTRS
|
||||
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
|
||||
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
|
||||
                                           (const __v2di *)__idata, __h);
}

/// Encrypt __idata[0] to __idata[7] using the 256-bit AES key indicated by
/// the handle at __h, store each resultant block to __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
///                    (Handle[127:0] AND (CPL > 0)) ||
///                    Handle[255:128] ||
///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 )
/// IF (IllegalHandle)
///   ZF := 1
///   FOR i := 0 to 7
///     __odata[i] := 0
///   ENDFOR
/// ELSE
///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
///   IF Authentic == 0
///     ZF := 1
///     FOR i := 0 to 7
///       __odata[i] := 0
///     ENDFOR
///   ELSE
///     FOR i := 0 to 7
///       __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
///     ENDFOR
///     ZF := 0
///   FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
  return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
                                           (const __v2di *)__idata, __h);
}

/// Decrypt __idata[0] to __idata[7] using the 128-bit AES key indicated by
/// the handle at __h, store each resultant block to __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
///
/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
///                    (Handle[127:0] AND (CPL > 0)) ||
///                    Handle[255:128] ||
///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
///   ZF := 1
///   FOR i := 0 to 7
///     __odata[i] := 0
///   ENDFOR
/// ELSE
///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
///   IF Authentic == 0
///     ZF := 1
///     FOR i := 0 to 7
///       __odata[i] := 0
///     ENDFOR
///   ELSE
///     FOR i := 0 to 7
///       __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
///     ENDFOR
///     ZF := 0
///   FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
  return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
                                           (const __v2di *)__idata, __h);
}

/// Decrypt __idata[0] to __idata[7] using the 256-bit AES key indicated by
/// the handle at __h, store each resultant block to __odata[0] through
/// __odata[7], and return the affected ZF flag status.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
///
/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
///                    (Handle[127:0] AND (CPL > 0)) ||
///                    Handle[255:128] ||
///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 )
/// IF (IllegalHandle)
///   ZF := 1
///   FOR i := 0 to 7
///     __odata[i] := 0
///   ENDFOR
/// ELSE
///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
///   IF Authentic == 0
///     ZF := 1
///     FOR i := 0 to 7
///       __odata[i] := 0
///     ENDFOR
///   ELSE
///     FOR i := 0 to 7
///       __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
///     ENDFOR
///     ZF := 0
///   FI
/// FI
/// dst := ZF
/// OF := 0
/// SF := 0
/// AF := 0
/// PF := 0
/// CF := 0
/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
  return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
                                           (const __v2di *)__idata, __h);
}

#undef __DEFAULT_FN_ATTRS

#endif /* !defined(__SCE__) || __has_feature(modules) || defined(__WIDEKL__) \
        */

#endif /* _KEYLOCKERINTRIN_H */
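For reference, a minimal caller sketch (illustrative, not part of this change) of the wide Key Locker pattern these intrinsics document. The encrypt_blocks helper name is hypothetical; it checks the returned ZF status, which is nonzero when the handle is illegal or fails authentication. Requires compiling with Key Locker support (e.g. -mwidekl).

#include <immintrin.h>

/* Hypothetical helper: encrypts 8 blocks in place using a previously
   wrapped 256-bit key handle; returns 0 on success, -1 on a bad handle. */
static int encrypt_blocks(__m128i blocks[8], const void *handle) {
  __m128i out[8];
  if (_mm_aesencwide256kl_u8(out, blocks, handle)) /* nonzero => failure */
    return -1;
  for (int i = 0; i < 8; i++)
    blocks[i] = out[i];
  return 0;
}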
136 third_party/intel/clang/lwpintrin.h vendored Normal file
@@ -0,0 +1,136 @@
/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86INTRIN_H
#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __LWPINTRIN_H
#define __LWPINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))

/// Parses the LWPCB at the specified address and enables
/// profiling if valid.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
///
/// \param __addr
///    Address to the new Lightweight Profiling Control Block (LWPCB). If the
///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
///    Lightweight Profiling.
static __inline__ void __DEFAULT_FN_ATTRS
__llwpcb (void *__addr)
{
  __builtin_ia32_llwpcb(__addr);
}

/// Flushes the LWP state to memory and returns the address of the LWPCB.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
///
/// \return
///    Address to the current Lightweight Profiling Control Block (LWPCB).
///    If LWP is not currently enabled, returns NULL.
static __inline__ void* __DEFAULT_FN_ATTRS
__slwpcb (void)
{
  return __builtin_ia32_slwpcb();
}

/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
///    A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
///    A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
///    the event record overwrites the last record in the buffer, the MissedEvents
///    counter in the LWPCB is incremented, the head pointer is not advanced, and
///    1 is returned. Otherwise 0 is returned.
#define __lwpins32(DATA2, DATA1, FLAGS) \
  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
/// \param DATA1
///    A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
///    A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval32(DATA2, DATA1, FLAGS) \
  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

#ifdef __x86_64__

/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
///
/// \param DATA2
///    A 64-bit value is inserted into the 64-bit Data2 field.
/// \param DATA1
///    A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
///    A 32-bit immediate value is inserted into the 32-bit Flags field.
/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
///    the event record overwrites the last record in the buffer, the MissedEvents
///    counter in the LWPCB is incremented, the head pointer is not advanced, and
///    1 is returned. Otherwise 0 is returned.
#define __lwpins64(DATA2, DATA1, FLAGS) \
  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
///
/// \param DATA2
///    A 64-bit value is inserted into the 64-bit Data2 field.
/// \param DATA1
///    A 32-bit value is inserted into the 32-bit Data1 field.
/// \param FLAGS
///    A 32-bit immediate value is inserted into the 32-bit Flags field.
#define __lwpval64(DATA2, DATA1, FLAGS) \
  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

#endif

#undef __DEFAULT_FN_ATTRS

#endif /* __LWPINTRIN_H */
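A usage sketch (illustrative only; assumes LWP hardware support, -mlwp, and a valid LWPCB already installed elsewhere via __llwpcb). The log_event helper name is hypothetical; it records one custom event and reports whether the ring buffer was full.

#include <x86intrin.h>

/* Hypothetical helper: insert one custom event record.
   Returns -1 if LWP is not enabled, else the LWPINS result
   (1 if the ring buffer was full, 0 otherwise). */
static int log_event(unsigned tag, unsigned value) {
  if (__slwpcb() == 0)               /* LWP not currently enabled */
    return -1;
  return __lwpins32(value, tag, 0);  /* FLAGS must be an immediate */
}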
104 third_party/intel/clang/lzcntintrin.h vendored Normal file
@@ -0,0 +1,104 @@
/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __LZCNTINTRIN_H
#define __LZCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))

#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
///    An unsigned 16-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 16-bit integer containing the number of leading zero
///    bits in the operand.
#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X))
#endif // _MSC_VER

/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
///    An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
///    bits in the operand.
/// \see _lzcnt_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
  return __builtin_ia32_lzcnt_u32(__X);
}

/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
///    An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
///    bits in the operand.
/// \see __lzcnt32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
  return __builtin_ia32_lzcnt_u32(__X);
}

#ifdef __x86_64__
#ifndef _MSC_VER
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
///    bits in the operand.
/// \see _lzcnt_u64
#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X))
#endif // _MSC_VER

/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c LZCNT instruction.
///
/// \param __X
///    An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
///    bits in the operand.
/// \see __lzcnt64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{
  return __builtin_ia32_lzcnt_u64(__X);
}
#endif

#undef __DEFAULT_FN_ATTRS

#endif /* __LZCNTINTRIN_H */
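One point the per-function docs leave implicit: unlike BSR, LZCNT is well defined for a zero input (it returns the operand width). A small sketch (illustrative, assumes -mlzcnt):

#include <immintrin.h>

/* floor(log2(x)) for x > 0, via the leading-zero count. */
static unsigned ilog2_u32(unsigned x) {
  /* _lzcnt_u32(0) would return 32, so guard the zero case. */
  return x ? 31 - _lzcnt_u32(x) : 0;
}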
67 third_party/intel/clang/mm_malloc.h vendored Normal file
@@ -0,0 +1,67 @@
/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __MM_MALLOC_H
#define __MM_MALLOC_H

#include <stdlib.h>

#ifdef _WIN32
#include <malloc.h>
#else
#ifndef __cplusplus
extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
#else
// Some systems (e.g. those with GNU libc) declare posix_memalign with an
// exception specifier. Via an "egregious workaround" in
// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
// redeclaration of glibc's declaration.
extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
#endif
#endif

#if !(defined(_WIN32) && defined(_mm_malloc))
static __inline__ void *__attribute__((__always_inline__, __nodebug__,
                                       __malloc__, __alloc_size__(1),
                                       __alloc_align__(2)))
_mm_malloc(size_t __size, size_t __align) {
  if (__align == 1) {
    return malloc(__size);
  }

  if (!(__align & (__align - 1)) && __align < sizeof(void *))
    __align = sizeof(void *);

  void *__mallocedMemory;
#if defined(__MINGW32__)
  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
#elif defined(_WIN32)
  __mallocedMemory = _aligned_malloc(__size, __align);
#else
  if (posix_memalign(&__mallocedMemory, __align, __size))
    return 0;
#endif

  return __mallocedMemory;
}

static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_free(void *__p)
{
#if defined(__MINGW32__)
  __mingw_aligned_free(__p);
#elif defined(_WIN32)
  _aligned_free(__p);
#else
  free(__p);
#endif
}
#endif

#endif /* __MM_MALLOC_H */
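A short usage sketch (illustrative): since the Windows paths above allocate with _aligned_malloc or __mingw_aligned_malloc, a pointer from _mm_malloc must be released with _mm_free, not free.

#include <mm_malloc.h>

/* 64-byte alignment suits cache lines and 512-bit vectors alike. */
static float *make_aligned_scratch(size_t n) {
  float *buf = _mm_malloc(n * sizeof(float), 64);
  return buf;  /* release with _mm_free(buf), never free(buf) */
}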
1556 third_party/intel/clang/mmintrin.h vendored Normal file
(File diff suppressed because it is too large.)
49 third_party/intel/clang/movdirintrin.h vendored Normal file
@@ -0,0 +1,49 @@
/*===------------------------- movdirintrin.h ------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef _MOVDIRINTRIN_H
#define _MOVDIRINTRIN_H

/* Move doubleword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u32 (void *__dst, unsigned int __value)
{
  __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
}

#ifdef __x86_64__

/* Move quadword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u64 (void *__dst, unsigned long __value)
{
  __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
}

#endif /* __x86_64__ */

/*
 * movdir64b - Move 64 bytes as direct store.
 * The destination must be 64 byte aligned, and the store is atomic.
 * The source address has no alignment requirement, and the load from
 * the source address is not atomic.
 */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdir64b")))
_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
{
  __builtin_ia32_movdir64b(__dst, __src);
}

#endif /* _MOVDIRINTRIN_H */
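An illustrative use of the atomic 64-byte direct store described above: pushing a work descriptor to a device portal (for example, an MMIO doorbell page). Names here are hypothetical; requires -mmovdir64b.

#include <immintrin.h>

/* portal must be 64-byte aligned; the 64-byte store is atomic,
   the load from desc64 is not. */
static void submit_descriptor(void *portal, const void *desc64) {
  _movdir64b(portal, desc64);
}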
62 third_party/intel/clang/mwaitxintrin.h vendored Normal file
@@ -0,0 +1,62 @@
/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86INTRIN_H
#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __MWAITXINTRIN_H
#define __MWAITXINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))

/// Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITORX instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitorx(__p, __extensions, __hints);
}

/// Used with the \c MONITORX instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range, or an interrupt, causes the processor to exit the pending state.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAITX instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which can vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which can vary by processor.
/// \param __clock
///    Maximum wait time, in timestamp-counter cycles, honored when the timer
///    extension is enabled via \a __extensions.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
{
  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __MWAITXINTRIN_H */
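An illustrative monitor/wait loop (a sketch, not a definitive implementation; requires -mmwaitx, and assumes the 0x2 extensions bit enables the MWAITX timer as documented for the instruction):

#include <x86intrin.h>

/* Sleep until *flag changes, or until roughly `timeout` TSC cycles pass. */
static void wait_for_flag(volatile int *flag, unsigned timeout) {
  while (!*flag) {
    _mm_monitorx((void *)flag, 0, 0);
    if (*flag)                    /* re-check after arming the monitor */
      break;
    _mm_mwaitx(0x2, 0, timeout);  /* 0x2: enable the timer extension */
  }
}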
20 third_party/intel/clang/nmmintrin.h vendored Normal file
@@ -0,0 +1,20 @@
/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __NMMINTRIN_H
#define __NMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
   just include it now then. */
#include "smmintrin.h"
#endif /* __NMMINTRIN_H */
40 third_party/intel/clang/pconfigintrin.h vendored Normal file
@@ -0,0 +1,40 @@
/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __PCONFIGINTRIN_H
#define __PCONFIGINTRIN_H

#define __PCONFIG_KEY_PROGRAM 0x00000001

#if __has_extension(gnu_asm)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("pconfig")))

static __inline unsigned int __DEFAULT_FN_ATTRS
_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
  unsigned int __result;
  __asm__ ("pconfig"
           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
           : "cc");
  return __result;
}

#undef __DEFAULT_FN_ATTRS

#endif /* __has_extension(gnu_asm) */

#endif
34 third_party/intel/clang/pkuintrin.h vendored Normal file
@@ -0,0 +1,34 @@
/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __PKUINTRIN_H
#define __PKUINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_rdpkru_u32(void)
{
  return __builtin_ia32_rdpkru();
}

static __inline__ void __DEFAULT_FN_ATTRS
_wrpkru(unsigned int __val)
{
  __builtin_ia32_wrpkru(__val);
}

#undef __DEFAULT_FN_ATTRS

#endif
301 third_party/intel/clang/pmmintrin.h vendored Normal file
@@ -0,0 +1,301 @@
/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include "emmintrin.h"

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse3,no-evex512"), __min_vector_width__(128)))

/// Loads data from an unaligned memory location to elements in a 128-bit
///    vector.
///
///    If the address of the data is not 16-byte aligned, the instruction may
///    read two adjacent aligned blocks of memory to retrieve the requested
///    data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
///    A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally adds the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally subtracts the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the lower
///    bits of the destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the upper
///    bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}

/// Moves and duplicates odd-indexed values from a 128-bit vector
///    of [4 x float] to float values stored in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehdup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_moveldup_ps(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
///    A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
///    and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally adds the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally subtracts the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the lower bits of
///    the destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the upper bits of
///    the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
///    differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}

/// Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)

/// Moves and duplicates the double-precision value in the lower bits of
///    a 128-bit vector of [2 x double] to double-precision values stored in a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
///    [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_movedup_pd(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}

/// Establishes a linear address memory range to be monitored and puts
///    the processor in the monitor event pending state. Data stored in the
///    monitored address range causes the processor to exit the pending state.
///
/// The \c MONITOR instruction can be used in kernel mode, and in other modes
/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITOR instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitor(__p, __extensions, __hints);
}

/// Used with the \c MONITOR instruction to wait while the processor is in
///    the monitor event pending state. Data stored in the monitored address
///    range, or an interrupt, causes the processor to exit the pending state.
///
/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAIT instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which can vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_mwait(__extensions, __hints);
}

#undef __DEFAULT_FN_ATTRS

#endif /* __PMMINTRIN_H */
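A concrete use of the horizontal adds documented above (illustrative, assumes -msse3): reducing a vector of four floats to their sum with two VHADDPS steps.

#include <pmmintrin.h>

/* Sum the four lanes of v. */
static float sum4(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);  /* (a0+a1, a2+a3, a0+a1, a2+a3) */
  t = _mm_hadd_ps(t, t);         /* all lanes now hold a0+a1+a2+a3 */
  return _mm_cvtss_f32(t);
}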
59 third_party/intel/clang/popcntintrin.h vendored Normal file
@@ -0,0 +1,59 @@
/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __POPCNTINTRIN_H
#define __POPCNTINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))

#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
///    An unsigned 32-bit integer operand.
/// \returns A 32-bit integer containing the number of bits with value 1 in the
///    source operand.
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_popcnt_u32(unsigned int __A)
{
  return __builtin_popcount(__A);
}

#ifdef __x86_64__
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
///
/// \param __A
///    An unsigned 64-bit integer operand.
/// \returns A 64-bit integer containing the number of bits with value 1 in the
///    source operand.
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_popcnt_u64(unsigned long long __A)
{
  return __builtin_popcountll(__A);
}
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __POPCNTINTRIN_H */
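A quick usage sketch (illustrative, assumes -mpopcnt): population count composes naturally with XOR to give the Hamming distance between two bitmasks.

#include <popcntintrin.h>

/* Number of differing bits between two 64-bit masks. */
static int hamming64(unsigned long long a, unsigned long long b) {
  return (int)_mm_popcnt_u64(a ^ b);
}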
61 third_party/intel/clang/prfchiintrin.h vendored Normal file
@@ -0,0 +1,61 @@
/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PRFCHIINTRIN_H
#define __PRFCHIINTRIN_H

#ifdef __x86_64__

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))

/// Loads an instruction sequence containing the specified memory address into
/// all cache levels.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit0(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}

/// Loads an instruction sequence containing the specified memory address into
/// all but the first-level cache.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __DEFAULT_FN_ATTRS
_m_prefetchit1(volatile const void *__P) {
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
#pragma clang diagnostic pop
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS

#endif /* __PRFCHIINTRIN_H */
60 third_party/intel/clang/prfchwintrin.h vendored Normal file
@@ -0,0 +1,60 @@
/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __PRFCHWINTRIN_H
#define __PRFCHWINTRIN_H

/// Loads a memory sequence containing the specified memory address into
/// all data cache levels.
///
/// The cache-coherency state is set to exclusive. Data can be read from
/// and written to the cache line without additional delay.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetch(void *__P)
{
  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}

/// Loads a memory sequence containing the specified memory address into
/// the L1 data cache and sets the cache-coherency state to modified.
///
/// This provides a hint to the processor that the cache line will be
/// modified. It is intended for use when the cache line will be written to
/// shortly after the prefetch is performed.
///
/// Note that the effect of this intrinsic is dependent on the processor
/// implementation.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c PREFETCHW instruction.
///
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(volatile const void *__P)
{
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}

#endif /* __PRFCHWINTRIN_H */
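An illustrative use of the prefetch-for-write hint documented above (a sketch; requires -mprfchw for PREFETCHW codegen): fetch the next output cache line in the modified state while working on the current one.

#include <x86intrin.h>

/* Scale an array in place, prefetching ahead one cache line (16 floats). */
static void scale_inplace(float *a, int n, float k) {
  for (int i = 0; i < n; i++) {
    if ((i & 15) == 0 && i + 16 < n)
      _m_prefetchw(&a[i + 16]);  /* line will be written shortly */
    a[i] *= k;
  }
}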
37 third_party/intel/clang/ptwriteintrin.h vendored Normal file
@@ -0,0 +1,37 @@
/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __PTWRITEINTRIN_H
#define __PTWRITEINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("ptwrite")))

static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite32(unsigned int __value) {
  __builtin_ia32_ptwrite32(__value);
}

#ifdef __x86_64__

static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite64(unsigned long long __value) {
  __builtin_ia32_ptwrite64(__value);
}

#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS

#endif /* __PTWRITEINTRIN_H */
203 third_party/intel/clang/raointintrin.h vendored Normal file
@@ -0,0 +1,203 @@
/*===----------------------- raointintrin.h - RAOINT ------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __X86GPRINTRIN_H
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
#endif // __X86GPRINTRIN_H

#ifndef __RAOINTINTRIN_H
#define __RAOINTINTRIN_H

#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("raoint")))

/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
  __builtin_ia32_aadd32((int *)__A, __B);
}

/// Atomically AND a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
  __builtin_ia32_aand32((int *)__A, __B);
}

/// Atomically OR a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
  __builtin_ia32_aor32((int *)__A, __B);
}

/// Atomically XOR a 32-bit value at memory operand \a __A and a 32-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 32-bit memory location.
/// \param __B
///    A 32-bit integer value.
///
/// \code{.operation}
/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
  __builtin_ia32_axor32((int *)__A, __B);
}

#ifdef __x86_64__
/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AADD instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aadd64((long long *)__A, __B);
}

/// Atomically AND a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AAND instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_aand64((long long *)__A, __B);
}

/// Atomically OR a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
                                                   long long __B) {
  __builtin_ia32_aor64((long long *)__A, __B);
}

/// Atomically XOR a 64-bit value at memory operand \a __A and a 64-bit \a __B,
/// and store the result to the same memory location.
///
/// This intrinsic is intended for contended or weakly ordered updates. It may
/// result in poor performance for hot data used only by a single thread.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c AXOR instruction.
///
/// \param __A
///    A pointer to a 64-bit memory location.
/// \param __B
///    A 64-bit integer value.
///
/// \code{.operation}
/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
                                                    long long __B) {
  __builtin_ia32_axor64((long long *)__A, __B);
}
#endif // __x86_64__

#undef __DEFAULT_FN_ATTRS
#endif // __RAOINTINTRIN_H
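These remote-atomic operations return no result, which makes them a fit for fire-and-forget updates. A sketch (illustrative, assumes -mraoint):

#include <x86gprintrin.h>

/* Bump a shared statistics counter; the caller never needs the
   updated value back, so weak ordering is acceptable here. */
static void count_event(long long *counter) {
  _aadd_i64(counter, 1);
}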
57 third_party/intel/clang/rdpruintrin.h vendored Normal file
@@ -0,0 +1,57 @@
/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H
#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __RDPRUINTRIN_H
#define __RDPRUINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("rdpru")))

/// Reads the content of a processor register.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
///
/// \param reg_id
///    A processor register identifier.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpru (int reg_id)
{
  return __builtin_ia32_rdpru(reg_id);
}

#define __RDPRU_MPERF 0
#define __RDPRU_APERF 1

/// Reads the content of processor register MPERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register MPERF.
#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)

/// Reads the content of processor register APERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register APERF.
#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)

#undef __DEFAULT_FN_ATTRS

#endif /* __RDPRUINTRIN_H */
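A sketch of the usual reason to read these two registers together (illustrative, assumes -mrdpru): the APERF/MPERF delta ratio over an interval estimates effective versus nominal frequency.

#include <x86intrin.h>

/* Ratio of actual to nominal clock over the sampled interval. */
static double effective_freq_ratio(void (*workload)(void)) {
  unsigned long long m0 = __mperf(), a0 = __aperf();
  workload();
  unsigned long long m1 = __mperf(), a1 = __aperf();
  return (double)(a1 - a0) / (double)(m1 - m0);
}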
105
third_party/intel/clang/rdseedintrin.h
vendored
Normal file
105
third_party/intel/clang/rdseedintrin.h
vendored
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __RDSEEDINTRIN_H
#define __RDSEEDINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))

/// Stores a hardware-generated 16-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
///   Store16(__p, HW_NRND_GEN.data)
///   result := 1
/// ELSE
///   Store16(__p, 0)
///   result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
///    Pointer to memory for storing the 16-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed16_step(unsigned short *__p)
{
  return (int) __builtin_ia32_rdseed16_step(__p);
}

/// Stores a hardware-generated 32-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
///   Store32(__p, HW_NRND_GEN.data)
///   result := 1
/// ELSE
///   Store32(__p, 0)
///   result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
///    Pointer to memory for storing the 32-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed32_step(unsigned int *__p)
{
  return (int) __builtin_ia32_rdseed32_step(__p);
}

#ifdef __x86_64__
/// Stores a hardware-generated 64-bit random value in the memory at \a __p.
///
/// The random number generator complies with NIST SP800-90B and SP800-90C.
///
/// \code{.operation}
/// IF HW_NRND_GEN.ready == 1
///   Store64(__p, HW_NRND_GEN.data)
///   result := 1
/// ELSE
///   Store64(__p, 0)
///   result := 0
/// END
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the \c RDSEED instruction.
///
/// \param __p
///    Pointer to memory for storing the 64-bit random number.
/// \returns 1 if a random number was generated, 0 if not.
static __inline__ int __DEFAULT_FN_ATTRS
_rdseed64_step(unsigned long long *__p)
{
  return (int) __builtin_ia32_rdseed64_step(__p);
}
#endif

#undef __DEFAULT_FN_ATTRS

#endif /* __RDSEEDINTRIN_H */
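As the \code{.operation} blocks above note, a step function stores 0 and returns 0 when the entropy source is not ready, so callers are expected to retry. A minimal sketch, not part of this diff, assuming a CPU with RDSEED and a flag like -mrdseed:

/* Hypothetical demo, not part of the header: retry until the hardware
   entropy source is ready; real code should bound the retries and
   fall back to another randomness source. */
#include <stdio.h>
#include <immintrin.h>

int main(void) {
  unsigned int seed;
  while (!_rdseed32_step(&seed)) {
    /* HW_NRND_GEN not ready yet; spin */
  }
  printf("seed = %u\n", seed);
  return 0;
}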
45
third_party/intel/clang/rtmintrin.h
vendored
Normal file

@ -0,0 +1,45 @@
/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __RTMINTRIN_H
#define __RTMINTRIN_H

#define _XBEGIN_STARTED   (~0u)
#define _XABORT_EXPLICIT  (1 << 0)
#define _XABORT_RETRY     (1 << 1)
#define _XABORT_CONFLICT  (1 << 2)
#define _XABORT_CAPACITY  (1 << 3)
#define _XABORT_DEBUG     (1 << 4)
#define _XABORT_NESTED    (1 << 5)
#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_xbegin(void)
{
  return (unsigned int)__builtin_ia32_xbegin();
}

static __inline__ void __DEFAULT_FN_ATTRS
_xend(void)
{
  __builtin_ia32_xend();
}

#define _xabort(imm) __builtin_ia32_xabort((imm))

#undef __DEFAULT_FN_ATTRS

#endif /* __RTMINTRIN_H */
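_xbegin() returns _XBEGIN_STARTED when the transaction begins and an abort status otherwise, so the usual pattern is a transactional fast path with a non-transactional fallback. A minimal sketch, not part of this diff, assuming RTM hardware and a flag like -mrtm; the atomic fallback is an illustrative stand-in for whatever lock the caller would normally take:

/* Hypothetical demo, not part of the header: transactional increment
   with a fallback when the transaction aborts. */
#include <immintrin.h>

static long counter;

void increment(void) {
  unsigned int status = _xbegin();
  if (status == _XBEGIN_STARTED) {
    counter++;   /* executes transactionally */
    _xend();     /* commit */
  } else {
    /* Aborted (see the _XABORT_* bits in status); fall back. */
    __sync_fetch_and_add(&counter, 1);
  }
}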
30
third_party/intel/clang/serializeintrin.h
vendored
Normal file

@ -0,0 +1,30 @@
/*===--------------- serializeintrin.h - serialize intrinsics --------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __SERIALIZEINTRIN_H
#define __SERIALIZEINTRIN_H

/// Serialize instruction fetch and execution.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
///
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
_serialize (void)
{
  __builtin_ia32_serialize ();
}

#endif /* __SERIALIZEINTRIN_H */
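SERIALIZE drains all prior instructions before any later instruction fetches or executes, which makes it a lighter-weight fence than the traditional CPUID trick for timing measurements. A minimal sketch, not part of this diff, assuming a CPU with SERIALIZE and a flag like -mserialize:

/* Hypothetical demo, not part of the header: fence instruction
   execution around RDTSC so the measured region is not reordered. */
#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  _serialize();                        /* drain older instructions */
  unsigned long long t0 = __rdtsc();
  /* ... code under measurement would go here ... */
  _serialize();
  unsigned long long t1 = __rdtsc();
  printf("elapsed cycles: %llu\n", t1 - t0);
  return 0;
}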
Some files were not shown because too many files have changed in this diff