Upgrade to Cosmopolitan GCC 11.2.0 for aarch64

2025-09-09 02:03:49 +00:00 · 2023-06-05 02:07:28 -07:00 · 2023-06-05 02:07:28 -07:00 · 9cc3e37263
commit 9cc3e37263
parent 39f20dbb13
63 changed files with 30429 additions and 22750 deletions
--- a/build/definitions.mk
+++ b/build/definitions.mk
@ -203,7 +203,8 @@ ifeq ($(ARCH), aarch64)
 #
 DEFAULT_COPTS +=							\
 	-ffixed-x18							\
-	-ffixed-x28
+	-ffixed-x28							\
+	-mno-outline-atomics
 endif

 MATHEMATICAL =								\
--- a/dsp/core/sad16x8n.c
+++ b/dsp/core/sad16x8n.c
@ -19,7 +19,7 @@
 #include "dsp/core/core.h"
 #include "libc/limits.h"
 #include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/intel/emmintrin.internal.h"

 /**
--- a/libc/calls/calls.mk
+++ b/libc/calls/calls.mk
@ -190,6 +190,7 @@ o/$(MODE)/libc/calls/unveil.o: private			\

 ifeq ($(ARCH), aarch64)
 o/$(MODE)/libc/calls/sigaction.o: private OVERRIDE_CFLAGS += -mcmodel=large
+o/$(MODE)/libc/calls/getloadavg-nt.o: private OVERRIDE_CFLAGS += -ffreestanding
 endif

 # we want -Os because:
--- a/third_party/aarch64/acc_prof.internal.h
+++ b/third_party/aarch64/acc_prof.internal.h
@ -0,0 +1,165 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+#include "third_party/aarch64/openacc.internal.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef enum acc_event_t
+{
+  acc_ev_none = 0,
+  acc_ev_device_init_start,
+  acc_ev_device_init_end,
+  acc_ev_device_shutdown_start,
+  acc_ev_device_shutdown_end,
+  acc_ev_runtime_shutdown,
+  acc_ev_create,
+  acc_ev_delete,
+  acc_ev_alloc,
+  acc_ev_free,
+  acc_ev_enter_data_start,
+  acc_ev_enter_data_end,
+  acc_ev_exit_data_start,
+  acc_ev_exit_data_end,
+  acc_ev_update_start,
+  acc_ev_update_end,
+  acc_ev_compute_construct_start,
+  acc_ev_compute_construct_end,
+  acc_ev_enqueue_launch_start,
+  acc_ev_enqueue_launch_end,
+  acc_ev_enqueue_upload_start,
+  acc_ev_enqueue_upload_end,
+  acc_ev_enqueue_download_start,
+  acc_ev_enqueue_download_end,
+  acc_ev_wait_start,
+  acc_ev_wait_end,
+  acc_ev_last
+} acc_event_t;
+typedef signed long int _acc_prof_ssize_t;
+typedef unsigned long int _acc_prof_size_t;
+typedef int _acc_prof_int_t;
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) (sizeof (_basictype))
+typedef struct acc_prof_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  _acc_prof_int_t version;
+  acc_device_t device_type;
+  _acc_prof_int_t device_number;
+  _acc_prof_int_t thread_id;
+  _acc_prof_ssize_t async;
+  _acc_prof_ssize_t async_queue;
+  const char *src_file;
+  const char *func_name;
+  _acc_prof_int_t line_no, end_line_no;
+  _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t))
+} acc_prof_info;
+#define _ACC_PROF_INFO_VERSION 201711
+typedef enum acc_construct_t
+{
+  acc_construct_parallel = 0,
+  acc_construct_kernels,
+  acc_construct_loop,
+  acc_construct_data,
+  acc_construct_enter_data,
+  acc_construct_exit_data,
+  acc_construct_host_data,
+  acc_construct_atomic,
+  acc_construct_declare,
+  acc_construct_init,
+  acc_construct_shutdown,
+  acc_construct_set,
+  acc_construct_update,
+  acc_construct_routine,
+  acc_construct_wait,
+  acc_construct_runtime_api,
+  acc_construct_serial
+} acc_construct_t;
+typedef struct acc_data_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *var_name;
+  _acc_prof_size_t bytes;
+  const void *host_ptr;
+  const void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+typedef struct acc_launch_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  const char *kernel_name;
+  _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t))
+} acc_launch_event_info;
+typedef struct acc_other_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+typedef union acc_event_info
+{
+  acc_event_t event_type;
+  acc_data_event_info data_event;
+  acc_launch_event_info launch_event;
+  acc_other_event_info other_event;
+} acc_event_info;
+typedef enum acc_device_api
+{
+  acc_device_api_none = 0,
+  acc_device_api_cuda,
+  acc_device_api_opencl,
+  acc_device_api_coi,
+  acc_device_api_other
+} acc_device_api;
+typedef struct acc_api_info
+{
+  acc_device_api device_api;
+  _acc_prof_int_t valid_bytes;
+  acc_device_t device_type;
+  _acc_prof_int_t vendor;
+  const void *device_handle;
+  const void *context_handle;
+  const void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+       acc_api_info *);
+typedef enum acc_register_t
+{
+  acc_reg = 0,
+  acc_toggle = 1,
+  acc_toggle_per_thread = 2
+} acc_register_t;
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback,
+          acc_register_t) __GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback,
+     acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+extern void acc_register_library (acc_prof_reg, acc_prof_reg,
+      acc_prof_lookup_func);
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
--- a/third_party/aarch64/arm_acle.h
+++ b/third_party/aarch64/arm_acle.h
@ -1,63 +0,0 @@
-#ifndef _GCC_ARM_ACLE_H
-#define _GCC_ARM_ACLE_H
-#ifdef __aarch64__
-#include "libc/inttypes.h"
-#include "libc/limits.h"
-#include "libc/literal.h"
-
-#pragma GCC push_options
-
-#pragma GCC target("+nothing+crc")
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32b(uint32_t __a, uint8_t __b) {
-  return __builtin_aarch64_crc32b(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32cb(uint32_t __a, uint8_t __b) {
-  return __builtin_aarch64_crc32cb(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32ch(uint32_t __a, uint16_t __b) {
-  return __builtin_aarch64_crc32ch(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32cw(uint32_t __a, uint32_t __b) {
-  return __builtin_aarch64_crc32cw(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32cd(uint32_t __a, uint64_t __b) {
-  return __builtin_aarch64_crc32cx(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32h(uint32_t __a, uint16_t __b) {
-  return __builtin_aarch64_crc32h(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32w(uint32_t __a, uint32_t __b) {
-  return __builtin_aarch64_crc32w(__a, __b);
-}
-
-__extension__ static __inline uint32_t __attribute__((__always_inline__))
-__crc32d(uint32_t __a, uint64_t __b) {
-  return __builtin_aarch64_crc32x(__a, __b);
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#pragma GCC pop_options
-
-#endif /* __aarch64__ */
-#endif
--- a/third_party/aarch64/arm_acle.internal.h
+++ b/third_party/aarch64/arm_acle.internal.h
@ -0,0 +1,164 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _GCC_ARM_ACLE_H
+#define _GCC_ARM_ACLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.3-a")
+__funline int32_t
+__jcvt (double __a)
+{
+  return __builtin_aarch64_jcvtzs (__a);
+}
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.5-a")
+__funline float
+__rint32zf (float __a)
+{
+  return __builtin_aarch64_frint32zsf (__a);
+}
+__funline double
+__rint32z (double __a)
+{
+  return __builtin_aarch64_frint32zdf (__a);
+}
+__funline float
+__rint64zf (float __a)
+{
+  return __builtin_aarch64_frint64zsf (__a);
+}
+__funline double
+__rint64z (double __a)
+{
+  return __builtin_aarch64_frint64zdf (__a);
+}
+__funline float
+__rint32xf (float __a)
+{
+  return __builtin_aarch64_frint32xsf (__a);
+}
+__funline double
+__rint32x (double __a)
+{
+  return __builtin_aarch64_frint32xdf (__a);
+}
+__funline float
+__rint64xf (float __a)
+{
+  return __builtin_aarch64_frint64xsf (__a);
+}
+__funline double
+__rint64x (double __a)
+{
+  return __builtin_aarch64_frint64xdf (__a);
+}
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target ("+nothing+crc")
+__funline uint32_t
+__crc32b (uint32_t __a, uint8_t __b)
+{
+  return __builtin_aarch64_crc32b (__a, __b);
+}
+__funline uint32_t
+__crc32cb (uint32_t __a, uint8_t __b)
+{
+  return __builtin_aarch64_crc32cb (__a, __b);
+}
+__funline uint32_t
+__crc32ch (uint32_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_crc32ch (__a, __b);
+}
+__funline uint32_t
+__crc32cw (uint32_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_crc32cw (__a, __b);
+}
+__funline uint32_t
+__crc32cd (uint32_t __a, uint64_t __b)
+{
+  return __builtin_aarch64_crc32cx (__a, __b);
+}
+__funline uint32_t
+__crc32h (uint32_t __a, uint16_t __b)
+{
+  return __builtin_aarch64_crc32h (__a, __b);
+}
+__funline uint32_t
+__crc32w (uint32_t __a, uint32_t __b)
+{
+  return __builtin_aarch64_crc32w (__a, __b);
+}
+__funline uint32_t
+__crc32d (uint32_t __a, uint64_t __b)
+{
+  return __builtin_aarch64_crc32x (__a, __b);
+}
+#pragma GCC pop_options
+#ifdef __ARM_FEATURE_TME
+#pragma GCC push_options
+#pragma GCC target ("+nothing+tme")
+#define _TMFAILURE_REASON 0x00007fffu
+#define _TMFAILURE_RTRY 0x00008000u
+#define _TMFAILURE_CNCL 0x00010000u
+#define _TMFAILURE_MEM 0x00020000u
+#define _TMFAILURE_IMP 0x00040000u
+#define _TMFAILURE_ERR 0x00080000u
+#define _TMFAILURE_SIZE 0x00100000u
+#define _TMFAILURE_NEST 0x00200000u
+#define _TMFAILURE_DBG 0x00400000u
+#define _TMFAILURE_INT 0x00800000u
+#define _TMFAILURE_TRIVIAL 0x01000000u
+__funline uint64_t
+__tstart (void)
+{
+  return __builtin_aarch64_tstart ();
+}
+__funline void
+__tcommit (void)
+{
+  __builtin_aarch64_tcommit ();
+}
+__funline void
+__tcancel (const uint64_t __reason)
+{
+  __builtin_aarch64_tcancel (__reason);
+}
+__funline uint64_t
+__ttest (void)
+{
+  return __builtin_aarch64_ttest ();
+}
+#pragma GCC pop_options
+#endif
+#pragma GCC push_options
+#pragma GCC target ("+nothing+rng")
+__funline int
+__rndr (uint64_t *__res)
+{
+  return __builtin_aarch64_rndr (__res);
+}
+__funline int
+__rndrrs (uint64_t *__res)
+{
+  return __builtin_aarch64_rndrrs (__res);
+}
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.5-a+memtag")
+#define __arm_mte_create_random_tag(__ptr, __u64_mask) __builtin_aarch64_memtag_irg(__ptr, __u64_mask)
+#define __arm_mte_exclude_tag(__ptr, __u64_excluded) __builtin_aarch64_memtag_gmi(__ptr, __u64_excluded)
+#define __arm_mte_ptrdiff(__ptr_a, __ptr_b) __builtin_aarch64_memtag_subp(__ptr_a, __ptr_b)
+#define __arm_mte_increment_tag(__ptr, __u_offset) __builtin_aarch64_memtag_inc_tag(__ptr, __u_offset)
+#define __arm_mte_set_tag(__tagged_address) __builtin_aarch64_memtag_set_tag(__tagged_address)
+#define __arm_mte_get_tag(__address) __builtin_aarch64_memtag_get_tag(__address)
+#pragma GCC pop_options
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
--- a/third_party/aarch64/arm_bf16.internal.h
+++ b/third_party/aarch64/arm_bf16.internal.h
@ -0,0 +1,23 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+typedef __bf16 bfloat16_t;
+typedef float float32_t;
+#pragma GCC push_options
+#pragma GCC target ("+nothing+bf16+nosimd")
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 (float32_t __a)
+{
+  return __builtin_aarch64_bfcvtbf (__a);
+}
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_aarch64_bfcvtsf (__a);
+}
+#pragma GCC pop_options
+#endif
+#endif
--- a/third_party/aarch64/arm_fp16.h
+++ b/third_party/aarch64/arm_fp16.h
@ -1,373 +0,0 @@
-#ifndef _AARCH64_FP16_H_
-#define _AARCH64_FP16_H_
-#ifdef __aarch64__
-#include "libc/inttypes.h"
-#include "libc/limits.h"
-#include "libc/literal.h"
-
-#pragma GCC push_options
-#pragma GCC target("arch=armv8.2-a+fp16")
-
-typedef __fp16 float16_t;
-
-__funline float16_t vabsh_f16(float16_t __a) {
-  return __builtin_aarch64_abshf(__a);
-}
-
-__funline uint16_t vceqzh_f16(float16_t __a) {
-  return __builtin_aarch64_cmeqhf_uss(__a, 0.0f);
-}
-
-__funline uint16_t vcgezh_f16(float16_t __a) {
-  return __builtin_aarch64_cmgehf_uss(__a, 0.0f);
-}
-
-__funline uint16_t vcgtzh_f16(float16_t __a) {
-  return __builtin_aarch64_cmgthf_uss(__a, 0.0f);
-}
-
-__funline uint16_t vclezh_f16(float16_t __a) {
-  return __builtin_aarch64_cmlehf_uss(__a, 0.0f);
-}
-
-__funline uint16_t vcltzh_f16(float16_t __a) {
-  return __builtin_aarch64_cmlthf_uss(__a, 0.0f);
-}
-
-__funline float16_t vcvth_f16_s16(int16_t __a) {
-  return __builtin_aarch64_floathihf(__a);
-}
-
-__funline float16_t vcvth_f16_s32(int32_t __a) {
-  return __builtin_aarch64_floatsihf(__a);
-}
-
-__funline float16_t vcvth_f16_s64(int64_t __a) {
-  return __builtin_aarch64_floatdihf(__a);
-}
-
-__funline float16_t vcvth_f16_u16(uint16_t __a) {
-  return __builtin_aarch64_floatunshihf_us(__a);
-}
-
-__funline float16_t vcvth_f16_u32(uint32_t __a) {
-  return __builtin_aarch64_floatunssihf_us(__a);
-}
-
-__funline float16_t vcvth_f16_u64(uint64_t __a) {
-  return __builtin_aarch64_floatunsdihf_us(__a);
-}
-
-__funline int16_t vcvth_s16_f16(float16_t __a) {
-  return __builtin_aarch64_fix_trunchfhi(__a);
-}
-
-__funline int32_t vcvth_s32_f16(float16_t __a) {
-  return __builtin_aarch64_fix_trunchfsi(__a);
-}
-
-__funline int64_t vcvth_s64_f16(float16_t __a) {
-  return __builtin_aarch64_fix_trunchfdi(__a);
-}
-
-__funline uint16_t vcvth_u16_f16(float16_t __a) {
-  return __builtin_aarch64_fixuns_trunchfhi_us(__a);
-}
-
-__funline uint32_t vcvth_u32_f16(float16_t __a) {
-  return __builtin_aarch64_fixuns_trunchfsi_us(__a);
-}
-
-__funline uint64_t vcvth_u64_f16(float16_t __a) {
-  return __builtin_aarch64_fixuns_trunchfdi_us(__a);
-}
-
-__funline int16_t vcvtah_s16_f16(float16_t __a) {
-  return __builtin_aarch64_lroundhfhi(__a);
-}
-
-__funline int32_t vcvtah_s32_f16(float16_t __a) {
-  return __builtin_aarch64_lroundhfsi(__a);
-}
-
-__funline int64_t vcvtah_s64_f16(float16_t __a) {
-  return __builtin_aarch64_lroundhfdi(__a);
-}
-
-__funline uint16_t vcvtah_u16_f16(float16_t __a) {
-  return __builtin_aarch64_lrounduhfhi_us(__a);
-}
-
-__funline uint32_t vcvtah_u32_f16(float16_t __a) {
-  return __builtin_aarch64_lrounduhfsi_us(__a);
-}
-
-__funline uint64_t vcvtah_u64_f16(float16_t __a) {
-  return __builtin_aarch64_lrounduhfdi_us(__a);
-}
-
-__funline int16_t vcvtmh_s16_f16(float16_t __a) {
-  return __builtin_aarch64_lfloorhfhi(__a);
-}
-
-__funline int32_t vcvtmh_s32_f16(float16_t __a) {
-  return __builtin_aarch64_lfloorhfsi(__a);
-}
-
-__funline int64_t vcvtmh_s64_f16(float16_t __a) {
-  return __builtin_aarch64_lfloorhfdi(__a);
-}
-
-__funline uint16_t vcvtmh_u16_f16(float16_t __a) {
-  return __builtin_aarch64_lflooruhfhi_us(__a);
-}
-
-__funline uint32_t vcvtmh_u32_f16(float16_t __a) {
-  return __builtin_aarch64_lflooruhfsi_us(__a);
-}
-
-__funline uint64_t vcvtmh_u64_f16(float16_t __a) {
-  return __builtin_aarch64_lflooruhfdi_us(__a);
-}
-
-__funline int16_t vcvtnh_s16_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnhfhi(__a);
-}
-
-__funline int32_t vcvtnh_s32_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnhfsi(__a);
-}
-
-__funline int64_t vcvtnh_s64_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnhfdi(__a);
-}
-
-__funline uint16_t vcvtnh_u16_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnuhfhi_us(__a);
-}
-
-__funline uint32_t vcvtnh_u32_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnuhfsi_us(__a);
-}
-
-__funline uint64_t vcvtnh_u64_f16(float16_t __a) {
-  return __builtin_aarch64_lfrintnuhfdi_us(__a);
-}
-
-__funline int16_t vcvtph_s16_f16(float16_t __a) {
-  return __builtin_aarch64_lceilhfhi(__a);
-}
-
-__funline int32_t vcvtph_s32_f16(float16_t __a) {
-  return __builtin_aarch64_lceilhfsi(__a);
-}
-
-__funline int64_t vcvtph_s64_f16(float16_t __a) {
-  return __builtin_aarch64_lceilhfdi(__a);
-}
-
-__funline uint16_t vcvtph_u16_f16(float16_t __a) {
-  return __builtin_aarch64_lceiluhfhi_us(__a);
-}
-
-__funline uint32_t vcvtph_u32_f16(float16_t __a) {
-  return __builtin_aarch64_lceiluhfsi_us(__a);
-}
-
-__funline uint64_t vcvtph_u64_f16(float16_t __a) {
-  return __builtin_aarch64_lceiluhfdi_us(__a);
-}
-
-__funline float16_t vnegh_f16(float16_t __a) {
-  return __builtin_aarch64_neghf(__a);
-}
-
-__funline float16_t vrecpeh_f16(float16_t __a) {
-  return __builtin_aarch64_frecpehf(__a);
-}
-
-__funline float16_t vrecpxh_f16(float16_t __a) {
-  return __builtin_aarch64_frecpxhf(__a);
-}
-
-__funline float16_t vrndh_f16(float16_t __a) {
-  return __builtin_aarch64_btrunchf(__a);
-}
-
-__funline float16_t vrndah_f16(float16_t __a) {
-  return __builtin_aarch64_roundhf(__a);
-}
-
-__funline float16_t vrndih_f16(float16_t __a) {
-  return __builtin_aarch64_nearbyinthf(__a);
-}
-
-__funline float16_t vrndmh_f16(float16_t __a) {
-  return __builtin_aarch64_floorhf(__a);
-}
-
-__funline float16_t vrndnh_f16(float16_t __a) {
-  return __builtin_aarch64_frintnhf(__a);
-}
-
-__funline float16_t vrndph_f16(float16_t __a) {
-  return __builtin_aarch64_ceilhf(__a);
-}
-
-__funline float16_t vrndxh_f16(float16_t __a) {
-  return __builtin_aarch64_rinthf(__a);
-}
-
-__funline float16_t vrsqrteh_f16(float16_t __a) {
-  return __builtin_aarch64_rsqrtehf(__a);
-}
-
-__funline float16_t vsqrth_f16(float16_t __a) {
-  return __builtin_aarch64_sqrthf(__a);
-}
-
-__funline float16_t vaddh_f16(float16_t __a, float16_t __b) {
-  return __a + __b;
-}
-
-__funline float16_t vabdh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fabdhf(__a, __b);
-}
-
-__funline uint16_t vcageh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_facgehf_uss(__a, __b);
-}
-
-__funline uint16_t vcagth_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_facgthf_uss(__a, __b);
-}
-
-__funline uint16_t vcaleh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_faclehf_uss(__a, __b);
-}
-
-__funline uint16_t vcalth_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_faclthf_uss(__a, __b);
-}
-
-__funline uint16_t vceqh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_cmeqhf_uss(__a, __b);
-}
-
-__funline uint16_t vcgeh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_cmgehf_uss(__a, __b);
-}
-
-__funline uint16_t vcgth_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_cmgthf_uss(__a, __b);
-}
-
-__funline uint16_t vcleh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_cmlehf_uss(__a, __b);
-}
-
-__funline uint16_t vclth_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_cmlthf_uss(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_s16(int16_t __a, const int __b) {
-  return __builtin_aarch64_scvtfhi(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_s32(int32_t __a, const int __b) {
-  return __builtin_aarch64_scvtfsihf(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_s64(int64_t __a, const int __b) {
-  return __builtin_aarch64_scvtfdihf(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_u16(uint16_t __a, const int __b) {
-  return __builtin_aarch64_ucvtfhi_sus(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_u32(uint32_t __a, const int __b) {
-  return __builtin_aarch64_ucvtfsihf_sus(__a, __b);
-}
-
-__funline float16_t vcvth_n_f16_u64(uint64_t __a, const int __b) {
-  return __builtin_aarch64_ucvtfdihf_sus(__a, __b);
-}
-
-__funline int16_t vcvth_n_s16_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzshf(__a, __b);
-}
-
-__funline int32_t vcvth_n_s32_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzshfsi(__a, __b);
-}
-
-__funline int64_t vcvth_n_s64_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzshfdi(__a, __b);
-}
-
-__funline uint16_t vcvth_n_u16_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzuhf_uss(__a, __b);
-}
-
-__funline uint32_t vcvth_n_u32_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzuhfsi_uss(__a, __b);
-}
-
-__funline uint64_t vcvth_n_u64_f16(float16_t __a, const int __b) {
-  return __builtin_aarch64_fcvtzuhfdi_uss(__a, __b);
-}
-
-__funline float16_t vdivh_f16(float16_t __a, float16_t __b) {
-  return __a / __b;
-}
-
-__funline float16_t vmaxh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fmaxhf(__a, __b);
-}
-
-__funline float16_t vmaxnmh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fmaxhf(__a, __b);
-}
-
-__funline float16_t vminh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fminhf(__a, __b);
-}
-
-__funline float16_t vminnmh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fminhf(__a, __b);
-}
-
-__funline float16_t vmulh_f16(float16_t __a, float16_t __b) {
-  return __a * __b;
-}
-
-__funline float16_t vmulxh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_fmulxhf(__a, __b);
-}
-
-__funline float16_t vrecpsh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_frecpshf(__a, __b);
-}
-
-__funline float16_t vrsqrtsh_f16(float16_t __a, float16_t __b) {
-  return __builtin_aarch64_rsqrtshf(__a, __b);
-}
-
-__funline float16_t vsubh_f16(float16_t __a, float16_t __b) {
-  return __a - __b;
-}
-
-__funline float16_t vfmah_f16(float16_t __a, float16_t __b, float16_t __c) {
-  return __builtin_aarch64_fmahf(__b, __c, __a);
-}
-
-__funline float16_t vfmsh_f16(float16_t __a, float16_t __b, float16_t __c) {
-  return __builtin_aarch64_fnmahf(__b, __c, __a);
-}
-
-#pragma GCC pop_options
-
-#undef FUNC
-#endif /* __aarch64__ */
-#endif /* _AARCH64_FP16_H_ */
--- a/third_party/aarch64/arm_fp16.internal.h
+++ b/third_party/aarch64/arm_fp16.internal.h
@ -0,0 +1,455 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _AARCH64_FP16_H_
+#define _AARCH64_FP16_H_
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16")
+typedef __fp16 float16_t;
+__funline float16_t
+vabsh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_abshf (__a);
+}
+__funline uint16_t
+vceqzh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_cmeqhf_uss (__a, 0.0f);
+}
+__funline uint16_t
+vcgezh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_cmgehf_uss (__a, 0.0f);
+}
+__funline uint16_t
+vcgtzh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_cmgthf_uss (__a, 0.0f);
+}
+__funline uint16_t
+vclezh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_cmlehf_uss (__a, 0.0f);
+}
+__funline uint16_t
+vcltzh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_cmlthf_uss (__a, 0.0f);
+}
+__funline float16_t
+vcvth_f16_s16 (int16_t __a)
+{
+  return __builtin_aarch64_floathihf (__a);
+}
+__funline float16_t
+vcvth_f16_s32 (int32_t __a)
+{
+  return __builtin_aarch64_floatsihf (__a);
+}
+__funline float16_t
+vcvth_f16_s64 (int64_t __a)
+{
+  return __builtin_aarch64_floatdihf (__a);
+}
+__funline float16_t
+vcvth_f16_u16 (uint16_t __a)
+{
+  return __builtin_aarch64_floatunshihf_us (__a);
+}
+__funline float16_t
+vcvth_f16_u32 (uint32_t __a)
+{
+  return __builtin_aarch64_floatunssihf_us (__a);
+}
+__funline float16_t
+vcvth_f16_u64 (uint64_t __a)
+{
+  return __builtin_aarch64_floatunsdihf_us (__a);
+}
+__funline int16_t
+vcvth_s16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fix_trunchfhi (__a);
+}
+__funline int32_t
+vcvth_s32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fix_trunchfsi (__a);
+}
+__funline int64_t
+vcvth_s64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fix_trunchfdi (__a);
+}
+__funline uint16_t
+vcvth_u16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fixuns_trunchfhi_us (__a);
+}
+__funline uint32_t
+vcvth_u32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fixuns_trunchfsi_us (__a);
+}
+__funline uint64_t
+vcvth_u64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_fixuns_trunchfdi_us (__a);
+}
+__funline int16_t
+vcvtah_s16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lroundhfhi (__a);
+}
+__funline int32_t
+vcvtah_s32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lroundhfsi (__a);
+}
+__funline int64_t
+vcvtah_s64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lroundhfdi (__a);
+}
+__funline uint16_t
+vcvtah_u16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lrounduhfhi_us (__a);
+}
+__funline uint32_t
+vcvtah_u32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lrounduhfsi_us (__a);
+}
+__funline uint64_t
+vcvtah_u64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lrounduhfdi_us (__a);
+}
+__funline int16_t
+vcvtmh_s16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfloorhfhi (__a);
+}
+__funline int32_t
+vcvtmh_s32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfloorhfsi (__a);
+}
+__funline int64_t
+vcvtmh_s64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfloorhfdi (__a);
+}
+__funline uint16_t
+vcvtmh_u16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lflooruhfhi_us (__a);
+}
+__funline uint32_t
+vcvtmh_u32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lflooruhfsi_us (__a);
+}
+__funline uint64_t
+vcvtmh_u64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lflooruhfdi_us (__a);
+}
+__funline int16_t
+vcvtnh_s16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnhfhi (__a);
+}
+__funline int32_t
+vcvtnh_s32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnhfsi (__a);
+}
+__funline int64_t
+vcvtnh_s64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnhfdi (__a);
+}
+__funline uint16_t
+vcvtnh_u16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnuhfhi_us (__a);
+}
+__funline uint32_t
+vcvtnh_u32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnuhfsi_us (__a);
+}
+__funline uint64_t
+vcvtnh_u64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lfrintnuhfdi_us (__a);
+}
+__funline int16_t
+vcvtph_s16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceilhfhi (__a);
+}
+__funline int32_t
+vcvtph_s32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceilhfsi (__a);
+}
+__funline int64_t
+vcvtph_s64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceilhfdi (__a);
+}
+__funline uint16_t
+vcvtph_u16_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceiluhfhi_us (__a);
+}
+__funline uint32_t
+vcvtph_u32_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceiluhfsi_us (__a);
+}
+__funline uint64_t
+vcvtph_u64_f16 (float16_t __a)
+{
+  return __builtin_aarch64_lceiluhfdi_us (__a);
+}
+__funline float16_t
+vnegh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_neghf (__a);
+}
+__funline float16_t
+vrecpeh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_frecpehf (__a);
+}
+__funline float16_t
+vrecpxh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_frecpxhf (__a);
+}
+__funline float16_t
+vrndh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_btrunchf (__a);
+}
+__funline float16_t
+vrndah_f16 (float16_t __a)
+{
+  return __builtin_aarch64_roundhf (__a);
+}
+__funline float16_t
+vrndih_f16 (float16_t __a)
+{
+  return __builtin_aarch64_nearbyinthf (__a);
+}
+__funline float16_t
+vrndmh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_floorhf (__a);
+}
+__funline float16_t
+vrndnh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_frintnhf (__a);
+}
+__funline float16_t
+vrndph_f16 (float16_t __a)
+{
+  return __builtin_aarch64_ceilhf (__a);
+}
+__funline float16_t
+vrndxh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_rinthf (__a);
+}
+__funline float16_t
+vrsqrteh_f16 (float16_t __a)
+{
+  return __builtin_aarch64_rsqrtehf (__a);
+}
+__funline float16_t
+vsqrth_f16 (float16_t __a)
+{
+  return __builtin_aarch64_sqrthf (__a);
+}
+__funline float16_t
+vaddh_f16 (float16_t __a, float16_t __b)
+{
+  return __a + __b;
+}
+__funline float16_t
+vabdh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fabdhf (__a, __b);
+}
+__funline uint16_t
+vcageh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_facgehf_uss (__a, __b);
+}
+__funline uint16_t
+vcagth_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_facgthf_uss (__a, __b);
+}
+__funline uint16_t
+vcaleh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_faclehf_uss (__a, __b);
+}
+__funline uint16_t
+vcalth_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_faclthf_uss (__a, __b);
+}
+__funline uint16_t
+vceqh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_cmeqhf_uss (__a, __b);
+}
+__funline uint16_t
+vcgeh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_cmgehf_uss (__a, __b);
+}
+__funline uint16_t
+vcgth_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_cmgthf_uss (__a, __b);
+}
+__funline uint16_t
+vcleh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_cmlehf_uss (__a, __b);
+}
+__funline uint16_t
+vclth_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_cmlthf_uss (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_s16 (int16_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfhi (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_s32 (int32_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfsihf (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_s64 (int64_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfdihf (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_u16 (uint16_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfhi_sus (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_u32 (uint32_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfsihf_sus (__a, __b);
+}
+__funline float16_t
+vcvth_n_f16_u64 (uint64_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfdihf_sus (__a, __b);
+}
+__funline int16_t
+vcvth_n_s16_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzshf (__a, __b);
+}
+__funline int32_t
+vcvth_n_s32_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzshfsi (__a, __b);
+}
+__funline int64_t
+vcvth_n_s64_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzshfdi (__a, __b);
+}
+__funline uint16_t
+vcvth_n_u16_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuhf_uss (__a, __b);
+}
+__funline uint32_t
+vcvth_n_u32_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuhfsi_uss (__a, __b);
+}
+__funline uint64_t
+vcvth_n_u64_f16 (float16_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuhfdi_uss (__a, __b);
+}
+__funline float16_t
+vdivh_f16 (float16_t __a, float16_t __b)
+{
+  return __a / __b;
+}
+__funline float16_t
+vmaxh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fmaxhf (__a, __b);
+}
+__funline float16_t
+vmaxnmh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fmaxhf (__a, __b);
+}
+__funline float16_t
+vminh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fminhf (__a, __b);
+}
+__funline float16_t
+vminnmh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fminhf (__a, __b);
+}
+__funline float16_t
+vmulh_f16 (float16_t __a, float16_t __b)
+{
+  return __a * __b;
+}
+__funline float16_t
+vmulxh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_fmulxhf (__a, __b);
+}
+__funline float16_t
+vrecpsh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_frecpshf (__a, __b);
+}
+__funline float16_t
+vrsqrtsh_f16 (float16_t __a, float16_t __b)
+{
+  return __builtin_aarch64_rsqrtshf (__a, __b);
+}
+__funline float16_t
+vsubh_f16 (float16_t __a, float16_t __b)
+{
+  return __a - __b;
+}
+__funline float16_t
+vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
+{
+  return __builtin_aarch64_fmahf (__b, __c, __a);
+}
+__funline float16_t
+vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
+{
+  return __builtin_aarch64_fnmahf (__b, __c, __a);
+}
+#pragma GCC pop_options
+#endif
+#endif
--- a/third_party/aarch64/arm_neon.h
+++ b/third_party/aarch64/arm_neon.h
--- a/third_party/aarch64/arm_neon.internal.h
+++ b/third_party/aarch64/arm_neon.internal.h
--- a/third_party/aarch64/arm_sve.internal.h
+++ b/third_party/aarch64/arm_sve.internal.h
@ -0,0 +1,11 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _ARM_SVE_H_
+#define _ARM_SVE_H_
+#include "third_party/aarch64/arm_bf16.internal.h"
+typedef __fp16 float16_t;
+typedef float float32_t;
+typedef double float64_t;
+#pragma GCC aarch64 "third_party/aarch64/arm_sve.internal.h"
+#endif
+#endif
--- a/third_party/aarch64/openacc.internal.h
+++ b/third_party/aarch64/openacc.internal.h
@ -0,0 +1,112 @@
+/* clang-format off */
+#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
+#ifndef _OPENACC_H
+#define _OPENACC_H 1
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if __cplusplus >= 201103
+# define __GOACC_NOTHROW noexcept
+#elif __cplusplus
+# define __GOACC_NOTHROW throw ()
+#else
+# define __GOACC_NOTHROW __attribute__ ((__nothrow__))
+#endif
+typedef enum acc_device_t {
+  acc_device_current = -1,
+  acc_device_none = 0,
+  acc_device_default = 1,
+  acc_device_host = 2,
+  acc_device_not_host = 4,
+  acc_device_nvidia = 5,
+  acc_device_radeon = 8,
+  _ACC_device_hwm,
+  _ACC_highest = __INT_MAX__,
+  _ACC_neg = -1
+} acc_device_t;
+typedef enum acc_device_property_t {
+  acc_property_memory = 1,
+  acc_property_free_memory = 2,
+  acc_property_name = 0x10001,
+  acc_property_vendor = 0x10002,
+  acc_property_driver = 0x10003
+} acc_device_property_t;
+typedef enum acc_async_t {
+  acc_async_noval = -1,
+  acc_async_sync = -2
+} acc_async_t;
+int acc_get_num_devices (acc_device_t) __GOACC_NOTHROW;
+void acc_set_device_type (acc_device_t) __GOACC_NOTHROW;
+acc_device_t acc_get_device_type (void) __GOACC_NOTHROW;
+void acc_set_device_num (int, acc_device_t) __GOACC_NOTHROW;
+int acc_get_device_num (acc_device_t) __GOACC_NOTHROW;
+size_t acc_get_property
+  (int, acc_device_t, acc_device_property_t) __GOACC_NOTHROW;
+const char *acc_get_property_string
+  (int, acc_device_t, acc_device_property_t) __GOACC_NOTHROW;
+int acc_async_test (int) __GOACC_NOTHROW;
+int acc_async_test_all (void) __GOACC_NOTHROW;
+void acc_wait (int) __GOACC_NOTHROW;
+void acc_async_wait (int) __GOACC_NOTHROW;
+void acc_wait_async (int, int) __GOACC_NOTHROW;
+void acc_wait_all (void) __GOACC_NOTHROW;
+void acc_async_wait_all (void) __GOACC_NOTHROW;
+void acc_wait_all_async (int) __GOACC_NOTHROW;
+void acc_init (acc_device_t) __GOACC_NOTHROW;
+void acc_shutdown (acc_device_t) __GOACC_NOTHROW;
+#ifdef __cplusplus
+int acc_on_device (int __arg) __GOACC_NOTHROW;
+#else
+int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
+#endif
+void *acc_malloc (size_t) __GOACC_NOTHROW;
+void acc_free (void *) __GOACC_NOTHROW;
+void *acc_copyin (void *, size_t) __GOACC_NOTHROW;
+void *acc_present_or_copyin (void *, size_t) __GOACC_NOTHROW;
+void *acc_pcopyin (void *, size_t) __GOACC_NOTHROW;
+void *acc_create (void *, size_t) __GOACC_NOTHROW;
+void *acc_present_or_create (void *, size_t) __GOACC_NOTHROW;
+void *acc_pcreate (void *, size_t) __GOACC_NOTHROW;
+void acc_copyout (void *, size_t) __GOACC_NOTHROW;
+void acc_delete (void *, size_t) __GOACC_NOTHROW;
+void acc_update_device (void *, size_t) __GOACC_NOTHROW;
+void acc_update_self (void *, size_t) __GOACC_NOTHROW;
+void acc_map_data (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_unmap_data (void *) __GOACC_NOTHROW;
+void *acc_deviceptr (void *) __GOACC_NOTHROW;
+void *acc_hostptr (void *) __GOACC_NOTHROW;
+int acc_is_present (void *, size_t) __GOACC_NOTHROW;
+void acc_memcpy_to_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_memcpy_from_device (void *, void *, size_t) __GOACC_NOTHROW;
+void acc_attach (void **) __GOACC_NOTHROW;
+void acc_attach_async (void **, int) __GOACC_NOTHROW;
+void acc_detach (void **) __GOACC_NOTHROW;
+void acc_detach_async (void **, int) __GOACC_NOTHROW;
+void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW;
+void acc_copyout_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW;
+void acc_delete_finalize_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_detach_finalize (void **) __GOACC_NOTHROW;
+void acc_detach_finalize_async (void **, int) __GOACC_NOTHROW;
+void acc_copyin_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_create_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_copyout_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_delete_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_update_device_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_update_self_async (void *, size_t, int) __GOACC_NOTHROW;
+void acc_memcpy_to_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
+void acc_memcpy_from_device_async (void *, void *, size_t, int) __GOACC_NOTHROW;
+void *acc_get_current_cuda_device (void) __GOACC_NOTHROW;
+void *acc_get_current_cuda_context (void) __GOACC_NOTHROW;
+void *acc_get_cuda_stream (int) __GOACC_NOTHROW;
+int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
+#ifdef __cplusplus
+}
+#pragma acc routine seq
+inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
+{
+  return acc_on_device ((int) __arg);
+}
+#endif
+#endif
+#endif
--- a/third_party/aarch64/upgrade.sh
+++ b/third_party/aarch64/upgrade.sh
@ -0,0 +1,72 @@
+#!/bin/sh
+# /opt/aarch64o2/lib/gcc/aarch64-linux-musl/9.2.0/include
+# /opt/cross11portcosmo/lib/gcc/aarch64-linux-musl/11.2.0/include
+
+# IMPORTANT NOTES
+#
+# 1. You also need:
+#      #pragma GCC diagnostic ignored "-Wmissing-braces"
+#    In third_party/aarch64/arm_neon.internal.h
+#
+# 2. You have to rewrite arm_fp16 to use `__funline`.
+#
+# 3. You should fix up the `#pragma GCC aarch64` things.
+#
+
+s=/opt/cross11portcosmo/lib/gcc/aarch64-linux-musl/11.2.0/include
+d=third_party/aarch64
+
+FILES='
+arm_acle
+arm_fp16
+arm_neon
+acc_prof
+arm_bf16
+arm_sve
+acc_prof
+openacc
+'
+
+strip_c_comments() {
+  # https://stackoverflow.com/a/13062682/1653720
+  [ $# -eq 2 ] && arg="$1" || arg=""
+  eval file="\$$#"
+  sed 's/a/aA/g; s/__/aB/g; s/#/aC/g' "$file" |
+    gcc -P -E $arg - |
+    sed 's/aC/#/g; s/aB/__/g; s/aA/a/g'
+}
+
+rm -f third_party/aarch64/*.h
+
+for f in $FILES; do
+  echo cp $s/$f.h $d/$f.internal.h
+  cp $s/$f.h $d/$f.internal.h || exit
+done
+
+sed -i \
+    -e 's/#  *include/#include/' \
+    -e '/#include .std/d' \
+    -e 's!#include [<"]!#include "third_party/aarch64/!' \
+    -e 's!\.h[>"]$!.internal.h"!' \
+    third_party/aarch64/*.h
+
+# solve the pedantic gcc linter warning `'vmulxh_f16' is static but used
+# in inline function 'vmulxh_laneq_f16' which is not static [-Werror]`
+sed -i \
+    -e 's/static/extern/g' \
+    third_party/aarch64/arm_fp16.internal.h
+
+for f in third_party/aarch64/*.h; do
+  strip_c_comments $f >$f.tmp || exit
+  mv $f.tmp $f
+done
+
+for f in third_party/aarch64/*.h; do
+  (
+    printf %s\\n '/* clang-format off */'
+    printf %s\\n '#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)'
+    cat $f
+    printf %s\\n '#endif'
+  ) >$f.tmp
+  mv $f.tmp $f
+done
--- a/third_party/gcc/aarch64-linux-musl/bin/ld.bfd.gz
+++ b/third_party/gcc/aarch64-linux-musl/bin/ld.bfd.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-addr2line.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-addr2line.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-ar.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-ar.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-as.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-as.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-c++filt.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-c++filt.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-cpp.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-cpp.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-elfedit.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-elfedit.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-g++.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-g++.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcc-ar.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcc-ar.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcc-nm.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcc-nm.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcc-ranlib.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcc-ranlib.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcc.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcc.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcov-dump.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcov-dump.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcov-tool.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcov-tool.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gcov.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gcov.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-gprof.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-gprof.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-nm.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-nm.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-objcopy.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-objcopy.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-objdump.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-objdump.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-ranlib.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-ranlib.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-readelf.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-readelf.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-size.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-size.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-strings.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-strings.gz
--- a/third_party/gcc/bin/aarch64-linux-musl-strip.gz
+++ b/third_party/gcc/bin/aarch64-linux-musl-strip.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/as.sym
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/as.sym
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/cc1.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/cc1.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/cc1plus.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/cc1plus.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/collect-ld.sym
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/collect-ld.sym
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/collect2.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/collect2.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/ld.sym
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/11.2.0/ld.sym
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/cc1.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/cc1.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/cc1plus.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/cc1plus.gz
--- a/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/collect2.gz
+++ b/third_party/gcc/libexec/gcc/aarch64-linux-musl/9.2.0/collect2.gz
--- a/third_party/gcc/upgrade-cosmo-gcc.sh
+++ b/third_party/gcc/upgrade-cosmo-gcc.sh
@ -1,6 +1,6 @@
 #!/bin/sh

-ARCH=${1:-x86_64}
+ARCH=${1:-aarch64}
 IMPORT=${2:-/opt/cross11portcosmo}
 PREFIX=third_party/gcc/
 OLDVERSION=9.2.0
--- a/third_party/ggml/ggjt.v1.q4_0.c
+++ b/third_party/ggml/ggjt.v1.q4_0.c
@ -29,7 +29,7 @@
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q8_0.h"
 #include "third_party/intel/immintrin.internal.h"
--- a/third_party/ggml/ggjt.v1.q4_1.c
+++ b/third_party/ggml/ggjt.v1.q4_1.c
@ -29,7 +29,7 @@
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q4_1.h"
 #include "third_party/ggml/ggjt.v1.q8_1.h"
--- a/third_party/ggml/ggjt.v1.q4_2.c
+++ b/third_party/ggml/ggjt.v1.q4_2.c
@ -29,7 +29,7 @@
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/fp16.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q8_0.h"
--- a/third_party/ggml/ggjt.v1.q5_0.c
+++ b/third_party/ggml/ggjt.v1.q5_0.c
@ -28,7 +28,7 @@
 #include "third_party/ggml/ggjt.v1.q5_0.h"
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/fp16.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q8_0.h"
--- a/third_party/ggml/ggjt.v1.q5_1.c
+++ b/third_party/ggml/ggjt.v1.q5_1.c
@ -20,7 +20,7 @@
 #include "libc/assert.h"
 #include "libc/math.h"
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/fp16.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q8_1.h"
--- a/third_party/ggml/ggjt.v1.q8_0.c
+++ b/third_party/ggml/ggjt.v1.q8_0.c
@ -19,7 +19,7 @@
 #include "third_party/ggml/ggjt.v1.q8_0.h"
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/ggjt.v1.internal.h"
 #include "third_party/ggml/ggjt.v1.q8_0.h"
 #include "third_party/intel/immintrin.internal.h"
--- a/third_party/ggml/ggjt.v1.q8_1.c
+++ b/third_party/ggml/ggjt.v1.q8_1.c
@ -19,7 +19,7 @@
 #include "third_party/ggml/ggjt.v1.q8_1.h"
 #include "libc/assert.h"
 #include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/intel/immintrin.internal.h"
 #include "third_party/libcxx/math.h"
--- a/third_party/ggml/ggjt.v2.internal.h
+++ b/third_party/ggml/ggjt.v2.internal.h
@ -1,7 +1,7 @@
 #ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
 #define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
 #include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/intel/immintrin.internal.h"
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_
--- a/third_party/ggml/ggml.c
+++ b/third_party/ggml/ggml.c
@ -39,7 +39,7 @@
 #include "libc/sysv/consts/clock.h"
 #include "libc/thread/thread.h"
 #include "libc/time/time.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/intel/immintrin.internal.h"
 #include "libc/macros.internal.h"
 #include "libc/tinymath/magicu.h"
--- a/third_party/ggml/ggml.mk
+++ b/third_party/ggml/ggml.mk
@ -192,6 +192,8 @@ o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private		\
 		ZIPOBJ_FLAGS +=						\
 			-B

+o/$(MODE)/third_party/ggml/ggml.o: private QUOTA = -C64
+
 ################################################################################

 THIRD_PARTY_GGML_COMS =							\
--- a/third_party/stb/stb_image.c
+++ b/third_party/stb/stb_image.c
@ -32,7 +32,7 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/x/x.h"
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 #include "third_party/intel/ammintrin.internal.h"

 asm(".ident\t\"\\n\\n\
--- a/third_party/zlib/adler32_simd.c
+++ b/third_party/zlib/adler32_simd.c
@ -203,7 +203,7 @@ uint32_t ZLIB_INTERNAL adler32_simd_(  /* SSSE3 */

 #elif defined(ADLER32_SIMD_NEON)

-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"

 uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
    uint32_t adler,
--- a/third_party/zlib/chunkcopy.inc
+++ b/third_party/zlib/chunkcopy.inc
@ -28,7 +28,7 @@
 #endif

 #if defined(INFLATE_CHUNK_SIMD_NEON)
-#include "third_party/aarch64/arm_neon.h"
+#include "third_party/aarch64/arm_neon.internal.h"
 typedef uint8x16_t z_vec128i_t;
 #elif defined(INFLATE_CHUNK_SIMD_SSE2)
 #include "third_party/intel/emmintrin.internal.h"
--- a/third_party/zlib/crc32_simd.c
+++ b/third_party/zlib/crc32_simd.c
@ -10,6 +10,7 @@ Chromium (BSD-3 License)\\n\
 Copyright 2017 The Chromium Authors\"");
 // clang-format off

+#include "third_party/intel/x86gprintrin.internal.h"
 #include "third_party/zlib/crc32_simd.internal.h"
 #if defined(CRC32_SIMD_AVX512_PCLMUL)

@ -358,8 +359,8 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
 /* We need some extra types for using PMULL.
 */
 #if defined(__aarch64__)
-#include "third_party/aarch64/arm_neon.h"
-#include "third_party/aarch64/arm_acle.h"
+#include "third_party/aarch64/arm_neon.internal.h"
+#include "third_party/aarch64/arm_acle.internal.h"
 #endif

 /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
@ -400,8 +401,8 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
 /* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
-#include "third_party/aarch64/arm_neon.h"
-#include "third_party/aarch64/arm_acle.h"
+#include "third_party/aarch64/arm_neon.internal.h"
+#include "third_party/aarch64/arm_acle.internal.h"
 #define TARGET_ARMV8_WITH_CRC
 #else  // !defined(__GNUC__) && !defined(_aarch64__)
 #error ARM CRC32 SIMD extensions only supported for Clang and GCC
--- a/third_party/zlib/slide_hash_simd.inc
+++ b/third_party/zlib/slide_hash_simd.inc
@ -41,7 +41,7 @@ typedef __m128i z_vec128i_u16x8_t;

 #elif defined(DEFLATE_SLIDE_HASH_NEON)

-#include "third_party/aarch64/arm_neon.h"  /* NEON */
+#include "third_party/aarch64/arm_neon.internal.h"  /* NEON */

 #define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))