cosmopolitan/third_party/aarch64/arm_fp16.internal.h
Justine Tunney 957c61cbbf
Release Cosmopolitan v3.3
This change upgrades to GCC 12.3 and GNU binutils 2.42. The GNU linker
appears to have changed things so that only a single de-duplicated str
table is present in the binary, and it gets placed wherever the linker
wants, regardless of what the linker script says. To cope with that we
need to stop using .ident to embed licenses. As such, this change does
significant work to revamp how third party licenses are defined in the
codebase, using `.section .notice,"aR",@progbits`.

This new GCC 12.3 toolchain has support for GNU indirect functions. It
lets us support __target_clones__ for the first time. This is used for
optimizing the performance of libc string functions such as strlen and
friends so far on x86, by ensuring AVX systems favor a second codepath
that uses VEX encoding. It shaves some latency off certain operations.
It's a useful feature to have for scientific computing for the reasons
explained by the test/libcxx/openmp_test.cc example which compiles for
fifteen different microarchitectures. Thanks to the upgrades, it's now
also possible to use newer instruction sets, such as AVX512FP16, VNNI.

Cosmo now uses the %gs register on x86 by default for TLS. Doing it is
helpful for any program that links `cosmo_dlopen()`. Such programs had
to recompile their binaries at startup to change the TLS instructions.
That's not great, since it means every page in the executable needs to
be faulted. The work of rewriting TLS-related x86 opcodes is moved to
fixupobj.com instead. This is great news for MacOS x86 users, since we
previously needed to morph the binary every time for that platform but
now that's no longer necessary. The only platforms where we need fixup
of TLS x86 opcodes at runtime are now Windows, OpenBSD, and NetBSD. On
Windows we morph TLS to point deeper into the TIB, based on a TlsAlloc
assignment, and on OpenBSD/NetBSD we morph %gs back into %fs since the
kernels do not allow us to specify a value for the %gs register.

OpenBSD users are now required to use APE Loader to run Cosmo binaries
and assimilation is no longer possible. OpenBSD kernel needs to change
to allow programs to specify a value for the %gs register, or it needs
to stop marking executable pages loaded by the kernel as mimmutable().

This release fixes __constructor__, .ctor, .init_array, and lastly the
.preinit_array so they behave the exact same way as glibc.

We no longer use hex constants to define math.h symbols like M_PI.
2024-02-20 13:27:59 -08:00

543 lines
17 KiB
C

/* Only expose this header to AArch64 builds.  The sum in the second operand
   makes the guard false whenever __ASSEMBLER__ or __LINKER__ expands to a
   non-zero value; macros that are not defined evaluate to 0 inside #if.  */
#if defined(__aarch64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
#ifndef _AARCH64_FP16_H_
#define _AARCH64_FP16_H_
/* Save the active target options, then switch to armv8.2-a+fp16 for the
   definitions that follow (undone by the pop_options near the end).  */
#pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+fp16")
/* float16_t is an alias for the compiler-provided __fp16 type.  */
typedef __fp16 float16_t;
/* Passes __a to the abshf builtin and hands back its float16_t result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabsh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_abshf (__a);
  return __res;
}
/* Runs the cmeqhf_uss builtin on __a and the literal 0.0f; the uint16_t
   it yields is returned untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vceqzh_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_cmeqhf_uss (__a, 0.0f);
  return __res;
}
/* Runs the cmgehf_uss builtin on __a and the literal 0.0f; the uint16_t
   it yields is returned untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcgezh_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_cmgehf_uss (__a, 0.0f);
  return __res;
}
/* Runs the cmgthf_uss builtin on __a and the literal 0.0f; the uint16_t
   it yields is returned untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcgtzh_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_cmgthf_uss (__a, 0.0f);
  return __res;
}
/* Runs the cmlehf_uss builtin on __a and the literal 0.0f; the uint16_t
   it yields is returned untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vclezh_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_cmlehf_uss (__a, 0.0f);
  return __res;
}
/* Runs the cmlthf_uss builtin on __a and the literal 0.0f; the uint16_t
   it yields is returned untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcltzh_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_cmlthf_uss (__a, 0.0f);
  return __res;
}
/* Feeds the int16_t __a to the floathihf builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_s16 (int16_t __a)
{
  const float16_t __res = __builtin_aarch64_floathihf (__a);
  return __res;
}
/* Feeds the int32_t __a to the floatsihf builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_s32 (int32_t __a)
{
  const float16_t __res = __builtin_aarch64_floatsihf (__a);
  return __res;
}
/* Feeds the int64_t __a to the floatdihf builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_s64 (int64_t __a)
{
  const float16_t __res = __builtin_aarch64_floatdihf (__a);
  return __res;
}
/* Feeds the uint16_t __a to the floatunshihf_us builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_u16 (uint16_t __a)
{
  const float16_t __res = __builtin_aarch64_floatunshihf_us (__a);
  return __res;
}
/* Feeds the uint32_t __a to the floatunssihf_us builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_u32 (uint32_t __a)
{
  const float16_t __res = __builtin_aarch64_floatunssihf_us (__a);
  return __res;
}
/* Feeds the uint64_t __a to the floatunsdihf_us builtin and returns the
   float16_t it produces.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_f16_u64 (uint64_t __a)
{
  const float16_t __res = __builtin_aarch64_floatunsdihf_us (__a);
  return __res;
}
/* Hands __a to the fix_trunchfhi builtin and returns its int16_t
   result.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_s16_f16 (float16_t __a)
{
  const int16_t __res = __builtin_aarch64_fix_trunchfhi (__a);
  return __res;
}
/* Hands __a to the fix_trunchfsi builtin and returns its int32_t
   result.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_s32_f16 (float16_t __a)
{
  const int32_t __res = __builtin_aarch64_fix_trunchfsi (__a);
  return __res;
}
/* Hands __a to the fix_trunchfdi builtin and returns its int64_t
   result.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_s64_f16 (float16_t __a)
{
  const int64_t __res = __builtin_aarch64_fix_trunchfdi (__a);
  return __res;
}
/* Hands __a to the fixuns_trunchfhi_us builtin and returns its uint16_t
   result.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_u16_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_fixuns_trunchfhi_us (__a);
  return __res;
}
/* Hands __a to the fixuns_trunchfsi_us builtin and returns its uint32_t
   result.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_u32_f16 (float16_t __a)
{
  const uint32_t __res = __builtin_aarch64_fixuns_trunchfsi_us (__a);
  return __res;
}
/* Hands __a to the fixuns_trunchfdi_us builtin and returns its uint64_t
   result.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_u64_f16 (float16_t __a)
{
  const uint64_t __res = __builtin_aarch64_fixuns_trunchfdi_us (__a);
  return __res;
}
/* Hands __a to the lroundhfhi builtin and returns its int16_t result.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_s16_f16 (float16_t __a)
{
  const int16_t __res = __builtin_aarch64_lroundhfhi (__a);
  return __res;
}
/* Hands __a to the lroundhfsi builtin and returns its int32_t result.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_s32_f16 (float16_t __a)
{
  const int32_t __res = __builtin_aarch64_lroundhfsi (__a);
  return __res;
}
/* Hands __a to the lroundhfdi builtin and returns its int64_t result.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_s64_f16 (float16_t __a)
{
  const int64_t __res = __builtin_aarch64_lroundhfdi (__a);
  return __res;
}
/* Hands __a to the lrounduhfhi_us builtin and returns its uint16_t
   result.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_u16_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_lrounduhfhi_us (__a);
  return __res;
}
/* Hands __a to the lrounduhfsi_us builtin and returns its uint32_t
   result.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_u32_f16 (float16_t __a)
{
  const uint32_t __res = __builtin_aarch64_lrounduhfsi_us (__a);
  return __res;
}
/* Hands __a to the lrounduhfdi_us builtin and returns its uint64_t
   result.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtah_u64_f16 (float16_t __a)
{
  const uint64_t __res = __builtin_aarch64_lrounduhfdi_us (__a);
  return __res;
}
/* Hands __a to the lfloorhfhi builtin and returns its int16_t result.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_s16_f16 (float16_t __a)
{
  const int16_t __res = __builtin_aarch64_lfloorhfhi (__a);
  return __res;
}
/* Hands __a to the lfloorhfsi builtin and returns its int32_t result.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_s32_f16 (float16_t __a)
{
  const int32_t __res = __builtin_aarch64_lfloorhfsi (__a);
  return __res;
}
/* Hands __a to the lfloorhfdi builtin and returns its int64_t result.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_s64_f16 (float16_t __a)
{
  const int64_t __res = __builtin_aarch64_lfloorhfdi (__a);
  return __res;
}
/* Hands __a to the lflooruhfhi_us builtin and returns its uint16_t
   result.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_u16_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_lflooruhfhi_us (__a);
  return __res;
}
/* Hands __a to the lflooruhfsi_us builtin and returns its uint32_t
   result.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_u32_f16 (float16_t __a)
{
  const uint32_t __res = __builtin_aarch64_lflooruhfsi_us (__a);
  return __res;
}
/* Hands __a to the lflooruhfdi_us builtin and returns its uint64_t
   result.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtmh_u64_f16 (float16_t __a)
{
  const uint64_t __res = __builtin_aarch64_lflooruhfdi_us (__a);
  return __res;
}
/* Hands __a to the lfrintnhfhi builtin and returns its int16_t result.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_s16_f16 (float16_t __a)
{
  const int16_t __res = __builtin_aarch64_lfrintnhfhi (__a);
  return __res;
}
/* Hands __a to the lfrintnhfsi builtin and returns its int32_t result.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_s32_f16 (float16_t __a)
{
  const int32_t __res = __builtin_aarch64_lfrintnhfsi (__a);
  return __res;
}
/* Hands __a to the lfrintnhfdi builtin and returns its int64_t result.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_s64_f16 (float16_t __a)
{
  const int64_t __res = __builtin_aarch64_lfrintnhfdi (__a);
  return __res;
}
/* Hands __a to the lfrintnuhfhi_us builtin and returns its uint16_t
   result.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_u16_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_lfrintnuhfhi_us (__a);
  return __res;
}
/* Hands __a to the lfrintnuhfsi_us builtin and returns its uint32_t
   result.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_u32_f16 (float16_t __a)
{
  const uint32_t __res = __builtin_aarch64_lfrintnuhfsi_us (__a);
  return __res;
}
/* Hands __a to the lfrintnuhfdi_us builtin and returns its uint64_t
   result.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtnh_u64_f16 (float16_t __a)
{
  const uint64_t __res = __builtin_aarch64_lfrintnuhfdi_us (__a);
  return __res;
}
/* Hands __a to the lceilhfhi builtin and returns its int16_t result.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_s16_f16 (float16_t __a)
{
  const int16_t __res = __builtin_aarch64_lceilhfhi (__a);
  return __res;
}
/* Hands __a to the lceilhfsi builtin and returns its int32_t result.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_s32_f16 (float16_t __a)
{
  const int32_t __res = __builtin_aarch64_lceilhfsi (__a);
  return __res;
}
/* Hands __a to the lceilhfdi builtin and returns its int64_t result.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_s64_f16 (float16_t __a)
{
  const int64_t __res = __builtin_aarch64_lceilhfdi (__a);
  return __res;
}
/* Hands __a to the lceiluhfhi_us builtin and returns its uint16_t
   result.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_u16_f16 (float16_t __a)
{
  const uint16_t __res = __builtin_aarch64_lceiluhfhi_us (__a);
  return __res;
}
/* Hands __a to the lceiluhfsi_us builtin and returns its uint32_t
   result.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_u32_f16 (float16_t __a)
{
  const uint32_t __res = __builtin_aarch64_lceiluhfsi_us (__a);
  return __res;
}
/* Hands __a to the lceiluhfdi_us builtin and returns its uint64_t
   result.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvtph_u64_f16 (float16_t __a)
{
  const uint64_t __res = __builtin_aarch64_lceiluhfdi_us (__a);
  return __res;
}
/* Passes __a to the neghf builtin and hands back its float16_t result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vnegh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_neghf (__a);
  return __res;
}
/* Passes __a to the frecpehf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrecpeh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_frecpehf (__a);
  return __res;
}
/* Passes __a to the frecpxhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrecpxh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_frecpxhf (__a);
  return __res;
}
/* Passes __a to the btrunchf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_btrunchf (__a);
  return __res;
}
/* Passes __a to the roundhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndah_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_roundhf (__a);
  return __res;
}
/* Passes __a to the nearbyinthf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndih_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_nearbyinthf (__a);
  return __res;
}
/* Passes __a to the floorhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndmh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_floorhf (__a);
  return __res;
}
/* Passes __a to the roundevenhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndnh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_roundevenhf (__a);
  return __res;
}
/* Passes __a to the ceilhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndph_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_ceilhf (__a);
  return __res;
}
/* Passes __a to the rinthf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndxh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_rinthf (__a);
  return __res;
}
/* Passes __a to the rsqrtehf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrsqrteh_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_rsqrtehf (__a);
  return __res;
}
/* Passes __a to the sqrthf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsqrth_f16 (float16_t __a)
{
  const float16_t __res = __builtin_aarch64_sqrthf (__a);
  return __res;
}
/* Returns __a + __b, written with the plain C operator rather than a
   builtin; the sum is stored as float16_t before being returned.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaddh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __sum = __a + __b;
  return __sum;
}
/* Passes (__a, __b) to the fabdhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_fabdhf (__a, __b);
  return __res;
}
/* Runs the facgehf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcageh_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_facgehf_uss (__a, __b);
  return __res;
}
/* Runs the facgthf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagth_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_facgthf_uss (__a, __b);
  return __res;
}
/* Runs the faclehf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaleh_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_faclehf_uss (__a, __b);
  return __res;
}
/* Runs the faclthf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcalth_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_faclthf_uss (__a, __b);
  return __res;
}
/* Runs the cmeqhf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vceqh_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_cmeqhf_uss (__a, __b);
  return __res;
}
/* Runs the cmgehf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcgeh_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_cmgehf_uss (__a, __b);
  return __res;
}
/* Runs the cmgthf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcgth_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_cmgthf_uss (__a, __b);
  return __res;
}
/* Runs the cmlehf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcleh_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_cmlehf_uss (__a, __b);
  return __res;
}
/* Runs the cmlthf_uss builtin on (__a, __b) and returns its uint16_t
   result untouched.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vclth_f16 (float16_t __a, float16_t __b)
{
  const uint16_t __res = __builtin_aarch64_cmlthf_uss (__a, __b);
  return __res;
}
/* Feeds the int16_t __a and the int __b to the scvtfhi builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_s16 (int16_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_scvtfhi (__a, __b);
  return __res;
}
/* Feeds the int32_t __a and the int __b to the scvtfsihf builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_s32 (int32_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_scvtfsihf (__a, __b);
  return __res;
}
/* Feeds the int64_t __a and the int __b to the scvtfdihf builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_s64 (int64_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_scvtfdihf (__a, __b);
  return __res;
}
/* Feeds the uint16_t __a and the int __b to the ucvtfhi_sus builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_u16 (uint16_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_ucvtfhi_sus (__a, __b);
  return __res;
}
/* Feeds the uint32_t __a and the int __b to the ucvtfsihf_sus builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_u32 (uint32_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_ucvtfsihf_sus (__a, __b);
  return __res;
}
/* Feeds the uint64_t __a and the int __b to the ucvtfdihf_sus builtin and
   returns the float16_t it produces.  __b is forwarded directly, not
   through a temporary.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_f16_u64 (uint64_t __a, const int __b)
{
  const float16_t __res = __builtin_aarch64_ucvtfdihf_sus (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzshf builtin and returns its
   int16_t result.  __b is forwarded directly, not through a temporary.  */
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_s16_f16 (float16_t __a, const int __b)
{
  const int16_t __res = __builtin_aarch64_fcvtzshf (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzshfsi builtin and returns its
   int32_t result.  __b is forwarded directly, not through a temporary.  */
__extension__ extern __inline int32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_s32_f16 (float16_t __a, const int __b)
{
  const int32_t __res = __builtin_aarch64_fcvtzshfsi (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzshfdi builtin and returns its
   int64_t result.  __b is forwarded directly, not through a temporary.  */
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_s64_f16 (float16_t __a, const int __b)
{
  const int64_t __res = __builtin_aarch64_fcvtzshfdi (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzuhf_uss builtin and returns its
   uint16_t result.  __b is forwarded directly, not through a
   temporary.  */
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_u16_f16 (float16_t __a, const int __b)
{
  const uint16_t __res = __builtin_aarch64_fcvtzuhf_uss (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzuhfsi_uss builtin and returns its
   uint32_t result.  __b is forwarded directly, not through a
   temporary.  */
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_u32_f16 (float16_t __a, const int __b)
{
  const uint32_t __res = __builtin_aarch64_fcvtzuhfsi_uss (__a, __b);
  return __res;
}
/* Hands __a and the int __b to the fcvtzuhfdi_uss builtin and returns its
   uint64_t result.  __b is forwarded directly, not through a
   temporary.  */
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvth_n_u64_f16 (float16_t __a, const int __b)
{
  const uint64_t __res = __builtin_aarch64_fcvtzuhfdi_uss (__a, __b);
  return __res;
}
/* Returns __a / __b, written with the plain C operator rather than a
   builtin; the quotient is stored as float16_t before being returned.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdivh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __quot = __a / __b;
  return __quot;
}
/* Returns __builtin_aarch64_fmaxhf (__a, __b).
   NOTE(review): vmaxnmh_f16 in this header calls the very same builtin, so
   the two intrinsics behave identically here.  ACLE appears to map them to
   different instructions (FMAX vs FMAXNM, which differ in NaN handling) --
   confirm against the toolchain before relying on NaN propagation.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmaxh_f16 (float16_t __a, float16_t __b)
{
return __builtin_aarch64_fmaxhf (__a, __b);
}
/* Passes (__a, __b) to the fmaxhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmaxnmh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_fmaxhf (__a, __b);
  return __res;
}
/* Returns __builtin_aarch64_fminhf (__a, __b).
   NOTE(review): vminnmh_f16 in this header calls the very same builtin, so
   the two intrinsics behave identically here.  ACLE appears to map them to
   different instructions (FMIN vs FMINNM, which differ in NaN handling) --
   confirm against the toolchain before relying on NaN propagation.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vminh_f16 (float16_t __a, float16_t __b)
{
return __builtin_aarch64_fminhf (__a, __b);
}
/* Passes (__a, __b) to the fminhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vminnmh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_fminhf (__a, __b);
  return __res;
}
/* Returns __a * __b, written with the plain C operator rather than a
   builtin; the product is stored as float16_t before being returned.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __prod = __a * __b;
  return __prod;
}
/* Passes (__a, __b) to the fmulxhf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulxh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_fmulxhf (__a, __b);
  return __res;
}
/* Passes (__a, __b) to the frecpshf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrecpsh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_frecpshf (__a, __b);
  return __res;
}
/* Passes (__a, __b) to the rsqrtshf builtin and hands back its float16_t
   result.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrsqrtsh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __res = __builtin_aarch64_rsqrtshf (__a, __b);
  return __res;
}
/* Returns __a - __b, written with the plain C operator rather than a
   builtin; the difference is stored as float16_t before being returned.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsubh_f16 (float16_t __a, float16_t __b)
{
  const float16_t __diff = __a - __b;
  return __diff;
}
/* Calls the fmahf builtin and returns its float16_t result.  Note the
   argument rotation: the builtin receives (__b, __c, __a), i.e. this
   function's first parameter is passed last.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmah_f16 (float16_t __a, float16_t __b, float16_t __c)
{
  const float16_t __res = __builtin_aarch64_fmahf (__b, __c, __a);
  return __res;
}
/* Calls the fnmahf builtin and returns its float16_t result.  Note the
   argument rotation: the builtin receives (__b, __c, __a), i.e. this
   function's first parameter is passed last.  */
__extension__ extern __inline float16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfmsh_f16 (float16_t __a, float16_t __b, float16_t __c)
{
  const float16_t __res = __builtin_aarch64_fnmahf (__b, __c, __a);
  return __res;
}
/* Restore the target options that were saved by the push_options at the
   top of this header.  */
#pragma GCC pop_options
#endif /* _AARCH64_FP16_H_ */
#endif /* __aarch64__ && !(__ASSEMBLER__ + __LINKER__ + 0) */