mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 11:37:35 +00:00
c9152b6f14
This change switches c++ exception handling from sjlj to standard dwarf. It's needed because clang for aarch64 doesn't support sjlj. It turns out that libunwind had a bare-metal configuration that made this easy to do. This change gets the new experimental cosmocc -mclang flag in a state of working so well that it can now be used to build all of llamafile and it goes 3x faster in terms of build latency, without trading away any perf. The int_fast16_t and int_fast32_t types are now always defined as 32-bit in the interest of having more abi consistency between cosmocc -mgcc and -mclang mode.
162 lines
5.4 KiB
C
162 lines
5.4 KiB
C
/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
|
|
*
|
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
* See https://llvm.org/LICENSE.txt for license information.
|
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
*
|
|
*===-----------------------------------------------------------------------===
|
|
*/
|
|
|
|
/* Reject direct inclusion: compilation stops here unless __IMMINTRIN_H is
 * already defined at the point this header is read. */
#ifndef __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif
|
|
|
|
#ifndef __F16CINTRIN_H
|
|
#define __F16CINTRIN_H
|
|
|
|
/* Default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS128 marks a function __always_inline__ and __nodebug__,
 * selects the "f16c" target feature for it, and declares a minimum vector
 * width of 128. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"),           \
                 __min_vector_width__(128)))
|
|
/* __DEFAULT_FN_ATTRS256 marks a function __always_inline__ and __nodebug__,
 * selects the "f16c" target feature for it, and declares a minimum vector
 * width of 256. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("f16c"),           \
                 __min_vector_width__(256)))
|
|
|
|
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
 * but that's because icc can emulate these without f16c using a library call.
 * Since we don't do that, let's leave these in f16cintrin.h.
 */
|
|
|
|
/// Converts a 16-bit half-precision float value into a 32-bit float
|
|
/// value.
|
|
///
|
|
/// \headerfile <x86intrin.h>
|
|
///
|
|
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
|
///
|
|
/// \param __a
|
|
/// A 16-bit half-precision float value.
|
|
/// \returns The converted 32-bit float value.
|
|
static __inline float __DEFAULT_FN_ATTRS128
|
|
_cvtsh_ss(unsigned short __a)
|
|
{
|
|
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
|
|
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
|
|
return __r[0];
|
|
}
|
|
|
|
/// Narrows a single 32-bit single-precision float value to a 16-bit
///    half-precision float value.
///
/// \headerfile <immintrin.h>
///
/// \code
/// unsigned short _cvtss_sh(float a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 32-bit single-precision float value to be converted to a 16-bit
///    half-precision float value.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
///
/// The expansion builds a 4-element float vector with \a a in element 0 and
/// zeros elsewhere, converts it, and yields element 0 of the result cast to
/// unsigned short.
#define _cvtss_sh(a, imm)                                                      \
  __extension__({                                                              \
    (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph(                        \
        (__v4sf){a, 0, 0, 0}, (imm)))[0]);                                     \
  })
|
|
|
|
/// Narrows the 32-bit float values of a 128-bit vector to 16-bit
///    half-precision float values, returned in a 128-bit vector.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 128-bit vector containing 32-bit float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing converted 16-bit half-precision float
///    values. The lower 64 bits are used to store the converted 16-bit
///    half-precision floating-point values.
#define _mm_cvtps_ph(a, imm)                                                   \
  ((__m128i)__builtin_ia32_vcvtps2ph(                                          \
      (__v4sf)(__m128)(a),                                                     \
      (imm)))
|
|
|
|
/// Converts a 128-bit vector containing 16-bit half-precision float
|
|
/// values into a 128-bit vector containing 32-bit float values.
|
|
///
|
|
/// \headerfile <x86intrin.h>
|
|
///
|
|
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
|
///
|
|
/// \param __a
|
|
/// A 128-bit vector containing 16-bit half-precision float values. The lower
|
|
/// 64 bits are used in the conversion.
|
|
/// \returns A 128-bit vector of [4 x float] containing converted float values.
|
|
static __inline __m128 __DEFAULT_FN_ATTRS128
|
|
_mm_cvtph_ps(__m128i __a)
|
|
{
|
|
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
|
|
}
|
|
|
|
/// Narrows a 256-bit vector of [8 x float] to 16-bit half-precision float
///    values, returned in a 128-bit vector.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
///    A 256-bit vector containing 32-bit single-precision float values to be
///    converted to 16-bit half-precision float values.
/// \param imm
///    An immediate value controlling rounding using bits [2:0]: \n
///    000: Nearest \n
///    001: Down \n
///    010: Up \n
///    011: Truncate \n
///    1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
///    float values.
#define _mm256_cvtps_ph(a, imm)                                                \
  ((__m128i)__builtin_ia32_vcvtps2ph256(                                       \
      (__v8sf)(__m256)(a),                                                     \
      (imm)))
|
|
|
|
/// Converts a 128-bit vector containing 16-bit half-precision float
|
|
/// values into a 256-bit vector of [8 x float].
|
|
///
|
|
/// \headerfile <x86intrin.h>
|
|
///
|
|
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
|
|
///
|
|
/// \param __a
|
|
/// A 128-bit vector containing 16-bit half-precision float values to be
|
|
/// converted to 32-bit single-precision float values.
|
|
/// \returns A vector of [8 x float] containing the converted 32-bit
|
|
/// single-precision float values.
|
|
static __inline __m256 __DEFAULT_FN_ATTRS256
|
|
_mm256_cvtph_ps(__m128i __a)
|
|
{
|
|
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
|
|
}
|
|
|
|
/* The attribute helper macros are local to this header; remove them so they
 * are not visible after it. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
|
|
|
|
#endif /* __F16CINTRIN_H */
|