#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ #define COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ #include "libc/literal.h" #include "libc/str/str.h" #include "third_party/ggml/fp16.h" #include "third_party/intel/immintrin.internal.h" #include "third_party/libcxx/math.h" #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ #define GGML_GELU_FP16 #define GGML_SILU_FP16 extern ggml_fp16_t table_gelu_f16[1 << 16]; extern ggml_fp16_t table_silu_f16[1 << 16]; extern ggml_fp16_t table_exp_f16[1 << 16]; extern float table_f32_f16[1 << 16]; #if defined(__ARM_NEON) || defined(__wasm_simd128__) extern const uint64_t table_b2b_0[1 << 8]; extern const uint64_t table_b2b_1[1 << 8]; #endif inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t* y, const ggml_fp16_t* x) { const uint16_t* i16 = (const uint16_t*)x; for (int i = 0; i < n; ++i) { y[i] = table_gelu_f16[i16[i]]; } } // clang-format off // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t #ifdef __ARM_NEON // if YCM cannot find , make a symbolic link to it, for example: // // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ // #define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) #define GGML_COMPUTE_FP32_TO_FP16(x) (x) #define GGML_FP16_TO_FP32(x) ((float) (x)) #define GGML_FP32_TO_FP16(x) (x) #else #ifdef __F16C__ #ifdef _MSC_VER #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) #else #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) #endif #elif defined(__POWER9_VECTOR__) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) /* the inline asm below is about 12% faster than the lookup method */ #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { register float f; register double d; __asm__( "mtfprd %0,%2\n" "xscvhpdp %0,%0\n" "frsp %1,%0\n" : /* temp */ "=d"(d), /* out */ "=f"(f): /* in */ "r"(h)); return f; } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { register double d; register ggml_fp16_t r; __asm__( /* xscvdphp can work on double or single precision */ "xscvdphp %0,%2\n" "mffprd %1,%0\n" : /* temp */ "=d"(d), /* out */ "=r"(r): /* in */ "f"(f)); return r; } #else // FP16 <-> FP32 // ref: https://github.com/Maratyszcza/FP16 static inline float fp32_from_bits(uint32_t w) { union { uint32_t as_bits; float as_value; } fp32; fp32.as_bits = w; return fp32.as_value; } static inline uint32_t fp32_to_bits(float f) { union { float as_value; uint32_t as_bits; } fp32; fp32.as_value = f; return fp32.as_bits; } static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t two_w = w + w; const uint32_t exp_offset = UINT32_C(0xE0) << 23; #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float exp_scale = 0x1.0p-112f; #else const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); #endif const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; const uint32_t magic_mask = UINT32_C(126) << 23; const float magic_bias = 0.5f; const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; const uint32_t denormalized_cutoff = UINT32_C(1) << 27; const uint32_t result = sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); return fp32_from_bits(result); } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; #else const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); #endif float base = (fabsf(f) * scale_to_inf) * scale_to_zero; const uint32_t w = fp32_to_bits(f); const uint32_t shl1_w = w + w; const uint32_t sign = w & UINT32_C(0x80000000); uint32_t bias = shl1_w & UINT32_C(0xFF000000); if (bias < UINT32_C(0x71000000)) { bias = UINT32_C(0x71000000); } base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; const uint32_t bits = fp32_to_bits(base); const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); const uint32_t nonsign = exp_bits + mantissa_bits; return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #endif // __F16C__ #endif // __ARM_NEON // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return table_f32_f16[s]; } #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ #endif /* COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ */