mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-01-31 11:37:35 +00:00
e7eb0b3070
- Fix UX issues with llama.com - Do housekeeping on libm code - Add more vectorization to GGML - Get GGJT quantizer programs working well - Have the quantizer keep the output layer as f16c - Prefetching improves performance 15% if you use fewer threads
192 lines
6 KiB
C
192 lines
6 KiB
C
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
|
|
#define COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
|
|
#include "libc/literal.h"
|
|
#include "libc/str/str.h"
|
|
#include "third_party/ggml/fp16.h"
|
|
#include "third_party/intel/immintrin.internal.h"
|
|
#include "third_party/libcxx/math.h"
|
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
|
COSMOPOLITAN_C_START_
|
|
|
|
#define GGML_GELU_FP16
|
|
#define GGML_SILU_FP16
|
|
|
|
extern ggml_fp16_t table_gelu_f16[1 << 16];
|
|
extern ggml_fp16_t table_silu_f16[1 << 16];
|
|
extern ggml_fp16_t table_exp_f16[1 << 16];
|
|
extern float table_f32_f16[1 << 16];
|
|
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
|
|
extern const uint64_t table_b2b_0[1 << 8];
|
|
extern const uint64_t table_b2b_1[1 << 8];
|
|
#endif
|
|
|
|
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t* y,
|
|
const ggml_fp16_t* x) {
|
|
const uint16_t* i16 = (const uint16_t*)x;
|
|
for (int i = 0; i < n; ++i) {
|
|
y[i] = table_gelu_f16[i16[i]];
|
|
}
|
|
}
|
|
|
|
// clang-format off
|
|
|
|
// 16-bit float
|
|
// on Arm, we use __fp16
|
|
// on x86, we use uint16_t
|
|
#ifdef __ARM_NEON
|
|
|
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
//
|
|
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
//
|
|
|
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
|
|
#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
|
|
|
|
#define GGML_FP16_TO_FP32(x) ((float) (x))
|
|
#define GGML_FP32_TO_FP16(x) (x)
|
|
|
|
#else
|
|
|
|
#ifdef __F16C__
|
|
|
|
#ifdef _MSC_VER
|
|
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
#else
|
|
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
#endif
|
|
|
|
#elif defined(__POWER9_VECTOR__)
|
|
|
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
/* the inline asm below is about 12% faster than the lookup method */
|
|
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
|
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
register float f;
|
|
register double d;
|
|
__asm__(
|
|
"mtfprd %0,%2\n"
|
|
"xscvhpdp %0,%0\n"
|
|
"frsp %1,%0\n" :
|
|
/* temp */ "=d"(d),
|
|
/* out */ "=f"(f):
|
|
/* in */ "r"(h));
|
|
return f;
|
|
}
|
|
|
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
register double d;
|
|
register ggml_fp16_t r;
|
|
__asm__( /* xscvdphp can work on double or single precision */
|
|
"xscvdphp %0,%2\n"
|
|
"mffprd %1,%0\n" :
|
|
/* temp */ "=d"(d),
|
|
/* out */ "=r"(r):
|
|
/* in */ "f"(f));
|
|
return r;
|
|
}
|
|
|
|
#else
|
|
|
|
// FP16 <-> FP32
|
|
// ref: https://github.com/Maratyszcza/FP16
|
|
|
|
static inline float fp32_from_bits(uint32_t w) {
|
|
union {
|
|
uint32_t as_bits;
|
|
float as_value;
|
|
} fp32;
|
|
fp32.as_bits = w;
|
|
return fp32.as_value;
|
|
}
|
|
|
|
static inline uint32_t fp32_to_bits(float f) {
|
|
union {
|
|
float as_value;
|
|
uint32_t as_bits;
|
|
} fp32;
|
|
fp32.as_value = f;
|
|
return fp32.as_bits;
|
|
}
|
|
|
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
const uint32_t w = (uint32_t) h << 16;
|
|
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
const uint32_t two_w = w + w;
|
|
|
|
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
|
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
const float exp_scale = 0x1.0p-112f;
|
|
#else
|
|
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
|
#endif
|
|
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
|
|
|
const uint32_t magic_mask = UINT32_C(126) << 23;
|
|
const float magic_bias = 0.5f;
|
|
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
|
|
|
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
|
const uint32_t result = sign |
|
|
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
|
return fp32_from_bits(result);
|
|
}
|
|
|
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
const float scale_to_inf = 0x1.0p+112f;
|
|
const float scale_to_zero = 0x1.0p-110f;
|
|
#else
|
|
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
|
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
|
#endif
|
|
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
|
|
|
const uint32_t w = fp32_to_bits(f);
|
|
const uint32_t shl1_w = w + w;
|
|
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
|
if (bias < UINT32_C(0x71000000)) {
|
|
bias = UINT32_C(0x71000000);
|
|
}
|
|
|
|
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
|
const uint32_t bits = fp32_to_bits(base);
|
|
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
|
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
|
const uint32_t nonsign = exp_bits + mantissa_bits;
|
|
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
|
}
|
|
|
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
|
|
#endif // __F16C__
|
|
|
|
#endif // __ARM_NEON
|
|
|
|
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
// This is also true for POWER9.
|
|
#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
|
|
|
|
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
uint16_t s;
|
|
memcpy(&s, &f, sizeof(uint16_t));
|
|
return table_f32_f16[s];
|
|
}
|
|
|
|
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
|
|
#endif
|
|
|
|
|
|
COSMOPOLITAN_C_END_
|
|
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
|
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ */
|