651 lines
		
	
	
	
		
			17 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			651 lines
		
	
	
	
		
			17 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include "ggml.h"
 | |
| 
 | |
| // GGML internal header
 | |
| 
 | |
| #include <assert.h>
 | |
| #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 | |
| #include <stddef.h>
 | |
| #include <stdbool.h>
 | |
| #include <string.h> // memcpy
 | |
| #include <math.h>   // fabsf
 | |
| 
 | |
| #undef MIN
 | |
| #undef MAX
 | |
| 
 | |
| #define MIN(a, b) ((a) < (b) ? (a) : (b))
 | |
| #define MAX(a, b) ((a) > (b) ? (a) : (b))
 | |
| 
 | |
| #if defined(_WIN32)
 | |
| 
 | |
| #define m512bh(p) p
 | |
| #define m512i(p) p
 | |
| 
 | |
| #else
 | |
| 
 | |
| #define m512bh(p) (__m512bh)(p)
 | |
| #define m512i(p) (__m512i)(p)
 | |
| 
 | |
| #endif
 | |
| 
 | |
| /**
 | |
|  * Converts brain16 to float32.
 | |
|  *
 | |
|  * The bfloat16 floating point format has the following structure:
 | |
|  *
 | |
|  *       ┌sign
 | |
|  *       │
 | |
|  *       │   ┌exponent
 | |
|  *       │   │
 | |
|  *       │   │      ┌mantissa
 | |
|  *       │   │      │
 | |
|  *       │┌──┴───┐┌─┴───┐
 | |
|  *     0b0000000000000000 brain16
 | |
|  *
 | |
|  * Since bf16 has the same number of exponent bits as a 32bit float,
 | |
|  * encoding and decoding numbers becomes relatively straightforward.
 | |
|  *
 | |
|  *       ┌sign
 | |
|  *       │
 | |
|  *       │   ┌exponent
 | |
|  *       │   │
 | |
|  *       │   │      ┌mantissa
 | |
|  *       │   │      │
 | |
|  *       │┌──┴───┐┌─┴───────────────────┐
 | |
|  *     0b00000000000000000000000000000000 IEEE binary32
 | |
|  *
 | |
|  * For comparison, the standard fp16 format has fewer exponent bits.
 | |
|  *
 | |
|  *       ┌sign
 | |
|  *       │
 | |
|  *       │  ┌exponent
 | |
|  *       │  │
 | |
|  *       │  │    ┌mantissa
 | |
|  *       │  │    │
 | |
|  *       │┌─┴─┐┌─┴──────┐
 | |
|  *     0b0000000000000000 IEEE binary16
 | |
|  *
 | |
|  * @see IEEE 754-2008
 | |
|  */
 | |
| static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
 | |
|     union {
 | |
|         float f;
 | |
|         uint32_t i;
 | |
|     } u;
 | |
|     u.i = (uint32_t)h.bits << 16;
 | |
|     return u.f;
 | |
| }
 | |
| 
 | |
| /**
 | |
|  * Converts float32 to brain16.
 | |
|  *
 | |
|  * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
 | |
|  * Subnormals shall be flushed to zero, and NANs will be quiet.
 | |
|  * This code should vectorize nicely if using modern compilers.
 | |
|  */
 | |
| static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 | |
|     ggml_bf16_t h;
 | |
|     union {
 | |
|         float f;
 | |
|         uint32_t i;
 | |
|     } u;
 | |
|     u.f = s;
 | |
|     if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
 | |
|         h.bits = (u.i >> 16) | 64; /* force to quiet */
 | |
|         return h;
 | |
|     }
 | |
|     if (!(u.i & 0x7f800000)) { /* subnormal */
 | |
|         h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
 | |
|         return h;
 | |
|     }
 | |
|     h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
 | |
|     return h;
 | |
| }
 | |
| 
 | |
| #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 | |
| #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 | |
| 
 | |
| #ifdef __cplusplus
 | |
| extern "C" {
 | |
| #endif
 | |
| 
 | |
| // static_assert should be a #define, but if it's not,
 | |
| // fall back to the _Static_assert C11 keyword.
 | |
| // if C99 - static_assert is noop
 | |
| // ref: https://stackoverflow.com/a/53923785/4039976
 | |
| #ifndef __cplusplus
 | |
| #ifndef static_assert
 | |
| #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 | |
| #define static_assert(cond, msg) _Static_assert(cond, msg)
 | |
| #else
 | |
| #define static_assert(cond, msg) struct global_scope_noop_trick
 | |
| #endif
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 | |
| #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
 | |
| #ifndef __FMA__
 | |
| #define __FMA__
 | |
| #endif
 | |
| #ifndef __F16C__
 | |
| #define __F16C__
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
 | |
| #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 | |
| #ifndef __SSE3__
 | |
| #define __SSE3__
 | |
| #endif
 | |
| #ifndef __SSSE3__
 | |
| #define __SSSE3__
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| #if defined(__ARM_FEATURE_SVE)
 | |
| #include <arm_sve.h>
 | |
| #endif
 | |
| 
 | |
| // 16-bit float
 | |
| // on Arm, we use __fp16
 | |
| // on x86, we use uint16_t
 | |
| #if defined(__ARM_NEON)
 | |
| 
 | |
| // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 | |
| //
 | |
| //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
 | |
| //
 | |
| #include <arm_neon.h>
 | |
| 
 | |
| #ifdef _MSC_VER
 | |
| 
 | |
| typedef uint16_t ggml_fp16_internal_t;
 | |
| 
 | |
| #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
 | |
| 
 | |
| #else
 | |
| 
 | |
| typedef __fp16 ggml_fp16_internal_t;
 | |
| 
 | |
| #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
 | |
| 
 | |
| #endif // _MSC_VER
 | |
| 
 | |
| #if !defined(__aarch64__)
 | |
| 
 | |
| // 32-bit ARM compatibility
 | |
| 
 | |
| // vaddvq_s16
 | |
| // vpaddq_s16
 | |
| // vpaddq_s32
 | |
| // vaddvq_s32
 | |
| // vaddvq_f32
 | |
| // vmaxvq_f32
 | |
| // vcvtnq_s32_f32
 | |
| // vzip1_u8
 | |
| // vzip2_u8
 | |
| 
 | |
| inline static int32_t vaddvq_s16(int16x8_t v) {
 | |
|     return
 | |
|         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
 | |
|         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
 | |
|         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
 | |
|         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
 | |
| }
 | |
| 
 | |
| inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
 | |
|     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
 | |
|     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
 | |
|     return vcombine_s16(a0, b0);
 | |
| }
 | |
| 
 | |
| inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
 | |
|     int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
 | |
|     int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
 | |
|     return vcombine_s32(a0, b0);
 | |
| }
 | |
| 
 | |
| inline static int32_t vaddvq_s32(int32x4_t v) {
 | |
|     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 | |
| }
 | |
| 
 | |
| inline static float vaddvq_f32(float32x4_t v) {
 | |
|     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 | |
| }
 | |
| 
 | |
| inline static float vmaxvq_f32(float32x4_t v) {
 | |
|     return
 | |
|         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
 | |
|             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
 | |
| }
 | |
| 
 | |
| inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 | |
|     int32x4_t res;
 | |
| 
 | |
|     res[0] = roundf(vgetq_lane_f32(v, 0));
 | |
|     res[1] = roundf(vgetq_lane_f32(v, 1));
 | |
|     res[2] = roundf(vgetq_lane_f32(v, 2));
 | |
|     res[3] = roundf(vgetq_lane_f32(v, 3));
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
 | |
|     uint8x8_t res;
 | |
| 
 | |
|     res[0] = a[0]; res[1] = b[0];
 | |
|     res[2] = a[1]; res[3] = b[1];
 | |
|     res[4] = a[2]; res[5] = b[2];
 | |
|     res[6] = a[3]; res[7] = b[3];
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
 | |
|     uint8x8_t res;
 | |
| 
 | |
|     res[0] = a[4]; res[1] = b[4];
 | |
|     res[2] = a[5]; res[3] = b[5];
 | |
|     res[4] = a[6]; res[5] = b[6];
 | |
|     res[6] = a[7]; res[7] = b[7];
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| // vld1q_s16_x2
 | |
| // vld1q_u8_x2
 | |
| // vld1q_u8_x4
 | |
| // vld1q_s8_x2
 | |
| // vld1q_s8_x4
 | |
| // TODO: double-check these work correctly
 | |
| 
 | |
| typedef struct ggml_int16x8x2_t {
 | |
|     int16x8_t val[2];
 | |
| } ggml_int16x8x2_t;
 | |
| 
 | |
| inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
 | |
|     ggml_int16x8x2_t res;
 | |
| 
 | |
|     res.val[0] = vld1q_s16(ptr + 0);
 | |
|     res.val[1] = vld1q_s16(ptr + 8);
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| typedef struct ggml_uint8x16x2_t {
 | |
|     uint8x16_t val[2];
 | |
| } ggml_uint8x16x2_t;
 | |
| 
 | |
| inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
 | |
|     ggml_uint8x16x2_t res;
 | |
| 
 | |
|     res.val[0] = vld1q_u8(ptr + 0);
 | |
|     res.val[1] = vld1q_u8(ptr + 16);
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| typedef struct ggml_uint8x16x4_t {
 | |
|     uint8x16_t val[4];
 | |
| } ggml_uint8x16x4_t;
 | |
| 
 | |
| inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
 | |
|     ggml_uint8x16x4_t res;
 | |
| 
 | |
|     res.val[0] = vld1q_u8(ptr + 0);
 | |
|     res.val[1] = vld1q_u8(ptr + 16);
 | |
|     res.val[2] = vld1q_u8(ptr + 32);
 | |
|     res.val[3] = vld1q_u8(ptr + 48);
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| typedef struct ggml_int8x16x2_t {
 | |
|     int8x16_t val[2];
 | |
| } ggml_int8x16x2_t;
 | |
| 
 | |
| inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
 | |
|     ggml_int8x16x2_t res;
 | |
| 
 | |
|     res.val[0] = vld1q_s8(ptr + 0);
 | |
|     res.val[1] = vld1q_s8(ptr + 16);
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| typedef struct ggml_int8x16x4_t {
 | |
|     int8x16_t val[4];
 | |
| } ggml_int8x16x4_t;
 | |
| 
 | |
| inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 | |
|     ggml_int8x16x4_t res;
 | |
| 
 | |
|     res.val[0] = vld1q_s8(ptr + 0);
 | |
|     res.val[1] = vld1q_s8(ptr + 16);
 | |
|     res.val[2] = vld1q_s8(ptr + 32);
 | |
|     res.val[3] = vld1q_s8(ptr + 48);
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| // NOTE: not tested
 | |
| inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
 | |
|     int8x16_t res;
 | |
| 
 | |
|     res[ 0] = a[b[ 0]];
 | |
|     res[ 1] = a[b[ 1]];
 | |
|     res[ 2] = a[b[ 2]];
 | |
|     res[ 3] = a[b[ 3]];
 | |
|     res[ 4] = a[b[ 4]];
 | |
|     res[ 5] = a[b[ 5]];
 | |
|     res[ 6] = a[b[ 6]];
 | |
|     res[ 7] = a[b[ 7]];
 | |
|     res[ 8] = a[b[ 8]];
 | |
|     res[ 9] = a[b[ 9]];
 | |
|     res[10] = a[b[10]];
 | |
|     res[11] = a[b[11]];
 | |
|     res[12] = a[b[12]];
 | |
|     res[13] = a[b[13]];
 | |
|     res[14] = a[b[14]];
 | |
|     res[15] = a[b[15]];
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| // NOTE: not tested
 | |
| inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
 | |
|     uint8x16_t res;
 | |
| 
 | |
|     res[ 0] = a[b[ 0]];
 | |
|     res[ 1] = a[b[ 1]];
 | |
|     res[ 2] = a[b[ 2]];
 | |
|     res[ 3] = a[b[ 3]];
 | |
|     res[ 4] = a[b[ 4]];
 | |
|     res[ 5] = a[b[ 5]];
 | |
|     res[ 6] = a[b[ 6]];
 | |
|     res[ 7] = a[b[ 7]];
 | |
|     res[ 8] = a[b[ 8]];
 | |
|     res[ 9] = a[b[ 9]];
 | |
|     res[10] = a[b[10]];
 | |
|     res[11] = a[b[11]];
 | |
|     res[12] = a[b[12]];
 | |
|     res[13] = a[b[13]];
 | |
|     res[14] = a[b[14]];
 | |
|     res[15] = a[b[15]];
 | |
| 
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| #define ggml_int16x8x2_t  int16x8x2_t
 | |
| #define ggml_uint8x16x2_t uint8x16x2_t
 | |
| #define ggml_uint8x16x4_t uint8x16x4_t
 | |
| #define ggml_int8x16x2_t  int8x16x2_t
 | |
| #define ggml_int8x16x4_t  int8x16x4_t
 | |
| 
 | |
| #define ggml_vld1q_s16_x2 vld1q_s16_x2
 | |
| #define ggml_vld1q_u8_x2  vld1q_u8_x2
 | |
| #define ggml_vld1q_u8_x4  vld1q_u8_x4
 | |
| #define ggml_vld1q_s8_x2  vld1q_s8_x2
 | |
| #define ggml_vld1q_s8_x4  vld1q_s8_x4
 | |
| #define ggml_vqtbl1q_s8   vqtbl1q_s8
 | |
| #define ggml_vqtbl1q_u8   vqtbl1q_u8
 | |
| 
 | |
| #endif // !defined(__aarch64__)
 | |
| 
 | |
| #if !defined(__ARM_FEATURE_DOTPROD)
 | |
| 
 | |
| inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
 | |
|     const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
 | |
|     const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
 | |
| 
 | |
|     return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
 | |
| 
 | |
| #endif // !defined(__ARM_FEATURE_DOTPROD)
 | |
| 
 | |
| #endif // defined(__ARM_NEON)
 | |
| 
 | |
| #if defined(__ARM_NEON) && !defined(_MSC_VER)
 | |
| 
 | |
| #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | |
| #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | |
| 
 | |
| #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | |
| 
 | |
| static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
 | |
|     ggml_fp16_internal_t tmp;
 | |
|     memcpy(&tmp, &h, sizeof(ggml_fp16_t));
 | |
|     return (float)tmp;
 | |
| }
 | |
| 
 | |
| static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 | |
|     ggml_fp16_t res;
 | |
|     ggml_fp16_internal_t tmp = f;
 | |
|     memcpy(&res, &tmp, sizeof(ggml_fp16_t));
 | |
|     return res;
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| #ifdef __wasm_simd128__
 | |
| #include <wasm_simd128.h>
 | |
| #else
 | |
| #ifdef __POWER9_VECTOR__
 | |
| #include <altivec.h>
 | |
| #undef bool
 | |
| #define bool _Bool
 | |
| #else
 | |
| #if defined(_MSC_VER) || defined(__MINGW32__)
 | |
| #include <intrin.h>
 | |
| #else
 | |
| #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
 | |
| #if !defined(__riscv)
 | |
| #include <immintrin.h>
 | |
| #endif
 | |
| #endif
 | |
| #endif
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| #ifdef __riscv_v_intrinsic
 | |
| #include <riscv_vector.h>
 | |
| #endif
 | |
| 
 | |
| #if defined(__loongarch64)
 | |
| #if defined(__loongarch_asx)
 | |
| #include <lasxintrin.h>
 | |
| #endif
 | |
| #if defined(__loongarch_sx)
 | |
| #include <lsxintrin.h>
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| #if defined(__loongarch_asx)
 | |
| 
 | |
| typedef union {
 | |
|     int32_t i;
 | |
|     float f;
 | |
| } ft_union;
 | |
| 
 | |
| /* float type data load instructions */
 | |
| static __m128 __lsx_vreplfr2vr_s(float val) {
 | |
|     ft_union fi_tmpval = {.f = val};
 | |
|     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
 | |
| }
 | |
| 
 | |
| static __m256 __lasx_xvreplfr2vr_s(float val) {
 | |
|     ft_union fi_tmpval = {.f = val};
 | |
|     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
 | |
| }
 | |
| #endif
 | |
| 
 | |
| #ifdef __F16C__
 | |
| 
 | |
| #ifdef _MSC_VER
 | |
| #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
 | |
| #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
 | |
| #else
 | |
| #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 | |
| #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 | |
| #endif
 | |
| 
 | |
| #elif defined(__POWER9_VECTOR__)
 | |
| 
 | |
| #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | |
| #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | |
| /* the inline asm below is about 12% faster than the lookup method */
 | |
| #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
 | |
| #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 | |
| 
 | |
| static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
 | |
|     register float f;
 | |
|     register double d;
 | |
|     __asm__(
 | |
|         "mtfprd %0,%2\n"
 | |
|         "xscvhpdp %0,%0\n"
 | |
|         "frsp %1,%0\n" :
 | |
|         /* temp */ "=d"(d),
 | |
|         /* out */  "=f"(f):
 | |
|         /* in */   "r"(h));
 | |
|     return f;
 | |
| }
 | |
| 
 | |
| static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 | |
|     register double d;
 | |
|     register ggml_fp16_t r;
 | |
|     __asm__( /* xscvdphp can work on double or single precision */
 | |
|         "xscvdphp %0,%2\n"
 | |
|         "mffprd %1,%0\n" :
 | |
|         /* temp */ "=d"(d),
 | |
|         /* out */  "=r"(r):
 | |
|         /* in */   "f"(f));
 | |
|     return r;
 | |
| }
 | |
| 
 | |
| #else
 | |
| 
 | |
| // FP16 <-> FP32
 | |
| // ref: https://github.com/Maratyszcza/FP16
 | |
| 
 | |
// Reinterprets the 32-bit pattern w as a float (no numeric conversion).
static inline float fp32_from_bits(uint32_t w) {
    float value;
    memcpy(&value, &w, sizeof value);
    return value;
}
 | |
| 
 | |
// Returns the raw 32-bit pattern of f (no numeric conversion).
static inline uint32_t fp32_to_bits(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof bits);
    return bits;
}
 | |
| 
 | |
| static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
 | |
|     const uint32_t w = (uint32_t) h << 16;
 | |
|     const uint32_t sign = w & UINT32_C(0x80000000);
 | |
|     const uint32_t two_w = w + w;
 | |
| 
 | |
|     const uint32_t exp_offset = UINT32_C(0xE0) << 23;
 | |
| #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
 | |
|     const float exp_scale = 0x1.0p-112f;
 | |
| #else
 | |
|     const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
 | |
| #endif
 | |
|     const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
 | |
| 
 | |
|     const uint32_t magic_mask = UINT32_C(126) << 23;
 | |
|     const float magic_bias = 0.5f;
 | |
|     const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
 | |
| 
 | |
|     const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
 | |
|     const uint32_t result = sign |
 | |
|         (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
 | |
|     return fp32_from_bits(result);
 | |
| }
 | |
| 
 | |
| static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 | |
| #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
 | |
|     const float scale_to_inf = 0x1.0p+112f;
 | |
|     const float scale_to_zero = 0x1.0p-110f;
 | |
| #else
 | |
|     const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
 | |
|     const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
 | |
| #endif
 | |
|     float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
 | |
| 
 | |
|     const uint32_t w = fp32_to_bits(f);
 | |
|     const uint32_t shl1_w = w + w;
 | |
|     const uint32_t sign = w & UINT32_C(0x80000000);
 | |
|     uint32_t bias = shl1_w & UINT32_C(0xFF000000);
 | |
|     if (bias < UINT32_C(0x71000000)) {
 | |
|         bias = UINT32_C(0x71000000);
 | |
|     }
 | |
| 
 | |
|     base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
 | |
|     const uint32_t bits = fp32_to_bits(base);
 | |
|     const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
 | |
|     const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
 | |
|     const uint32_t nonsign = exp_bits + mantissa_bits;
 | |
|     return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 | |
| }
 | |
| 
 | |
| #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 | |
| #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 | |
| 
 | |
| #endif // __F16C__
 | |
| 
 | |
| #endif // defined(__ARM_NEON) && !defined(_MSC_VER)
 | |
| 
 | |
| // precomputed f32 table for f16 (256 KB)
 | |
| // defined in ggml.c, initialized in ggml_init()
 | |
| extern float ggml_table_f32_f16[1 << 16];
 | |
| 
 | |
| // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 | |
| // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 | |
| // This is also true for POWER9.
 | |
| #if !defined(GGML_FP16_TO_FP32)
 | |
| inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 | |
|     uint16_t s;
 | |
|     memcpy(&s, &f, sizeof(uint16_t));
 | |
|     return ggml_table_f32_f16[s];
 | |
| }
 | |
| 
 | |
| #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
 | |
| #endif
 | |
| 
 | |
| #if !defined(GGML_FP32_TO_FP16)
 | |
| #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 | |
| #endif
 | |
| 
 | |
| #define GGML_HASHTABLE_FULL ((size_t)-1)
 | |
| #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
 | |
| 
 | |
| struct ggml_hash_set ggml_hash_set_new(size_t size);
 | |
| 
 | |
| bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | |
| 
 | |
| // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 | |
| size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | |
| 
 | |
| // returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 | |
| size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | |
| 
 | |
| // return index, asserts if table is full
 | |
| size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 | |
| 
 | |
| #ifdef __cplusplus
 | |
| }
 | |
| #endif
 |