From 180bfcd8d5c82c0b2fe06bc7538683c28a3790e9 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Fri, 26 Apr 2024 02:33:26 -0700
Subject: [PATCH] Minimize the GGML API surface area for BF16

---
 ggml-impl.h | 87 ++++++++++++++++++++++++++++++++++++++++++++++++--
 ggml.c      | 20 +++++++++---
 ggml.h      | 92 +++++------------------------------------------------
 3 files changed, 107 insertions(+), 92 deletions(-)

diff --git a/ggml-impl.h b/ggml-impl.h
index 3e7484d29..83c32c743 100644
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -17,6 +17,90 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+/**
+ * Google Brain 16-bit floating point number.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32-bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │    ┌mantissa
+ *       │  │    │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * So be warned that converting between them destroys several bits.
+ *
+ * @see IEEE 754-2008
+ */
+struct ggml_bf16_s {
+    uint16_t bits;
+};
+
+/**
+ * Converts brain16 to float32.
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is bit-identical to the AMD Zen4 VCVTNEPS2BF16 instruction.
+ * Subnormals shall be flushed to zero, and NaNs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; /* round to nearest even */
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -518,9 +602,6 @@ size_t ggml_hash_insert     (      struct ggml_hash_set hash_set, struct ggml
 // return index, asserts if table is full
 size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
 
-#define GGML_FP32_TO_BF16(x) ggml_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_bf16_to_fp32(x)
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml.c b/ggml.c
index 60b0e7b6c..64868213c 100644
--- a/ggml.c
+++ b/ggml.c
@@ -339,16 +339,26 @@ GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
     return "GGML status: unknown";
 }
 
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
 float ggml_fp16_to_fp32(ggml_fp16_t x) {
+#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
     return GGML_FP16_TO_FP32(x);
 }
 
 ggml_fp16_t ggml_fp32_to_fp16(float x) {
+#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
     return GGML_FP32_TO_FP16(x);
 }
 
+float ggml_bf16_to_fp32(ggml_bf16_t x) {
+#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
+    return GGML_BF16_TO_FP32(x); // it just left shifts
+}
+
+ggml_bf16_t ggml_fp32_to_bf16(float x) {
+#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
+    return GGML_FP32_TO_BF16(x);
+}
+
 void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
     for (int64_t i = 0; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
@@ -374,8 +384,8 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
     }
 }
 
-void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n) {
-    int i = 0;
+void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
+    int64_t i = 0;
 #if defined(__AVX512F__)
     for (; i + 16 <= n; i += 16) {
         _mm512_storeu_ps(y + i,
@@ -402,7 +412,7 @@ void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n) {
     }
 }
 
-void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n) {
+void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
     for (; i + 32 <= n; i += 32) {
diff --git a/ggml.h b/ggml.h
index 63e8d6b21..a422e0df0 100644
--- a/ggml.h
+++ b/ggml.h
@@ -335,6 +335,14 @@ extern "C" {
     GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
     GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
 
+    // bfloat16
+    struct ggml_bf16_s;
+    typedef struct ggml_bf16_s ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
+
     struct ggml_object;
     struct ggml_context;
 
@@ -2392,90 +2400,6 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
-    /**
-     * Google Brain 16-bit floating point number.
-     *
-     *       ┌sign
-     *       │
-     *       │   ┌exponent
-     *       │   │
-     *       │   │      ┌mantissa
-     *       │   │      │
-     *       │┌──┴───┐┌─┴───┐
-     *     0b0000000000000000 brain16
-     *
-     * Since bf16 has the same number of exponent bits as a 32bit float,
-     * encoding and decoding numbers becomes relatively straightforward.
-     *
-     *       ┌sign
-     *       │
-     *       │   ┌exponent
-     *       │   │
-     *       │   │      ┌mantissa
-     *       │   │      │
-     *       │┌──┴───┐┌─┴───────────────────┐
-     *     0b00000000000000000000000000000000 IEEE binary32
-     *
-     * For comparison, the standard fp16 format has fewer exponent bits.
-     *
-     *       ┌sign
-     *       │
-     *       │  ┌exponent
-     *       │  │
-     *       │  │    ┌mantissa
-     *       │  │    │
-     *       │┌─┴─┐┌─┴──────┐
-     *     0b0000000000000000 IEEE binary16
-     *
-     * So be warned that converting between them, destroys several bits.
-     *
-     * @see IEEE 754-2008
-     */
-    typedef struct {
-        uint16_t x;
-    } ggml_bf16_t;
-
-    /**
-     * Converts brain16 to float32.
-     */
-    static inline float ggml_bf16_to_fp32(ggml_bf16_t h) {
-        union {
-            float f;
-            uint32_t i;
-        } u;
-        u.i = (uint32_t)h.x << 16;
-        return u.f;
-    }
-
-    /**
-     * Converts float32 to brain16.
-     *
-     * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
-     * Subnormals shall be flushed to zero, and NANs will be quiet.
-     * This code should vectorize nicely if using modern compilers.
-     */
-    static inline ggml_bf16_t ggml_fp32_to_bf16(float s) {
-        ggml_bf16_t h;
-        union {
-            float f;
-            uint32_t i;
-        } u;
-        u.f = s;
-        if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-            h.x = (u.i >> 16) | 64; /* force to quiet */
-            return h;
-        }
-        if (!(u.i & 0x7f800000)) { /* subnormal */
-            h.x = (u.i & 0x80000000) >> 16; /* flush to zero */
-            return h;
-        }
-        h.x = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-        return h;
-    }
-
-    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n);
-
     //
     // Internal types and functions exposed for tests and benchmarks
     //
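
The rounding expression in ggml_compute_fp32_to_bf16 adds 0x7fff plus the lowest
bit that survives truncation, which implements round-to-nearest with ties-to-even.
The standalone program below is a sketch, not part of the patch: it mirrors the
logic added to ggml-impl.h under local names (bf16_t, fp32_to_bf16, bf16_to_fp32)
and prints a few conversions to show the rounding, flush-to-zero, and NaN
quieting behavior.

/*
 * bf16_demo.c - standalone sketch mirroring the conversion logic above.
 * All names here are local to this example; they are not ggml API.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t bits; } bf16_t;   /* stand-in for ggml_bf16_t */

static bf16_t fp32_to_bf16(float s) {
    bf16_t h;
    union { float f; uint32_t i; } u;
    u.f = s;
    if ((u.i & 0x7fffffff) > 0x7f800000) {  /* NaN: force a surviving mantissa bit */
        h.bits = (u.i >> 16) | 64;          /* so truncation can't yield infinity  */
        return h;
    }
    if (!(u.i & 0x7f800000)) {              /* subnormal: flush to signed zero */
        h.bits = (u.i & 0x80000000) >> 16;
        return h;
    }
    /* Add 0x7fff plus the lowest surviving mantissa bit, so exact ties
     * round toward an even result (round-to-nearest-even). */
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
    return h;
}

static float bf16_to_fp32(bf16_t h) {
    union { float f; uint32_t i; } u;
    u.i = (uint32_t)h.bits << 16;           /* decoding is just a left shift */
    return u.f;
}

int main(void) {
    const float in[4] = { 1.0f, 3.14159265f, 1e-40f, NAN };
    for (int j = 0; j < 4; j++) {
        bf16_t h = fp32_to_bf16(in[j]);
        printf("%-12g -> 0x%04x -> %g\n",
               (double)in[j], (unsigned)h.bits, (double)bf16_to_fp32(h));
    }
    return 0;
}

Expected behavior: 1.0 maps to 0x3f80 and round-trips exactly; 3.14159265
becomes 0x4049 and reads back as 3.140625, since only 8 mantissa bits survive;
1e-40 is subnormal in binary32 and flushes to +0; and NaN stays a quiet NaN.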
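Two consequences of the patch are worth spelling out. Inside ggml.c, each scalar
conversion redefines its own name right after its definition (e.g. to
do_not_use__ggml_bf16_to_fp32__in_ggml), so any later call within ggml.c fails to
compile with a self-explanatory identifier, steering internal code to the
inlinable GGML_BF16_TO_FP32 macro. And since struct ggml_bf16_s is now defined
only in ggml-impl.h, ggml_bf16_t is an incomplete type for code that includes
just ggml.h: callers can hold and pass pointers but can no longer poke at the
bits member, which is exactly the surface-area reduction the patch is after.
Row-wise conversion through the remaining public API works on pointers alone.
A hypothetical caller-side helper, to illustrate (the name bf16_row_to_f32 and
the tensor-layout assumptions are this example's, not part of ggml):

/* Convert one row of a tensor holding bf16 data to float, using only the
 * declarations that ggml.h still exposes. Assumes row stride t->nb[1] and
 * row length t->ne[0], as for a contiguous 2-D GGML tensor. */
#include "ggml.h"

void bf16_row_to_f32(const struct ggml_tensor * t, int64_t row, float * dst) {
    const ggml_bf16_t * src =
        (const ggml_bf16_t *)((const char *)t->data + row*t->nb[1]);
    ggml_bf16_to_fp32_row(src, dst, t->ne[0]);
}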