From a80ab3f8fe92752f492e6a4833cd9a71ab1b2cf2 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Thu, 1 Aug 2024 19:42:14 -0700 Subject: [PATCH] Implement bf16 compiler runtime library --- libc/intrin/extendbfsf2.c | 39 +++++++++++ libc/intrin/truncdfbf2.c | 24 +++++++ libc/intrin/truncsfbf2.c | 40 +++++++++++ test/libc/tinymath/fdot_test.cc | 47 +++++++++++-- test/math/bf16_test.c | 113 ++++++++++++++++++++++++++++++++ 5 files changed, 256 insertions(+), 7 deletions(-) create mode 100644 libc/intrin/extendbfsf2.c create mode 100644 libc/intrin/truncdfbf2.c create mode 100644 libc/intrin/truncsfbf2.c create mode 100644 test/math/bf16_test.c diff --git a/libc/intrin/extendbfsf2.c b/libc/intrin/extendbfsf2.c new file mode 100644 index 000000000..1773bac67 --- /dev/null +++ b/libc/intrin/extendbfsf2.c @@ -0,0 +1,39 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +float __extendbfsf2(__bf16 f) { + union { + __bf16 f; + unsigned short i; + } ub = {f}; + + // convert brain16 to binary32 + unsigned x = (unsigned)ub.i << 16; + + // force nan to quiet + if ((x & 0x7fffffff) > 0x7f800000) + x |= 0x00400000; + + // pun to float + union { + unsigned i; + float f; + } uf = {x}; + return uf.f; +} diff --git a/libc/intrin/truncdfbf2.c b/libc/intrin/truncdfbf2.c new file mode 100644 index 000000000..65dfff08c --- /dev/null +++ b/libc/intrin/truncdfbf2.c @@ -0,0 +1,24 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +__bf16 __truncsfbf2(float); +__bf16 __truncdfbf2(double f) { + // TODO(jart): What else are we supposed to do here? + return __truncsfbf2(f); +} diff --git a/libc/intrin/truncsfbf2.c b/libc/intrin/truncsfbf2.c new file mode 100644 index 000000000..b2d12e33d --- /dev/null +++ b/libc/intrin/truncsfbf2.c @@ -0,0 +1,40 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ + +__bf16 __truncsfbf2(float f) { + union { + float f; + unsigned i; + } uf = {f}; + unsigned x = uf.i; + + if ((x & 0x7fffffff) > 0x7f800000) + // force nan to quiet + x = (x | 0x00400000) >> 16; + else + // convert binary32 to brain16 with nearest rounding + x = (x + (0x7fff + ((x >> 16) & 1))) >> 16; + + // pun to bf16 + union { + unsigned short i; + __bf16 f; + } ub = {x}; + return ub.f; +} diff --git a/test/libc/tinymath/fdot_test.cc b/test/libc/tinymath/fdot_test.cc index 4d2543196..b5b6ab6ca 100644 --- a/test/libc/tinymath/fdot_test.cc +++ b/test/libc/tinymath/fdot_test.cc @@ -10,6 +10,8 @@ #include "libc/stdio/stdio.h" #include "libc/testlib/benchmark.h" #include "libc/x/xasprintf.h" +#include "third_party/aarch64/arm_neon.internal.h" +#include "third_party/intel/immintrin.internal.h" #define EXPENSIVE_TESTS 0 @@ -18,12 +20,11 @@ #define FASTMATH __attribute__((__optimize__("-O3,-ffast-math"))) #define PORTABLE __target_clones("avx512f,avx") -static unsigned long long lcg = 1; - int rand32(void) { /* Knuth, D.E., "The Art of Computer Programming," Vol 2, Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998, p. 106 (line 26) & p. 108 */ + static unsigned long long lcg = 1; lcg *= 6364136223846793005; lcg += 1442695040888963407; return lcg >> 32; @@ -122,6 +123,34 @@ float fdotf_recursive(const float *A, const float *B, size_t n) { } } +optimizespeed float fdotf_intrin(const float *A, const float *B, size_t n) { + size_t i = 0; +#ifdef __AVX512F__ + __m512 vec[CHUNK] = {}; + for (; i + CHUNK * 16 <= n; i += CHUNK * 16) + for (int j = 0; j < CHUNK; ++j) + vec[j] = _mm512_fmadd_ps(_mm512_loadu_ps(A + i + j * 16), + _mm512_loadu_ps(B + i + j * 16), vec[j]); + float res = 0; + for (int j = 0; j < CHUNK; ++j) + res += _mm512_reduce_add_ps(vec[j]); +#elif defined(__aarch64__) + float32x4_t vec[CHUNK] = {}; + for (; i + CHUNK * 4 <= n; i += CHUNK * 4) + for (int j = 0; j < CHUNK; ++j) + vec[j] = + vfmaq_f32(vec[j], vld1q_f32(A + i + j * 4), vld1q_f32(B + i + j * 4)); + float res = 0; + for (int j = 0; j < CHUNK; ++j) + res += vaddvq_f32(vec[j]); +#else + float res = 0; +#endif + for (; i < n; ++i) + res += A[i] * B[i]; + return res; +} + FASTMATH float fdotf_ruler(const float *A, const float *B, size_t n) { int rule, step = 2; size_t chunk, sp = 0; @@ -179,6 +208,8 @@ void test_fdotf_ruler(void) { } PORTABLE float fdotf_hefty(const float *A, const float *B, size_t n) { + if (1) + return 0; unsigned i, par, len = 0; float sum, res[n / CHUNK + 1]; for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK) @@ -244,7 +275,7 @@ int main() { #if EXPENSIVE_TESTS size_t n = 512 * 1024; #else - size_t n = 1024; + size_t n = 4096; #endif float *A = new float[n]; @@ -253,22 +284,24 @@ int main() { A[i] = numba(); B[i] = numba(); } - float kahan, naive, dubble, recursive, hefty, ruler; + float kahan, naive, dubble, recursive, ruler, intrin; test_fdotf_naive(); - test_fdotf_hefty(); + // test_fdotf_hefty(); test_fdotf_ruler(); BENCHMARK(20, 1, (kahan = barrier(fdotf_kahan(A, B, n)))); BENCHMARK(20, 1, (dubble = barrier(fdotf_dubble(A, B, n)))); BENCHMARK(20, 1, (naive = barrier(fdotf_naive(A, B, n)))); BENCHMARK(20, 1, (recursive = barrier(fdotf_recursive(A, B, n)))); + BENCHMARK(20, 1, (intrin = barrier(fdotf_intrin(A, B, n)))); BENCHMARK(20, 1, (ruler = barrier(fdotf_ruler(A, B, n)))); - BENCHMARK(20, 1, (hefty = barrier(fdotf_hefty(A, B, n)))); + // BENCHMARK(20, 1, (hefty = barrier(fdotf_hefty(A, B, n)))); printf("dubble = %f (%g)\n", dubble, fabs(dubble - dubble)); printf("kahan = %f (%g)\n", kahan, fabs(kahan - dubble)); printf("naive = %f (%g)\n", naive, fabs(naive - dubble)); printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble)); + printf("intrin = %f (%g)\n", intrin, fabs(intrin - dubble)); printf("ruler = %f (%g)\n", ruler, fabs(ruler - dubble)); - printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble)); + // printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble)); delete[] B; delete[] A; diff --git a/test/math/bf16_test.c b/test/math/bf16_test.c new file mode 100644 index 000000000..532cebda5 --- /dev/null +++ b/test/math/bf16_test.c @@ -0,0 +1,113 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/math.h" + +#define CHECK(x) \ + if (!(x)) \ + return __LINE__ +#define FALSE(x) \ + { \ + volatile bool x_ = x; \ + if (x_) \ + return __LINE__; \ + } +#define TRUE(x) \ + { \ + volatile bool x_ = x; \ + if (!x_) \ + return __LINE__; \ + } + +__bf16 identity(__bf16 x) { + return x; +} +__bf16 (*half)(__bf16) = identity; + +unsigned toint(float f) { + union { + float f; + unsigned i; + } u = {f}; + return u.i; +} + +int main() { + volatile float f; + volatile double d; + volatile __bf16 pi = 3.141; + + // half → float → half + f = pi; + pi = f; + + // half → float + float __extendbfsf2(__bf16); + CHECK(0.f == __extendbfsf2(0)); + CHECK(3.140625f == __extendbfsf2(pi)); + CHECK(3.140625f == pi); + + // half → double → half + d = pi; + pi = d; + + // float → half + __bf16 __truncsfbf2(float); + CHECK(0 == (float)__truncsfbf2(0)); + CHECK(pi == (float)__truncsfbf2(3.141f)); + CHECK(3.140625f == (float)__truncsfbf2(3.141f)); + + // double → half + __bf16 __truncdfbf2(double); + CHECK(0 == (double)__truncdfbf2(0)); + CHECK(3.140625 == (double)__truncdfbf2(3.141)); + + // specials + volatile __bf16 nan = NAN; + volatile __bf16 positive_infinity = +INFINITY; + volatile __bf16 negative_infinity = -INFINITY; + CHECK(isnan(nan)); + CHECK(!isinf(pi)); + CHECK(!isnan(pi)); + CHECK(isinf(positive_infinity)); + CHECK(isinf(negative_infinity)); + CHECK(!isnan(positive_infinity)); + CHECK(!isnan(negative_infinity)); + CHECK(!signbit(pi)); + CHECK(signbit(half(-pi))); + CHECK(!signbit(half(+0.))); + CHECK(signbit(half(-0.))); + + // arithmetic + CHECK(half(-3) == -half(3)); + CHECK(half(9) == half(3) * half(3)); + CHECK(half(0) == half(pi) - half(pi)); + CHECK(half(6.28125) == half(pi) + half(pi)); + + // comparisons + CHECK(half(3) > half(2)); + CHECK(half(3) < half(4)); + CHECK(half(3) <= half(3)); + CHECK(half(3) >= half(3)); + TRUE(half(NAN) != half(NAN)); + FALSE(half(NAN) == half(NAN)); + TRUE(half(3) != half(NAN)); + FALSE(half(3) == half(NAN)); + TRUE(half(NAN) != half(3)); + FALSE(half(NAN) == half(3)); +}