cosmopolitan/test/libc/tinymath/fsum_test.cc

#include "libc/assert.h"
#include "libc/calls/struct/timespec.h"
#include "libc/intrin/bsr.h"
#include "libc/macros.internal.h"
#include "libc/math.h"
#include "libc/mem/gc.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/x/xasprintf.h"

int rand32(void) {
  /* Knuth, D.E., "The Art of Computer Programming," Vol 2,
     Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
     p. 106 (line 26) & p. 108 */
  static unsigned long long lcg = 1;
  lcg *= 6364136223846793005;
  lcg += 1442695040888963407;
  return lcg >> 32;
}

float float01(unsigned x) {  // (0,1)
  return 1.f / 8388608 * ((x >> 9) + .5f);
}

float numba(void) {  // (-1,1)
  return float01(rand32()) * 2 - 1;
}

double fsumf_gold(const float *p, size_t n) {
  size_t i;
  double s;
  if (n > 8)
    return fsumf_gold(p, n / 2) + fsumf_gold(p + n / 2, n - n / 2);
  for (s = i = 0; i < n; ++i)
    s += p[i];
  return s;
}

float fsumf_linear(const float *p, size_t n) {
  float s = 0;
  for (size_t i = 0; i < n; ++i)
    s += p[i];
  return s;
}

float fsumf_kahan(const float *p, size_t n) {
  size_t i;
  float err, sum, t, y;
  sum = err = 0;
  for (i = 0; i < n; ++i) {
    y = p[i] - err;
    t = sum + y;
    err = (t - sum) - y;
    sum = t;
  }
  return sum;
}

float fsumf_logarithmic(const float *p, size_t n) {
  size_t i;
  float s;
  if (n > 32)
    return fsumf_logarithmic(p, n / 2) +
           fsumf_logarithmic(p + n / 2, n - n / 2);
  for (s = i = 0; i < n; ++i)
    s += p[i];
  return s;
}

template <int N>
inline float hsum(const float *p) {
  return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);
}

template <>
inline float hsum<1>(const float *p) {
  return *p;
}

#define CHUNK 8

#define OPTIMIZE __attribute__((__optimize__("-O3")))
#define PORTABLE __target_clones("avx512f,avx")

OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {
  unsigned i, par, len = 0;
  float sum, res[n / CHUNK + 1];
  for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
    res[len++] = hsum<CHUNK>(p + i);
  if (i < n) {
    for (sum = 0; i < n; i++)
      sum += p[i];
    res[len++] = sum;
  }
  for (par = len >> 1; par; par >>= 1, len >>= 1) {
    for (i = 0; i < par; ++i)
      res[i] += res[par + i];
    if (len & 1)
      res[par - 1] += res[len - 1];
  }
  return res[0];
}

void test_fsumf_nonrecursive(void) {
  float A[CHUNK * 3];
  for (int i = 0; i < CHUNK * 3; ++i)
    A[i] = numba();
  for (int n = 0; n < CHUNK * 3; ++n)
    if (fabsf(fsumf_nonrecursive(A, n) - fsumf_kahan(A, n)) > 1e-3)
      exit(7);
}

float nothing(float x) {
  return x;
}

float (*barrier)(float) = nothing;

#define BENCH(ITERATIONS, WORK_PER_RUN, CODE)                                 \
  do {                                                                        \
    struct timespec start = timespec_real();                                  \
    for (int __i = 0; __i < ITERATIONS; ++__i) {                              \
      asm volatile("" ::: "memory");                                          \
      CODE;                                                                   \
    }                                                                         \
    long long work = (WORK_PER_RUN) * (ITERATIONS);                           \
    long nanos =                                                              \
        (timespec_tonanos(timespec_sub(timespec_real(), start)) + work - 1) / \
        (double)work;                                                         \
    printf("%8ld ns %2dx %s\n", nanos, (ITERATIONS), #CODE);                  \
  } while (0)

int main() {
  size_t n = 1024;
  float *p = (float *)malloc(sizeof(float) * n);
  for (size_t i = 0; i < n; ++i)
    p[i] = numba();
  float kahan, gold, linear, logarithmic, nonrecursive;
  test_fsumf_nonrecursive();
  BENCH(100, 1, (kahan = barrier(fsumf_kahan(p, n))));
  BENCH(100, 1, (gold = barrier(fsumf_gold(p, n))));
  BENCH(100, 1, (linear = barrier(fsumf_linear(p, n))));
  BENCH(100, 1, (logarithmic = barrier(fsumf_logarithmic(p, n))));
  BENCH(100, 1, (nonrecursive = barrier(fsumf_nonrecursive(p, n))));
  printf("gold = %.12g (%.12g)\n", gold, fabs(gold - gold));
  printf("linear = %.12g (%.12g)\n", linear, fabs(linear - gold));
  printf("kahan = %.12g (%.12g)\n", kahan, fabs(kahan - gold));
  printf("logarithmic = %.12g (%.12g)\n", logarithmic,
         fabs(logarithmic - gold));
  printf("nonrecursive = %.12g (%.12g)\n", nonrecursive,
         fabs(nonrecursive - gold));
  free(p);
}
Fix the build 2024-07-29 04:02:04 +00:00			`#include "libc/assert.h"`
			`#include "libc/calls/struct/timespec.h"`
			`#include "libc/intrin/bsr.h"`
			`#include "libc/macros.internal.h"`
			`#include "libc/math.h"`
			`#include "libc/mem/gc.h"`
			`#include "libc/mem/mem.h"`
			`#include "libc/runtime/runtime.h"`
			`#include "libc/stdio/stdio.h"`
			`#include "libc/x/xasprintf.h"`

			`int rand32(void) {`
			`/* Knuth, D.E., "The Art of Computer Programming," Vol 2,`
			`Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,`
			`p. 106 (line 26) & p. 108 */`
			`static unsigned long long lcg = 1;`
			`lcg *= 6364136223846793005;`
			`lcg += 1442695040888963407;`
			`return lcg >> 32;`
			`}`

			`float float01(unsigned x) { // (0,1)`
			`return 1.f / 8388608 * ((x >> 9) + .5f);`
			`}`

			`float numba(void) { // (-1,1)`
			`return float01(rand32()) * 2 - 1;`
			`}`

			`double fsumf_gold(const float *p, size_t n) {`
			`size_t i;`
			`double s;`
			`if (n > 8)`
			`return fsumf_gold(p, n / 2) + fsumf_gold(p + n / 2, n - n / 2);`
			`for (s = i = 0; i < n; ++i)`
			`s += p[i];`
			`return s;`
			`}`

			`float fsumf_linear(const float *p, size_t n) {`
			`float s = 0;`
			`for (size_t i = 0; i < n; ++i)`
			`s += p[i];`
			`return s;`
			`}`

			`float fsumf_kahan(const float *p, size_t n) {`
			`size_t i;`
			`float err, sum, t, y;`
			`sum = err = 0;`
			`for (i = 0; i < n; ++i) {`
			`y = p[i] - err;`
			`t = sum + y;`
			`err = (t - sum) - y;`
			`sum = t;`
			`}`
			`return sum;`
			`}`

			`float fsumf_logarithmic(const float *p, size_t n) {`
			`size_t i;`
			`float s;`
			`if (n > 32)`
			`return fsumf_logarithmic(p, n / 2) +`
			`fsumf_logarithmic(p + n / 2, n - n / 2);`
			`for (s = i = 0; i < n; ++i)`
			`s += p[i];`
			`return s;`
			`}`

			`template <int N>`
			`inline float hsum(const float *p) {`
			`return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);`
			`}`

			`template <>`
			`inline float hsum<1>(const float *p) {`
			`return *p;`
			`}`

			`#define CHUNK 8`

			`#define OPTIMIZE __attribute__((__optimize__("-O3")))`
			`#define PORTABLE __target_clones("avx512f,avx")`

			`OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {`
			`unsigned i, par, len = 0;`
			`float sum, res[n / CHUNK + 1];`
			`for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)`
			`res[len++] = hsum<CHUNK>(p + i);`
			`if (i < n) {`
			`for (sum = 0; i < n; i++)`
			`sum += p[i];`
			`res[len++] = sum;`
			`}`
			`for (par = len >> 1; par; par >>= 1, len >>= 1) {`
			`for (i = 0; i < par; ++i)`
			`res[i] += res[par + i];`
			`if (len & 1)`
			`res[par - 1] += res[len - 1];`
			`}`
			`return res[0];`
			`}`

			`void test_fsumf_nonrecursive(void) {`
			`float A[CHUNK * 3];`
			`for (int i = 0; i < CHUNK * 3; ++i)`
			`A[i] = numba();`
			`for (int n = 0; n < CHUNK * 3; ++n)`
			`if (fabsf(fsumf_nonrecursive(A, n) - fsumf_kahan(A, n)) > 1e-3)`
			`exit(7);`
			`}`

			`float nothing(float x) {`
			`return x;`
			`}`

			`float (*barrier)(float) = nothing;`

			`#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \`
			`do { \`
			`struct timespec start = timespec_real(); \`
			`for (int __i = 0; __i < ITERATIONS; ++__i) { \`
			`asm volatile("" ::: "memory"); \`
			`CODE; \`
			`} \`
			`long long work = (WORK_PER_RUN) * (ITERATIONS); \`
			`long nanos = \`
			`(timespec_tonanos(timespec_sub(timespec_real(), start)) + work - 1) / \`
			`(double)work; \`
			`printf("%8ld ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \`
			`} while (0)`

			`int main() {`
			`size_t n = 1024;`
			`float p = (float )malloc(sizeof(float) * n);`
			`for (size_t i = 0; i < n; ++i)`
			`p[i] = numba();`
			`float kahan, gold, linear, logarithmic, nonrecursive;`
			`test_fsumf_nonrecursive();`
			`BENCH(100, 1, (kahan = barrier(fsumf_kahan(p, n))));`
			`BENCH(100, 1, (gold = barrier(fsumf_gold(p, n))));`
			`BENCH(100, 1, (linear = barrier(fsumf_linear(p, n))));`
			`BENCH(100, 1, (logarithmic = barrier(fsumf_logarithmic(p, n))));`
			`BENCH(100, 1, (nonrecursive = barrier(fsumf_nonrecursive(p, n))));`
			`printf("gold = %.12g (%.12g)\n", gold, fabs(gold - gold));`
			`printf("linear = %.12g (%.12g)\n", linear, fabs(linear - gold));`
			`printf("kahan = %.12g (%.12g)\n", kahan, fabs(kahan - gold));`
			`printf("logarithmic = %.12g (%.12g)\n", logarithmic,`
			`fabs(logarithmic - gold));`
			`printf("nonrecursive = %.12g (%.12g)\n", nonrecursive,`
			`fabs(nonrecursive - gold));`
			`free(p);`
			`}`