Check in ruler summation experiments

This commit is contained in:
Justine Tunney 2024-07-29 18:02:16 -07:00
parent 3dab207351
commit 8cdb3e136b
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
3 changed files with 474 additions and 54 deletions

View file

@ -4,16 +4,25 @@
#include "libc/macros.internal.h"
#include "libc/math.h"
#include "libc/mem/gc.h"
#include "libc/mem/leaks.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/x/xasprintf.h"
#define EXPENSIVE_TESTS 0
#define CHUNK 8
#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
#define PORTABLE __target_clones("avx512f,avx")
static unsigned long long lcg = 1;
int rand32(void) {
/* Knuth, D.E., "The Art of Computer Programming," Vol 2,
Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
p. 106 (line 26) & p. 108 */
static unsigned long long lcg = 1;
lcg *= 6364136223846793005;
lcg += 1442695040888963407;
return lcg >> 32;
@ -27,24 +36,14 @@ float numba(void) { // (-1,1)
return float01(rand32()) * 2 - 1;
}
double fsumf_gold(const float *p, size_t n) {
size_t i;
double s;
if (n > 8)
return fsumf_gold(p, n / 2) + fsumf_gold(p + n / 2, n - n / 2);
for (s = i = 0; i < n; ++i)
s += p[i];
return s;
}
float fsumf_linear(const float *p, size_t n) {
float s = 0;
FASTMATH PORTABLE float fsumf_dubble(const float *p, size_t n) {
double s = 0;
for (size_t i = 0; i < n; ++i)
s += p[i];
return s;
}
float fsumf_kahan(const float *p, size_t n) {
PORTABLE float fsumf_kahan(const float *p, size_t n) {
size_t i;
float err, sum, t, y;
sum = err = 0;
@ -57,33 +56,120 @@ float fsumf_kahan(const float *p, size_t n) {
return sum;
}
float fsumf_logarithmic(const float *p, size_t n) {
size_t i;
float s;
if (n > 32)
return fsumf_logarithmic(p, n / 2) +
fsumf_logarithmic(p + n / 2, n - n / 2);
for (s = i = 0; i < n; ++i)
FASTMATH PORTABLE float fsumf_naive(const float *p, size_t n) {
float s = 0;
for (size_t i = 0; i < n; ++i)
s += p[i];
return s;
}
#define fsumf_naive_tester(A, n, tol) \
do { \
float err = fabsf(fsumf_naive(A, n) - fsumf_dubble(A, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)
void test_fsumf_naive(void) {
float *A = new float[2 * 1024 * 1024 + 1];
for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i)
A[i] = numba();
for (size_t n = 0; n < 1024; ++n)
fsumf_naive_tester(A, n, 1e-4);
#if EXPENSIVE_TESTS
fsumf_naive_tester(A, 128 * 1024, 1e-2);
fsumf_naive_tester(A, 256 * 1024, 1e-2);
fsumf_naive_tester(A, 1024 * 1024, 1e-1);
fsumf_naive_tester(A, 1024 * 1024 - 1, 1e-1);
fsumf_naive_tester(A, 1024 * 1024 + 1, 1e-1);
fsumf_naive_tester(A, 2 * 1024 * 1024, 1e-1);
fsumf_naive_tester(A, 2 * 1024 * 1024 - 1, 1e-1);
fsumf_naive_tester(A, 2 * 1024 * 1024 + 1, 1e-1);
#endif
delete[] A;
}
template <int N>
inline float hsum(const float *p) {
forceinline float hsum(const float *p) {
return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);
}
template <>
inline float hsum<1>(const float *p) {
forceinline float hsum<1>(const float *p) {
return *p;
}
#define CHUNK 8
FASTMATH PORTABLE float fsumf_recursive(const float *p, size_t n) {
if (n > 32) {
float x, y;
x = fsumf_recursive(p, n / 2);
y = fsumf_recursive(p + n / 2, n - n / 2);
return x + y;
} else {
float s;
size_t i;
for (s = i = 0; i < n; ++i)
s += p[i];
return s;
}
}
#define OPTIMIZE __attribute__((__optimize__("-O3")))
#define PORTABLE __target_clones("avx512f,avx")
FASTMATH PORTABLE float fsumf_ruler(const float *p, size_t n) {
size_t i, sp = 0;
int rule, step = 2;
float stack[bsr(n / CHUNK + 1) + 1];
for (i = 0; i + CHUNK * 4 <= n; i += CHUNK * 4, step += 2) {
float sum = 0;
for (size_t j = 0; j < CHUNK * 4; ++j)
sum += p[i + j];
for (rule = bsr(step & -step); --rule;)
sum += stack[--sp];
stack[sp++] = sum;
}
float res = 0;
while (sp)
res += stack[--sp];
while (i < n)
res += p[i++];
return res;
}
OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {
#define fsumf_ruler_tester(A, n, tol) \
do { \
float err = fabsf(fsumf_ruler(A, n) - fsumf_dubble(A, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)
void test_fsumf_ruler(void) {
float *A = new float[10 * 1024 * 1024 + 1];
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
A[i] = numba();
fsumf_ruler_tester(A, 96, 1e-6);
for (size_t n = 0; n < 1024; ++n)
fsumf_ruler_tester(A, n, 1e-5);
#if EXPENSIVE_TESTS
fsumf_ruler_tester(A, 128 * 1024, 1e-4);
fsumf_ruler_tester(A, 256 * 1024, 1e-4);
fsumf_ruler_tester(A, 1024 * 1024, 1e-3);
fsumf_ruler_tester(A, 1024 * 1024 - 1, 1e-3);
fsumf_ruler_tester(A, 1024 * 1024 + 1, 1e-3);
fsumf_ruler_tester(A, 2 * 1024 * 1024, 1e-3);
fsumf_ruler_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
fsumf_ruler_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
fsumf_ruler_tester(A, 8 * 1024 * 1024, 1e-3);
fsumf_ruler_tester(A, 10 * 1024 * 1024, 1e-3);
#endif
delete[] A;
}
FASTMATH PORTABLE float fsumf_hefty(const float *p, size_t n) {
unsigned i, par, len = 0;
float sum, res[n / CHUNK + 1];
for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
@ -102,13 +188,35 @@ OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {
return res[0];
}
void test_fsumf_nonrecursive(void) {
float A[CHUNK * 3];
for (int i = 0; i < CHUNK * 3; ++i)
#define fsumf_hefty_tester(A, n, tol) \
do { \
float err = fabsf(fsumf_hefty(A, n) - fsumf_dubble(A, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)
void test_fsumf_hefty(void) {
float *A = new float[10 * 1024 * 1024 + 1];
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
A[i] = numba();
for (int n = 0; n < CHUNK * 3; ++n)
if (fabsf(fsumf_nonrecursive(A, n) - fsumf_kahan(A, n)) > 1e-3)
exit(7);
for (size_t n = 0; n < 1024; ++n)
fsumf_hefty_tester(A, n, 1e-5);
#if EXPENSIVE_TESTS
fsumf_hefty_tester(A, 128 * 1024, 1e-4);
fsumf_hefty_tester(A, 256 * 1024, 1e-4);
fsumf_hefty_tester(A, 1024 * 1024, 1e-3);
fsumf_hefty_tester(A, 1024 * 1024 - 1, 1e-3);
fsumf_hefty_tester(A, 1024 * 1024 + 1, 1e-3);
fsumf_hefty_tester(A, 2 * 1024 * 1024, 1e-3);
fsumf_hefty_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
fsumf_hefty_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
fsumf_hefty_tester(A, 8 * 1024 * 1024, 1e-3);
fsumf_hefty_tester(A, 10 * 1024 * 1024, 1e-3);
#endif
delete[] A;
}
float nothing(float x) {
@ -132,23 +240,34 @@ float (*barrier)(float) = nothing;
} while (0)
int main() {
ShowCrashReports();
#if EXPENSIVE_TESTS
size_t n = 4 * 1024 * 1024;
#else
size_t n = 1024;
float *p = (float *)malloc(sizeof(float) * n);
#endif
float *p = new float[n];
for (size_t i = 0; i < n; ++i)
p[i] = numba();
float kahan, gold, linear, logarithmic, nonrecursive;
test_fsumf_nonrecursive();
BENCH(100, 1, (kahan = barrier(fsumf_kahan(p, n))));
BENCH(100, 1, (gold = barrier(fsumf_gold(p, n))));
BENCH(100, 1, (linear = barrier(fsumf_linear(p, n))));
BENCH(100, 1, (logarithmic = barrier(fsumf_logarithmic(p, n))));
BENCH(100, 1, (nonrecursive = barrier(fsumf_nonrecursive(p, n))));
printf("gold = %.12g (%.12g)\n", gold, fabs(gold - gold));
printf("linear = %.12g (%.12g)\n", linear, fabs(linear - gold));
printf("kahan = %.12g (%.12g)\n", kahan, fabs(kahan - gold));
printf("logarithmic = %.12g (%.12g)\n", logarithmic,
fabs(logarithmic - gold));
printf("nonrecursive = %.12g (%.12g)\n", nonrecursive,
fabs(nonrecursive - gold));
free(p);
float kahan, naive, dubble, recursive, hefty, ruler;
test_fsumf_naive();
test_fsumf_hefty();
test_fsumf_ruler();
BENCH(20, 1, (kahan = barrier(fsumf_kahan(p, n))));
BENCH(20, 1, (dubble = barrier(fsumf_dubble(p, n))));
BENCH(20, 1, (naive = barrier(fsumf_naive(p, n))));
BENCH(20, 1, (recursive = barrier(fsumf_recursive(p, n))));
BENCH(20, 1, (ruler = barrier(fsumf_ruler(p, n))));
BENCH(20, 1, (hefty = barrier(fsumf_hefty(p, n))));
printf("dubble = %f (%g)\n", dubble, fabs(dubble - dubble));
printf("kahan = %f (%g)\n", kahan, fabs(kahan - dubble));
printf("naive = %f (%g)\n", naive, fabs(naive - dubble));
printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
printf("ruler = %f (%g)\n", ruler, fabs(ruler - dubble));
printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble));
delete[] p;
CheckForMemoryLeaks();
}