mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-03-16 05:46:25 +00:00
Check in ruler summation experiments
This commit is contained in:
parent
3dab207351
commit
8cdb3e136b
3 changed files with 474 additions and 54 deletions
|
@ -3,11 +3,17 @@
|
||||||
|
|
||||||
PKGS += TEST_LIBC_TINYMATH
|
PKGS += TEST_LIBC_TINYMATH
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_SRCS := $(wildcard test/libc/tinymath/*.c)
|
TEST_LIBC_TINYMATH_SRCS_C := $(wildcard test/libc/tinymath/*.c)
|
||||||
|
TEST_LIBC_TINYMATH_SRCS_CC := $(wildcard test/libc/tinymath/*.cc)
|
||||||
TEST_LIBC_TINYMATH_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_TINYMATH_SRCS))
|
TEST_LIBC_TINYMATH_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_TINYMATH_SRCS))
|
||||||
|
|
||||||
|
TEST_LIBC_TINYMATH_SRCS = \
|
||||||
|
$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o) \
|
||||||
|
$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_OBJS = \
|
TEST_LIBC_TINYMATH_OBJS = \
|
||||||
$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%.o)
|
$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o) \
|
||||||
|
$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_COMS = \
|
TEST_LIBC_TINYMATH_COMS = \
|
||||||
$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%)
|
$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%)
|
||||||
|
@ -26,19 +32,21 @@ TEST_LIBC_TINYMATH_DIRECTDEPS = \
|
||||||
LIBC_CALLS \
|
LIBC_CALLS \
|
||||||
LIBC_FMT \
|
LIBC_FMT \
|
||||||
LIBC_INTRIN \
|
LIBC_INTRIN \
|
||||||
|
LIBC_LOG \
|
||||||
LIBC_MEM \
|
LIBC_MEM \
|
||||||
LIBC_NEXGEN32E \
|
LIBC_NEXGEN32E \
|
||||||
LIBC_STDIO \
|
|
||||||
LIBC_RUNTIME \
|
LIBC_RUNTIME \
|
||||||
|
LIBC_STDIO \
|
||||||
LIBC_STR \
|
LIBC_STR \
|
||||||
LIBC_SYSV \
|
LIBC_SYSV \
|
||||||
LIBC_TESTLIB \
|
LIBC_TESTLIB \
|
||||||
LIBC_TINYMATH \
|
LIBC_TINYMATH \
|
||||||
LIBC_X \
|
LIBC_X \
|
||||||
THIRD_PARTY_COMPILER_RT \
|
THIRD_PARTY_COMPILER_RT \
|
||||||
THIRD_PARTY_GDTOA \
|
|
||||||
THIRD_PARTY_COMPILER_RT \
|
THIRD_PARTY_COMPILER_RT \
|
||||||
THIRD_PARTY_DOUBLECONVERSION
|
THIRD_PARTY_DOUBLECONVERSION \
|
||||||
|
THIRD_PARTY_GDTOA \
|
||||||
|
THIRD_PARTY_LIBCXX \
|
||||||
|
|
||||||
TEST_LIBC_TINYMATH_DEPS := \
|
TEST_LIBC_TINYMATH_DEPS := \
|
||||||
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
|
||||||
|
@ -60,6 +68,10 @@ $(TEST_LIBC_TINYMATH_OBJS): private \
|
||||||
CFLAGS += \
|
CFLAGS += \
|
||||||
-fno-builtin
|
-fno-builtin
|
||||||
|
|
||||||
|
$(TEST_LIBC_TINYMATH_OBJS): private \
|
||||||
|
CXXFLAGS += \
|
||||||
|
#-ffast-math
|
||||||
|
|
||||||
.PHONY: o/$(MODE)/test/libc/tinymath
|
.PHONY: o/$(MODE)/test/libc/tinymath
|
||||||
o/$(MODE)/test/libc/tinymath: \
|
o/$(MODE)/test/libc/tinymath: \
|
||||||
$(TEST_LIBC_TINYMATH_BINS) \
|
$(TEST_LIBC_TINYMATH_BINS) \
|
||||||
|
|
289
test/libc/tinymath/fdot_test.cc
Normal file
289
test/libc/tinymath/fdot_test.cc
Normal file
|
@ -0,0 +1,289 @@
|
||||||
|
#include "libc/assert.h"
|
||||||
|
#include "libc/calls/struct/timespec.h"
|
||||||
|
#include "libc/intrin/bsr.h"
|
||||||
|
#include "libc/macros.internal.h"
|
||||||
|
#include "libc/math.h"
|
||||||
|
#include "libc/mem/gc.h"
|
||||||
|
#include "libc/mem/leaks.h"
|
||||||
|
#include "libc/mem/mem.h"
|
||||||
|
#include "libc/runtime/runtime.h"
|
||||||
|
#include "libc/stdio/stdio.h"
|
||||||
|
#include "libc/x/xasprintf.h"
|
||||||
|
|
||||||
|
#define EXPENSIVE_TESTS 0
|
||||||
|
|
||||||
|
#define CHUNK 8
|
||||||
|
|
||||||
|
#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
|
||||||
|
#define PORTABLE __target_clones("avx512f,avx")
|
||||||
|
|
||||||
|
static unsigned long long lcg = 1;
|
||||||
|
|
||||||
|
int rand32(void) {
|
||||||
|
/* Knuth, D.E., "The Art of Computer Programming," Vol 2,
|
||||||
|
Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
|
||||||
|
p. 106 (line 26) & p. 108 */
|
||||||
|
lcg *= 6364136223846793005;
|
||||||
|
lcg += 1442695040888963407;
|
||||||
|
return lcg >> 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
float float01(unsigned x) { // (0,1)
|
||||||
|
return 1.f / 8388608 * ((x >> 9) + .5f);
|
||||||
|
}
|
||||||
|
|
||||||
|
float numba(void) { // (-1,1)
|
||||||
|
return float01(rand32()) * 2 - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
PORTABLE float fdotf_dubble(const float *A, const float *B, size_t n) {
|
||||||
|
double s = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i)
|
||||||
|
s = fma(A[i], B[i], s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
float fdotf_kahan(const float *A, const float *B, size_t n) {
|
||||||
|
size_t i;
|
||||||
|
float err, sum, t, y;
|
||||||
|
sum = err = 0;
|
||||||
|
for (i = 0; i < n; ++i) {
|
||||||
|
y = A[i] * B[i] - err;
|
||||||
|
t = sum + y;
|
||||||
|
err = (t - sum) - y;
|
||||||
|
sum = t;
|
||||||
|
}
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
float fdotf_naive(const float *A, const float *B, size_t n) {
|
||||||
|
float s = 0;
|
||||||
|
for (size_t i = 0; i < n; ++i)
|
||||||
|
s = fmaf(A[i], B[i], s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define fdotf_naive_tester(A, B, n, tol) \
|
||||||
|
do { \
|
||||||
|
float err = fabsf(fdotf_naive(A, B, n) - fdotf_dubble(A, B, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fdotf_naive(void) {
|
||||||
|
float *A = new float[2 * 1024 * 1024 + 1];
|
||||||
|
float *B = new float[2 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i) {
|
||||||
|
A[i] = numba();
|
||||||
|
B[i] = numba();
|
||||||
|
}
|
||||||
|
for (size_t n = 0; n < 1024; ++n)
|
||||||
|
fdotf_naive_tester(A, B, n, 1e-4);
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
fdotf_naive_tester(A, B, 128 * 1024, 1e-2);
|
||||||
|
fdotf_naive_tester(A, B, 256 * 1024, 1e-2);
|
||||||
|
fdotf_naive_tester(A, B, 1024 * 1024, 1e-1);
|
||||||
|
fdotf_naive_tester(A, B, 1024 * 1024 - 1, 1e-1);
|
||||||
|
fdotf_naive_tester(A, B, 1024 * 1024 + 1, 1e-1);
|
||||||
|
fdotf_naive_tester(A, B, 2 * 1024 * 1024, 1e-1);
|
||||||
|
fdotf_naive_tester(A, B, 2 * 1024 * 1024 - 1, 1e-1);
|
||||||
|
fdotf_naive_tester(A, B, 2 * 1024 * 1024 + 1, 1e-1);
|
||||||
|
#endif
|
||||||
|
delete[] B;
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int N>
|
||||||
|
forceinline float hdot(const float *A, const float *B) {
|
||||||
|
return hdot<N / 2>(A, B) + hdot<N / 2>(A + N / 2, B + N / 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
forceinline float hdot<1>(const float *A, const float *B) {
|
||||||
|
return A[0] * B[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
float fdotf_recursive(const float *A, const float *B, size_t n) {
|
||||||
|
if (n > 32) {
|
||||||
|
float x, y;
|
||||||
|
x = fdotf_recursive(A, B, n / 2);
|
||||||
|
y = fdotf_recursive(A + n / 2, B + n / 2, n - n / 2);
|
||||||
|
return x + y;
|
||||||
|
} else {
|
||||||
|
float s;
|
||||||
|
size_t i;
|
||||||
|
for (s = i = 0; i < n; ++i)
|
||||||
|
s = fmaf(A[i], B[i], s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
FASTMATH float fdotf_ruler(const float *A, const float *B, size_t n) {
|
||||||
|
int rule, step = 2;
|
||||||
|
size_t chunk, sp = 0;
|
||||||
|
float stack[bsr(n / CHUNK + 1) + 1];
|
||||||
|
for (chunk = 0; chunk + CHUNK * 4 <= n; chunk += CHUNK * 4, step += 2) {
|
||||||
|
float sum = 0;
|
||||||
|
for (size_t elem = 0; elem < CHUNK * 4; ++elem)
|
||||||
|
sum += A[chunk + elem] * B[chunk + elem];
|
||||||
|
for (rule = bsr(step & -step); --rule;)
|
||||||
|
sum += stack[--sp];
|
||||||
|
stack[sp++] = sum;
|
||||||
|
}
|
||||||
|
float res = 0;
|
||||||
|
while (sp)
|
||||||
|
res += stack[--sp];
|
||||||
|
for (; chunk < n; ++chunk)
|
||||||
|
res += A[chunk] * B[chunk];
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define fdotf_ruler_tester(A, B, n, tol) \
|
||||||
|
do { \
|
||||||
|
float err = fabsf(fdotf_ruler(A, B, n) - fdotf_dubble(A, B, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fdotf_ruler(void) {
|
||||||
|
float *A = new float[10 * 1024 * 1024 + 1];
|
||||||
|
float *B = new float[10 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
|
||||||
|
A[i] = numba();
|
||||||
|
B[i] = numba();
|
||||||
|
}
|
||||||
|
fdotf_ruler_tester(A, B, 96, 1e-6);
|
||||||
|
for (size_t n = 0; n < 4096; ++n)
|
||||||
|
fdotf_ruler_tester(A, B, n, 1e-5);
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
fdotf_ruler_tester(A, B, 128 * 1024, 1e-4);
|
||||||
|
fdotf_ruler_tester(A, B, 256 * 1024, 1e-4);
|
||||||
|
fdotf_ruler_tester(A, B, 1024 * 1024, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 1024 * 1024 - 1, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 1024 * 1024 + 1, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 2 * 1024 * 1024, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 8 * 1024 * 1024, 1e-3);
|
||||||
|
fdotf_ruler_tester(A, B, 10 * 1024 * 1024, 1e-3);
|
||||||
|
#endif
|
||||||
|
delete[] B;
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
|
|
||||||
|
PORTABLE float fdotf_hefty(const float *A, const float *B, size_t n) {
|
||||||
|
unsigned i, par, len = 0;
|
||||||
|
float sum, res[n / CHUNK + 1];
|
||||||
|
for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
|
||||||
|
res[len++] = hdot<CHUNK>(A + i, B + i);
|
||||||
|
if (i < n) {
|
||||||
|
for (sum = 0; i < n; i++)
|
||||||
|
sum = fmaf(A[i], B[i], sum);
|
||||||
|
res[len++] = sum;
|
||||||
|
}
|
||||||
|
for (par = len >> 1; par; par >>= 1, len >>= 1) {
|
||||||
|
for (i = 0; i < par; ++i)
|
||||||
|
res[i] += res[par + i];
|
||||||
|
if (len & 1)
|
||||||
|
res[par - 1] += res[len - 1];
|
||||||
|
}
|
||||||
|
return res[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
#define fdotf_hefty_tester(A, B, n, tol) \
|
||||||
|
do { \
|
||||||
|
float err = fabsf(fdotf_hefty(A, B, n) - fdotf_dubble(A, B, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fdotf_hefty(void) {
|
||||||
|
float *A = new float[10 * 1024 * 1024 + 1];
|
||||||
|
float *B = new float[10 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
|
||||||
|
A[i] = numba();
|
||||||
|
B[i] = numba();
|
||||||
|
}
|
||||||
|
for (size_t n = 0; n < 1024; ++n)
|
||||||
|
fdotf_hefty_tester(A, B, n, 1e-5);
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
fdotf_hefty_tester(A, B, 128 * 1024, 1e-4);
|
||||||
|
fdotf_hefty_tester(A, B, 256 * 1024, 1e-4);
|
||||||
|
fdotf_hefty_tester(A, B, 1024 * 1024, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 1024 * 1024 - 1, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 1024 * 1024 + 1, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 2 * 1024 * 1024, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 8 * 1024 * 1024, 1e-3);
|
||||||
|
fdotf_hefty_tester(A, B, 10 * 1024 * 1024, 1e-3);
|
||||||
|
#endif
|
||||||
|
delete[] B;
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
|
|
||||||
|
float nothing(float x) {
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
float (*barrier)(float) = nothing;
|
||||||
|
|
||||||
|
#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
|
||||||
|
do { \
|
||||||
|
struct timespec start = timespec_real(); \
|
||||||
|
for (int __i = 0; __i < ITERATIONS; ++__i) { \
|
||||||
|
asm volatile("" ::: "memory"); \
|
||||||
|
CODE; \
|
||||||
|
} \
|
||||||
|
long long work = (WORK_PER_RUN) * (ITERATIONS); \
|
||||||
|
long nanos = \
|
||||||
|
(timespec_tonanos(timespec_sub(timespec_real(), start)) + work - 1) / \
|
||||||
|
(double)work; \
|
||||||
|
printf("%8ld ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
ShowCrashReports();
|
||||||
|
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
size_t n = 512 * 1024;
|
||||||
|
#else
|
||||||
|
size_t n = 1024;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
float *A = new float[n];
|
||||||
|
float *B = new float[n];
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
A[i] = numba();
|
||||||
|
B[i] = numba();
|
||||||
|
}
|
||||||
|
float kahan, naive, dubble, recursive, hefty, ruler;
|
||||||
|
test_fdotf_naive();
|
||||||
|
test_fdotf_hefty();
|
||||||
|
test_fdotf_ruler();
|
||||||
|
BENCH(20, 1, (kahan = barrier(fdotf_kahan(A, B, n))));
|
||||||
|
BENCH(20, 1, (dubble = barrier(fdotf_dubble(A, B, n))));
|
||||||
|
BENCH(20, 1, (naive = barrier(fdotf_naive(A, B, n))));
|
||||||
|
BENCH(20, 1, (recursive = barrier(fdotf_recursive(A, B, n))));
|
||||||
|
BENCH(20, 1, (ruler = barrier(fdotf_ruler(A, B, n))));
|
||||||
|
BENCH(20, 1, (hefty = barrier(fdotf_hefty(A, B, n))));
|
||||||
|
printf("dubble = %f (%g)\n", dubble, fabs(dubble - dubble));
|
||||||
|
printf("kahan = %f (%g)\n", kahan, fabs(kahan - dubble));
|
||||||
|
printf("naive = %f (%g)\n", naive, fabs(naive - dubble));
|
||||||
|
printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
|
||||||
|
printf("ruler = %f (%g)\n", ruler, fabs(ruler - dubble));
|
||||||
|
printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble));
|
||||||
|
delete[] B;
|
||||||
|
delete[] A;
|
||||||
|
|
||||||
|
CheckForMemoryLeaks();
|
||||||
|
}
|
|
@ -4,16 +4,25 @@
|
||||||
#include "libc/macros.internal.h"
|
#include "libc/macros.internal.h"
|
||||||
#include "libc/math.h"
|
#include "libc/math.h"
|
||||||
#include "libc/mem/gc.h"
|
#include "libc/mem/gc.h"
|
||||||
|
#include "libc/mem/leaks.h"
|
||||||
#include "libc/mem/mem.h"
|
#include "libc/mem/mem.h"
|
||||||
#include "libc/runtime/runtime.h"
|
#include "libc/runtime/runtime.h"
|
||||||
#include "libc/stdio/stdio.h"
|
#include "libc/stdio/stdio.h"
|
||||||
#include "libc/x/xasprintf.h"
|
#include "libc/x/xasprintf.h"
|
||||||
|
|
||||||
|
#define EXPENSIVE_TESTS 0
|
||||||
|
|
||||||
|
#define CHUNK 8
|
||||||
|
|
||||||
|
#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
|
||||||
|
#define PORTABLE __target_clones("avx512f,avx")
|
||||||
|
|
||||||
|
static unsigned long long lcg = 1;
|
||||||
|
|
||||||
int rand32(void) {
|
int rand32(void) {
|
||||||
/* Knuth, D.E., "The Art of Computer Programming," Vol 2,
|
/* Knuth, D.E., "The Art of Computer Programming," Vol 2,
|
||||||
Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
|
Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
|
||||||
p. 106 (line 26) & p. 108 */
|
p. 106 (line 26) & p. 108 */
|
||||||
static unsigned long long lcg = 1;
|
|
||||||
lcg *= 6364136223846793005;
|
lcg *= 6364136223846793005;
|
||||||
lcg += 1442695040888963407;
|
lcg += 1442695040888963407;
|
||||||
return lcg >> 32;
|
return lcg >> 32;
|
||||||
|
@ -27,24 +36,14 @@ float numba(void) { // (-1,1)
|
||||||
return float01(rand32()) * 2 - 1;
|
return float01(rand32()) * 2 - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
double fsumf_gold(const float *p, size_t n) {
|
FASTMATH PORTABLE float fsumf_dubble(const float *p, size_t n) {
|
||||||
size_t i;
|
double s = 0;
|
||||||
double s;
|
|
||||||
if (n > 8)
|
|
||||||
return fsumf_gold(p, n / 2) + fsumf_gold(p + n / 2, n - n / 2);
|
|
||||||
for (s = i = 0; i < n; ++i)
|
|
||||||
s += p[i];
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
float fsumf_linear(const float *p, size_t n) {
|
|
||||||
float s = 0;
|
|
||||||
for (size_t i = 0; i < n; ++i)
|
for (size_t i = 0; i < n; ++i)
|
||||||
s += p[i];
|
s += p[i];
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
float fsumf_kahan(const float *p, size_t n) {
|
PORTABLE float fsumf_kahan(const float *p, size_t n) {
|
||||||
size_t i;
|
size_t i;
|
||||||
float err, sum, t, y;
|
float err, sum, t, y;
|
||||||
sum = err = 0;
|
sum = err = 0;
|
||||||
|
@ -57,33 +56,120 @@ float fsumf_kahan(const float *p, size_t n) {
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
float fsumf_logarithmic(const float *p, size_t n) {
|
FASTMATH PORTABLE float fsumf_naive(const float *p, size_t n) {
|
||||||
size_t i;
|
float s = 0;
|
||||||
float s;
|
for (size_t i = 0; i < n; ++i)
|
||||||
if (n > 32)
|
|
||||||
return fsumf_logarithmic(p, n / 2) +
|
|
||||||
fsumf_logarithmic(p + n / 2, n - n / 2);
|
|
||||||
for (s = i = 0; i < n; ++i)
|
|
||||||
s += p[i];
|
s += p[i];
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define fsumf_naive_tester(A, n, tol) \
|
||||||
|
do { \
|
||||||
|
float err = fabsf(fsumf_naive(A, n) - fsumf_dubble(A, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fsumf_naive(void) {
|
||||||
|
float *A = new float[2 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i)
|
||||||
|
A[i] = numba();
|
||||||
|
for (size_t n = 0; n < 1024; ++n)
|
||||||
|
fsumf_naive_tester(A, n, 1e-4);
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
fsumf_naive_tester(A, 128 * 1024, 1e-2);
|
||||||
|
fsumf_naive_tester(A, 256 * 1024, 1e-2);
|
||||||
|
fsumf_naive_tester(A, 1024 * 1024, 1e-1);
|
||||||
|
fsumf_naive_tester(A, 1024 * 1024 - 1, 1e-1);
|
||||||
|
fsumf_naive_tester(A, 1024 * 1024 + 1, 1e-1);
|
||||||
|
fsumf_naive_tester(A, 2 * 1024 * 1024, 1e-1);
|
||||||
|
fsumf_naive_tester(A, 2 * 1024 * 1024 - 1, 1e-1);
|
||||||
|
fsumf_naive_tester(A, 2 * 1024 * 1024 + 1, 1e-1);
|
||||||
|
#endif
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
|
|
||||||
template <int N>
|
template <int N>
|
||||||
inline float hsum(const float *p) {
|
forceinline float hsum(const float *p) {
|
||||||
return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);
|
return hsum<N / 2>(p) + hsum<N / 2>(p + N / 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
inline float hsum<1>(const float *p) {
|
forceinline float hsum<1>(const float *p) {
|
||||||
return *p;
|
return *p;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define CHUNK 8
|
FASTMATH PORTABLE float fsumf_recursive(const float *p, size_t n) {
|
||||||
|
if (n > 32) {
|
||||||
|
float x, y;
|
||||||
|
x = fsumf_recursive(p, n / 2);
|
||||||
|
y = fsumf_recursive(p + n / 2, n - n / 2);
|
||||||
|
return x + y;
|
||||||
|
} else {
|
||||||
|
float s;
|
||||||
|
size_t i;
|
||||||
|
for (s = i = 0; i < n; ++i)
|
||||||
|
s += p[i];
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#define OPTIMIZE __attribute__((__optimize__("-O3")))
|
FASTMATH PORTABLE float fsumf_ruler(const float *p, size_t n) {
|
||||||
#define PORTABLE __target_clones("avx512f,avx")
|
size_t i, sp = 0;
|
||||||
|
int rule, step = 2;
|
||||||
|
float stack[bsr(n / CHUNK + 1) + 1];
|
||||||
|
for (i = 0; i + CHUNK * 4 <= n; i += CHUNK * 4, step += 2) {
|
||||||
|
float sum = 0;
|
||||||
|
for (size_t j = 0; j < CHUNK * 4; ++j)
|
||||||
|
sum += p[i + j];
|
||||||
|
for (rule = bsr(step & -step); --rule;)
|
||||||
|
sum += stack[--sp];
|
||||||
|
stack[sp++] = sum;
|
||||||
|
}
|
||||||
|
float res = 0;
|
||||||
|
while (sp)
|
||||||
|
res += stack[--sp];
|
||||||
|
while (i < n)
|
||||||
|
res += p[i++];
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {
|
#define fsumf_ruler_tester(A, n, tol) \
|
||||||
|
do { \
|
||||||
|
float err = fabsf(fsumf_ruler(A, n) - fsumf_dubble(A, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fsumf_ruler(void) {
|
||||||
|
float *A = new float[10 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
|
||||||
|
A[i] = numba();
|
||||||
|
fsumf_ruler_tester(A, 96, 1e-6);
|
||||||
|
for (size_t n = 0; n < 1024; ++n)
|
||||||
|
fsumf_ruler_tester(A, n, 1e-5);
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
fsumf_ruler_tester(A, 128 * 1024, 1e-4);
|
||||||
|
fsumf_ruler_tester(A, 256 * 1024, 1e-4);
|
||||||
|
fsumf_ruler_tester(A, 1024 * 1024, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 1024 * 1024 - 1, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 1024 * 1024 + 1, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 2 * 1024 * 1024, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 8 * 1024 * 1024, 1e-3);
|
||||||
|
fsumf_ruler_tester(A, 10 * 1024 * 1024, 1e-3);
|
||||||
|
#endif
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
|
|
||||||
|
FASTMATH PORTABLE float fsumf_hefty(const float *p, size_t n) {
|
||||||
unsigned i, par, len = 0;
|
unsigned i, par, len = 0;
|
||||||
float sum, res[n / CHUNK + 1];
|
float sum, res[n / CHUNK + 1];
|
||||||
for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
|
for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
|
||||||
|
@ -102,13 +188,35 @@ OPTIMIZE PORTABLE float fsumf_nonrecursive(const float *p, size_t n) {
|
||||||
return res[0];
|
return res[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
void test_fsumf_nonrecursive(void) {
|
#define fsumf_hefty_tester(A, n, tol) \
|
||||||
float A[CHUNK * 3];
|
do { \
|
||||||
for (int i = 0; i < CHUNK * 3; ++i)
|
float err = fabsf(fsumf_hefty(A, n) - fsumf_dubble(A, n)); \
|
||||||
|
if (err > tol) { \
|
||||||
|
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
|
||||||
|
err); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
void test_fsumf_hefty(void) {
|
||||||
|
float *A = new float[10 * 1024 * 1024 + 1];
|
||||||
|
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i)
|
||||||
A[i] = numba();
|
A[i] = numba();
|
||||||
for (int n = 0; n < CHUNK * 3; ++n)
|
for (size_t n = 0; n < 1024; ++n)
|
||||||
if (fabsf(fsumf_nonrecursive(A, n) - fsumf_kahan(A, n)) > 1e-3)
|
fsumf_hefty_tester(A, n, 1e-5);
|
||||||
exit(7);
|
#if EXPENSIVE_TESTS
|
||||||
|
fsumf_hefty_tester(A, 128 * 1024, 1e-4);
|
||||||
|
fsumf_hefty_tester(A, 256 * 1024, 1e-4);
|
||||||
|
fsumf_hefty_tester(A, 1024 * 1024, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 1024 * 1024 - 1, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 1024 * 1024 + 1, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 2 * 1024 * 1024, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 2 * 1024 * 1024 - 1, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 2 * 1024 * 1024 + 1, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 8 * 1024 * 1024, 1e-3);
|
||||||
|
fsumf_hefty_tester(A, 10 * 1024 * 1024, 1e-3);
|
||||||
|
#endif
|
||||||
|
delete[] A;
|
||||||
}
|
}
|
||||||
|
|
||||||
float nothing(float x) {
|
float nothing(float x) {
|
||||||
|
@ -132,23 +240,34 @@ float (*barrier)(float) = nothing;
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
ShowCrashReports();
|
||||||
|
|
||||||
|
#if EXPENSIVE_TESTS
|
||||||
|
size_t n = 4 * 1024 * 1024;
|
||||||
|
#else
|
||||||
size_t n = 1024;
|
size_t n = 1024;
|
||||||
float *p = (float *)malloc(sizeof(float) * n);
|
#endif
|
||||||
|
|
||||||
|
float *p = new float[n];
|
||||||
for (size_t i = 0; i < n; ++i)
|
for (size_t i = 0; i < n; ++i)
|
||||||
p[i] = numba();
|
p[i] = numba();
|
||||||
float kahan, gold, linear, logarithmic, nonrecursive;
|
float kahan, naive, dubble, recursive, hefty, ruler;
|
||||||
test_fsumf_nonrecursive();
|
test_fsumf_naive();
|
||||||
BENCH(100, 1, (kahan = barrier(fsumf_kahan(p, n))));
|
test_fsumf_hefty();
|
||||||
BENCH(100, 1, (gold = barrier(fsumf_gold(p, n))));
|
test_fsumf_ruler();
|
||||||
BENCH(100, 1, (linear = barrier(fsumf_linear(p, n))));
|
BENCH(20, 1, (kahan = barrier(fsumf_kahan(p, n))));
|
||||||
BENCH(100, 1, (logarithmic = barrier(fsumf_logarithmic(p, n))));
|
BENCH(20, 1, (dubble = barrier(fsumf_dubble(p, n))));
|
||||||
BENCH(100, 1, (nonrecursive = barrier(fsumf_nonrecursive(p, n))));
|
BENCH(20, 1, (naive = barrier(fsumf_naive(p, n))));
|
||||||
printf("gold = %.12g (%.12g)\n", gold, fabs(gold - gold));
|
BENCH(20, 1, (recursive = barrier(fsumf_recursive(p, n))));
|
||||||
printf("linear = %.12g (%.12g)\n", linear, fabs(linear - gold));
|
BENCH(20, 1, (ruler = barrier(fsumf_ruler(p, n))));
|
||||||
printf("kahan = %.12g (%.12g)\n", kahan, fabs(kahan - gold));
|
BENCH(20, 1, (hefty = barrier(fsumf_hefty(p, n))));
|
||||||
printf("logarithmic = %.12g (%.12g)\n", logarithmic,
|
printf("dubble = %f (%g)\n", dubble, fabs(dubble - dubble));
|
||||||
fabs(logarithmic - gold));
|
printf("kahan = %f (%g)\n", kahan, fabs(kahan - dubble));
|
||||||
printf("nonrecursive = %.12g (%.12g)\n", nonrecursive,
|
printf("naive = %f (%g)\n", naive, fabs(naive - dubble));
|
||||||
fabs(nonrecursive - gold));
|
printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
|
||||||
free(p);
|
printf("ruler = %f (%g)\n", ruler, fabs(ruler - dubble));
|
||||||
|
printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble));
|
||||||
|
delete[] p;
|
||||||
|
|
||||||
|
CheckForMemoryLeaks();
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue