diff --git a/dsp/core/core.h b/dsp/core/core.h index d054201cb..2619d1af6 100644 --- a/dsp/core/core.h +++ b/dsp/core/core.h @@ -9,14 +9,13 @@ int mulaw(int); int unmulaw(int); void *double2byte(long, const void *, double, double) vallocesque; void *byte2double(long, const void *, double, double) vallocesque; -void *dct(float[restrict hasatleast 8][8], unsigned, float, float, float, float, - float); -void *dctjpeg(float[restrict hasatleast 8][8], unsigned); +void *dct(float[hasatleast 8][8], unsigned, float, float, float, float, float); +void *dctjpeg(float[hasatleast 8][8], unsigned); double det3(const double[3][3]) nosideeffect; -void *inv3(double[restrict 3][3], const double[restrict 3][3], double); -void *matmul3(double[restrict 3][3], const double[3][3], const double[3][3]); -void *vmatmul3(double[restrict 3], const double[3], const double[3][3]); -void *matvmul3(double[restrict 3], const double[3][3], const double[3]); +void *inv3(double[3][3], const double[3][3], double); +void *matmul3(double[3][3], const double[3][3], const double[3][3]); +void *vmatmul3(double[3], const double[3], const double[3][3]); +void *matvmul3(double[3], const double[3][3], const double[3]); double rgb2stdtv(double) pureconst; double rgb2lintv(double) pureconst; double rgb2stdpc(double, double) pureconst; diff --git a/dsp/core/dct.c b/dsp/core/dct.c index 68d5c3bdc..5647ffefb 100644 --- a/dsp/core/dct.c +++ b/dsp/core/dct.c @@ -65,8 +65,8 @@ * * @cost ~100ns */ -void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0, - float c1, float c2, float c3, float c4) { +void *dct(float M[hasatleast 8][8], unsigned stride, float c0, float c1, + float c2, float c3, float c4) { unsigned y, x; for (y = 0; y < stride * 8; y += stride) { DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7], @@ -79,7 +79,7 @@ void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0, return M; } -void *dctjpeg(float M[restrict hasatleast 8][8], unsigned stride) { +void *dctjpeg(float M[hasatleast 8][8], unsigned stride) { return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f, .707106781f); } diff --git a/dsp/core/inv3.c b/dsp/core/inv3.c index 548da5299..7e4828068 100644 --- a/dsp/core/inv3.c +++ b/dsp/core/inv3.c @@ -30,7 +30,7 @@ * @define ๐€โปยน=๐ such that ๐€ร—๐=๐ร—๐€=๐ˆโ‚™ * @see det3() */ -void *inv3(double B[restrict 3][3], const double A[restrict 3][3], double d) { +void *inv3(double B[3][3], const double A[3][3], double d) { d = d ? 1 / d : NAN; B[0][0] = (A[1][1] * A[2][2] - A[2][1] * A[1][2]) * d; B[0][1] = (A[2][1] * A[0][2] - A[0][1] * A[2][2]) * d; diff --git a/dsp/core/matmul3.c b/dsp/core/matmul3.c index 6b309f4ce..e6a06448f 100644 --- a/dsp/core/matmul3.c +++ b/dsp/core/matmul3.c @@ -22,8 +22,7 @@ /** * Multiplies 3ร—3 matrices. */ -void *matmul3(double R[restrict 3][3], const double A[3][3], - const double B[3][3]) { +void *matmul3(double R[3][3], const double A[3][3], const double B[3][3]) { int i, j, k; memset(R, 0, sizeof(double) * 3 * 3); for (i = 0; i < 3; ++i) { diff --git a/dsp/core/matvmul3.c b/dsp/core/matvmul3.c index 197d1ec3b..7bc99864a 100644 --- a/dsp/core/matvmul3.c +++ b/dsp/core/matvmul3.c @@ -23,7 +23,7 @@ * * @see vmatmul3() for noncommutative corollary */ -void *matvmul3(double R[restrict 3], const double M[3][3], const double V[3]) { +void *matvmul3(double R[3], const double M[3][3], const double V[3]) { R[0] = V[0] * M[0][0] + V[1] * M[0][1] + V[2] * M[0][2]; R[1] = V[0] * M[1][0] + V[1] * M[1][1] + V[2] * M[1][2]; R[2] = V[0] * M[2][0] + V[1] * M[2][1] + V[2] * M[2][2]; diff --git a/dsp/core/vmatmul3.c b/dsp/core/vmatmul3.c index fcf58bbaf..ed377b4d4 100644 --- a/dsp/core/vmatmul3.c +++ b/dsp/core/vmatmul3.c @@ -23,7 +23,7 @@ * * @see matvmul3() for noncommutative corollary */ -void *vmatmul3(double R[restrict 3], const double V[3], const double M[3][3]) { +void *vmatmul3(double R[3], const double V[3], const double M[3][3]) { R[0] = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0]; R[1] = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1]; R[2] = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2]; diff --git a/dsp/mpeg/macroblock.c b/dsp/mpeg/macroblock.c index 3639011c3..783f963bc 100644 --- a/dsp/mpeg/macroblock.c +++ b/dsp/mpeg/macroblock.c @@ -31,9 +31,8 @@ #include "dsp/mpeg/video.h" #include "libc/log/check.h" -forceinline void plm_video_process_macroblock(plm_video_t *self, - uint8_t *restrict d, - uint8_t *restrict s, int motion_h, +forceinline void plm_video_process_macroblock(plm_video_t *self, uint8_t *d, + uint8_t *s, int motion_h, int motion_v, bool interpolate, unsigned BW) { unsigned si, di, max_address; @@ -155,17 +154,17 @@ forceinline void plm_video_process_macroblock(plm_video_t *self, } } -void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *restrict d, - uint8_t *restrict s, int motion_h, - int motion_v, bool interpolate) { +void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *d, uint8_t *s, + int motion_h, int motion_v, + bool interpolate) { DCHECK_ALIGNED(8, d); DCHECK_ALIGNED(8, s); plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8); } -void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *restrict d, - uint8_t *restrict s, int motion_h, - int motion_v, bool interpolate) { +void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *d, uint8_t *s, + int motion_h, int motion_v, + bool interpolate) { DCHECK_ALIGNED(16, d); DCHECK_ALIGNED(16, s); plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16); diff --git a/dsp/mpeg/video.h b/dsp/mpeg/video.h index 570ee94ea..d2ed05181 100644 --- a/dsp/mpeg/video.h +++ b/dsp/mpeg/video.h @@ -51,10 +51,10 @@ typedef struct plm_video_t { int assume_no_b_frames; } plm_video_t; -void plm_video_process_macroblock_8(plm_video_t *, uint8_t *restrict, - uint8_t *restrict, int, int, bool); -void plm_video_process_macroblock_16(plm_video_t *, uint8_t *restrict, - uint8_t *restrict, int, int, bool); +void plm_video_process_macroblock_8(plm_video_t *, uint8_t *, uint8_t *, int, + int, bool); +void plm_video_process_macroblock_16(plm_video_t *, uint8_t *, uint8_t *, int, + int, bool); COSMOPOLITAN_C_END_ #endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */ diff --git a/dsp/scale/gyarados.c b/dsp/scale/gyarados.c index 4487b3d79..61beace01 100644 --- a/dsp/scale/gyarados.c +++ b/dsp/scale/gyarados.c @@ -53,11 +53,11 @@ struct SamplingSolution { static double ComputeWeight(double x) { if (-1.5 < x && x < 1.5) { if (-.5 < x && x < .5) { - return .75 - SQR(x); + return.75 - SQR(x); } else if (x < 0) { - return .5 * SQR(x + 1.5); + return.5 * SQR(x + 1.5); } else { - return .5 * SQR(x - 1.5); + return.5 * SQR(x - 1.5); } } else { return 0; @@ -149,12 +149,11 @@ static int Sharpen(int ax, int bx, int cx) { static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw, long sxw, const int src[syw][sxw], long dyn, long dxn, - long syn, long sxn, int tmp0[restrict dyn][sxn], - int tmp1[restrict dyn][sxn], - int tmp2[restrict dyn][dxn], long yfn, long xfn, - const short fyi[dyn][yfn], const short fyw[dyn][yfn], - const short fxi[dxn][xfn], const short fxw[dxn][xfn], - bool sharpen) { + long syn, long sxn, int tmp0[dyn][sxn], + int tmp1[dyn][sxn], int tmp2[dyn][dxn], long yfn, + long xfn, const short fyi[dyn][yfn], + const short fyw[dyn][yfn], const short fxi[dxn][xfn], + const short fxw[dxn][xfn], bool sharpen) { long i; int eax, dy, dx, sx; for (sx = 0; sx < sxn; ++sx) { diff --git a/dsp/scale/magikarp.c b/dsp/scale/magikarp.c index ea5aa7d55..ca5d3cf8c 100644 --- a/dsp/scale/magikarp.c +++ b/dsp/scale/magikarp.c @@ -106,9 +106,9 @@ void *Magkern2xY(long ys, long xs, unsigned char p[ys][xs], long yn, long xn) { return p; } -void *MagikarpY(long dys, long dxs, unsigned char d[restrict dys][dxs], - long sys, long sxs, const unsigned char s[sys][sxs], long yn, - long xn, const signed char K[8]) { +void *MagikarpY(long dys, long dxs, unsigned char d[dys][dxs], long sys, + long sxs, const unsigned char s[sys][sxs], long yn, long xn, + const signed char K[8]) { long y, x; for (y = 0; y < yn; ++y) { for (x = 0; x < xn; ++x) { diff --git a/examples/blas.cc b/examples/blas.cc index 5bcc4b2ba..70c32c451 100644 --- a/examples/blas.cc +++ b/examples/blas.cc @@ -135,7 +135,7 @@ void sgemm(int m, int n, int k, // const float *B, int ldb, // float *C, int ldc) { int nth = sysconf(_SC_NPROCESSORS_ONLN); -#pragma omp parallel for +#pragma omp parallel for schedule(static) for (int ith = 0; ith < nth; ++ith) { ansiBLAS tb{k, A, lda, B, ldb, C, ldc, ith, nth}; tb.matmul(m, n); diff --git a/libc/cosmo.h b/libc/cosmo.h index 9788bb3ca..af2dc289c 100644 --- a/libc/cosmo.h +++ b/libc/cosmo.h @@ -2,7 +2,13 @@ #define COSMOPOLITAN_LIBC_COSMO_H_ COSMOPOLITAN_C_START_ -errno_t cosmo_once(_Atomic(unsigned) *, void (*)(void)) libcesque; +#ifndef __cplusplus +#define _COSMO_ATOMIC(x) _Atomic(x) +#else +#define _COSMO_ATOMIC(x) x +#endif + +errno_t cosmo_once(_COSMO_ATOMIC(unsigned) *, void (*)(void)) libcesque; int systemvpe(const char *, char *const[], char *const[]) libcesque; char *GetProgramExecutableName(void) libcesque; void unleaf(void) libcesque; diff --git a/libc/integral/c.inc b/libc/integral/c.inc index 491aff579..c3638a08b 100644 --- a/libc/integral/c.inc +++ b/libc/integral/c.inc @@ -1,7 +1,3 @@ -#if __GNUC__ + 0 < 2 -#define __attribute__(x) -#endif - #ifndef __cplusplus #define COSMOPOLITAN_C_START_ #define COSMOPOLITAN_C_END_ @@ -10,73 +6,16 @@ #define COSMOPOLITAN_CXX_USING_ #endif -#ifndef __ia16__ -#define __far -#endif - #if !defined(__GNUC__) && __cplusplus + 0 >= 201103L #define typeof(x) decltype(x) #elif !defined(__GNUC__) && __STDC_VERSION__ + 0 < 201112 #define typeof(x) __typeof(x) #endif -#ifdef __cplusplus -#if __cplusplus >= 201103L -#define _Alignof(x) alignof(x) -#endif /* C++11 */ -#else /* __cplusplus */ -#if __STDC_VERSION__ + 0 < 201112 -#if __GNUC__ + _MSC_VER + 0 -#define _Alignof(x) __alignof(x) -#else -#define _Alignof(x) /* basically all it ever did lool */ sizeof(x) -#endif /* GNU/MSVC/!ANSI */ -#endif /* C11 */ -#endif /* __cplusplus */ - -#if !defined(__cplusplus) && !defined(inline) && __STDC_VERSION__ + 0 < 199901 -#if defined(__GNUC__) || defined(_MSC_VER) -#define inline __inline -#else -#define inline -#define __inline -#endif -#endif - #ifdef __chibicc__ #define __extension__ #endif -#if __STDC_VERSION__ + 0 < 201112 -#ifdef __GNUC__ -#define _Alignas(x) __attribute__((__aligned__(x))) -#elif defined(_MSC_VER) -#define _Alignas(x) __declspec(align(x)) -#endif -#endif - -#ifdef _MSC_VER -#define __builtin_unreachable() __assume(false) -#elif !((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 405 || \ - defined(__clang__) || defined(__INTEL_COMPILER) || \ - __has_builtin(__builtin_unreachable)) -#define __builtin_unreachable() \ - for (;;) { \ - } -#endif - -#if (!defined(__llvm__) && !__has_builtin(__builtin_assume)) -#define __builtin_assume(x) \ - do { \ - if (!(x)) \ - __builtin_unreachable(); \ - } while (0) -#endif - -#if __STDC_VERSION__ + 0 < 201112 -#define _Atomic(t) volatile t -#endif - #ifdef __llvm__ #define __gnu_printf__ __printf__ #define __gnu_scanf__ __scanf__ @@ -325,16 +264,6 @@ typedef struct { #define hasatleast #endif -#if __STDC_VERSION__ + 0 < 199901L && !defined(restrict) -#if !defined(__cplusplus) && \ - ((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 301 || defined(_MSC_VER)) -#define restrict __restrict__ -#else -#define restrict -#define __restrict -#endif -#endif - #ifndef dontcallback #if (__has_attribute(__leaf__) || \ (!defined(__llvm__) && \ @@ -594,11 +523,8 @@ typedef struct { #ifdef __aarch64__ /* raise sigill (not sigtrap) like x86 does */ -#define __builtin_trap() \ - do { \ - __asm__("udf\t#0x666"); \ - __builtin_unreachable(); \ - } while (0) +#define __builtin_trap() \ + (({ __asm__("udf\t#0x666"); }), __builtin_unreachable()) #endif #endif /* _COSMO_SOURCE */ diff --git a/libc/runtime/zipos.internal.h b/libc/runtime/zipos.internal.h index 0891e16a7..1691cee5d 100644 --- a/libc/runtime/zipos.internal.h +++ b/libc/runtime/zipos.internal.h @@ -6,6 +6,12 @@ COSMOPOLITAN_C_START_ #define ZIPOS_SYNTHETIC_DIRECTORY 0 +#ifndef __cplusplus +#define _ZIPOS_ATOMIC(x) _Atomic(x) +#else +#define _ZIPOS_ATOMIC(x) x +#endif + struct stat; struct iovec; struct Zipos; @@ -21,8 +27,8 @@ struct ZiposHandle { size_t size; size_t mapsize; size_t cfile; - _Atomic(size_t) refs; - _Atomic(size_t) pos; + _ZIPOS_ATOMIC(size_t) refs; + _ZIPOS_ATOMIC(size_t) pos; uint8_t *mem; uint8_t data[]; }; diff --git a/libc/str/str.h b/libc/str/str.h index ed9553633..897fb2cfb 100644 --- a/libc/str/str.h +++ b/libc/str/str.h @@ -26,10 +26,10 @@ COSMOPOLITAN_C_START_ void *memset(void *, int, size_t) memcpyesque; void *memmove(void *, const void *, size_t) memcpyesque; -void *memcpy(void *restrict, const void *restrict, size_t) memcpyesque; -void *mempcpy(void *restrict, const void *restrict, size_t) memcpyesque; -char *hexpcpy(char *restrict, const void *restrict, size_t) memcpyesque; -void *memccpy(void *restrict, const void *restrict, int, size_t) memcpyesque; +void *memcpy(void *, const void *, size_t) memcpyesque; +void *mempcpy(void *, const void *, size_t) memcpyesque; +char *hexpcpy(char *, const void *, size_t) memcpyesque; +void *memccpy(void *, const void *, int, size_t) memcpyesque; void explicit_bzero(void *, size_t); int memcmp(const void *, const void *, size_t) strlenesque; diff --git a/libc/thread/semaphore.h b/libc/thread/semaphore.h index b5abf08ca..64119e2be 100644 --- a/libc/thread/semaphore.h +++ b/libc/thread/semaphore.h @@ -3,6 +3,12 @@ #include "libc/calls/struct/timespec.h" COSMOPOLITAN_C_START_ +#ifndef __cplusplus +#define _SEM_ATOMIC(x) _Atomic(x) +#else +#define _SEM_ATOMIC(x) x +#endif + #define SEM_FAILED ((sem_t *)0) #define SEM_MAGIC_NAMED 0xDEADBEEFu #define SEM_MAGIC_UNNAMED 0xFEEDABEEu @@ -11,9 +17,9 @@ COSMOPOLITAN_C_START_ typedef struct { union { struct { - _Atomic(int) sem_value; - _Atomic(int) sem_waiters; - _Atomic(int) sem_prefs; /* named only */ + _SEM_ATOMIC(int) sem_value; + _SEM_ATOMIC(int) sem_waiters; + _SEM_ATOMIC(int) sem_prefs; /* named only */ unsigned sem_magic; int64_t sem_dev; /* named only */ int64_t sem_ino; /* named only */ diff --git a/libc/thread/thread.h b/libc/thread/thread.h index 1c3ff8eff..2e4448f2e 100644 --- a/libc/thread/thread.h +++ b/libc/thread/thread.h @@ -48,6 +48,12 @@ COSMOPOLITAN_C_START_ #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE} +#ifndef __cplusplus +#define _PTHREAD_ATOMIC(x) _Atomic(x) +#else +#define _PTHREAD_ATOMIC(x) x +#endif + typedef uintptr_t pthread_t; typedef int pthread_id_np_t; typedef char pthread_condattr_t; @@ -57,20 +63,20 @@ typedef unsigned pthread_key_t; typedef void (*pthread_key_dtor)(void *); typedef struct pthread_once_s { - _Atomic(uint32_t) _lock; + _PTHREAD_ATOMIC(uint32_t) _lock; } pthread_once_t; typedef struct pthread_spinlock_s { - _Atomic(int) _lock; + _PTHREAD_ATOMIC(int) _lock; } pthread_spinlock_t; typedef struct pthread_mutex_s { uint32_t _nsync; union { int32_t _pid; - _Atomic(int32_t) _futex; + _PTHREAD_ATOMIC(int32_t) _futex; }; - _Atomic(uint64_t) _word; + _PTHREAD_ATOMIC(uint64_t) _word; } pthread_mutex_t; typedef struct pthread_mutexattr_s { @@ -85,8 +91,8 @@ typedef struct pthread_cond_s { char _pshared; }; }; - _Atomic(uint32_t) _sequence; - _Atomic(uint32_t) _waiters; + _PTHREAD_ATOMIC(uint32_t) _sequence; + _PTHREAD_ATOMIC(uint32_t) _waiters; } pthread_cond_t; typedef struct pthread_rwlock_s { @@ -97,8 +103,8 @@ typedef struct pthread_rwlock_s { typedef struct pthread_barrier_s { int _count; char _pshared; - _Atomic(int) _counter; - _Atomic(int) _waiters; + _PTHREAD_ATOMIC(int) _counter; + _PTHREAD_ATOMIC(int) _waiters; } pthread_barrier_t; typedef struct pthread_attr_s { diff --git a/test/libcxx/openmp_test.cc b/test/libcxx/openmp_test.cc new file mode 100644 index 000000000..11cb481ef --- /dev/null +++ b/test/libcxx/openmp_test.cc @@ -0,0 +1,476 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-โ”‚ +โ”‚ vi: set et ft=cpp ts=2 sts=2 sw=2 fenc=utf-8 :vi โ”‚ +โ•žโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ก +โ”‚ Copyright 2024 Justine Alexandra Roberts Tunney โ”‚ +โ”‚ โ”‚ +โ”‚ Permission to use, copy, modify, and/or distribute this software for โ”‚ +โ”‚ any purpose with or without fee is hereby granted, provided that the โ”‚ +โ”‚ above copyright notice and this permission notice appear in all copies. โ”‚ +โ”‚ โ”‚ +โ”‚ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL โ”‚ +โ”‚ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED โ”‚ +โ”‚ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE โ”‚ +โ”‚ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL โ”‚ +โ”‚ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR โ”‚ +โ”‚ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER โ”‚ +โ”‚ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR โ”‚ +โ”‚ PERFORMANCE OF THIS SOFTWARE. โ”‚ +โ•šโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€*/ +#include +#include +#include +#include +#include +#include +#include "libc/stdio/rand.h" + +#define PRECISION 2e-6 +#define LV1DCACHE 49152 +#define THRESHOLD 3000000 + +#if defined(__OPTIMIZE__) && !defined(__SANITIZE_ADDRESS__) +#define ITERATIONS 5 +#else +#define ITERATIONS 1 +#endif + +#define OPTIMIZED __attribute__((__optimize__("-O3,-ffast-math"))) +#define PORTABLE \ + __target_clones("arch=znver4," \ + "arch=znver3," \ + "arch=sapphirerapids," \ + "arch=alderlake," \ + "arch=rocketlake," \ + "arch=cooperlake," \ + "arch=tigerlake," \ + "arch=cascadelake," \ + "arch=skylake-avx512," \ + "arch=skylake," \ + "arch=znver1," \ + "arch=tremont," \ + "fma," \ + "avx") + +static bool is_self_testing; + +// mร—n โ†’ nร—m +template +void transpose(long m, long n, const TA *A, long lda, TB *B, long ldb) { +#pragma omp parallel for collapse(2) if (m * n > THRESHOLD) + for (long i = 0; i < m; ++i) + for (long j = 0; j < n; ++j) { + B[ldb * j + i] = A[lda * i + j]; + } +} + +// mร—k * kร—n โ†’ mร—n +// kร—m * kร—n โ†’ mร—n if aแต€ +// mร—k * nร—k โ†’ mร—n if bแต€ +// kร—m * nร—k โ†’ mร—n if aแต€ and bแต€ +template +void dgemm(bool aแต€, bool bแต€, long m, long n, long k, float ฮฑ, const TA *A, + long lda, const TB *B, long ldb, float ฮฒ, TC *C, long ldc) { +#pragma omp parallel for collapse(2) if (m * n * k > THRESHOLD) + for (long i = 0; i < m; ++i) + for (long j = 0; j < n; ++j) { + double sum = 0; + for (long l = 0; l < k; ++l) + sum = std::fma((aแต€ ? A[lda * l + i] : A[lda * i + l]) * ฮฑ, + (bแต€ ? B[ldb * j + l] : B[ldb * l + j]), sum); + C[ldc * i + j] = C[ldc * i + j] * ฮฒ + sum; + } +} + +template +struct Gemmlin { + public: + Gemmlin(bool aT, bool bT, float ฮฑ, const TA *A, long lda, const TB *B, + long ldb, float ฮฒ, TC *C, long ldc) + : aT(aT), + bT(bT), + ฮฑ(ฮฑ), + A(A), + lda(lda), + B(B), + ldb(ldb), + ฮฒ(ฮฒ), + C(C), + ldc(ldc) { + } + + void gemm(long m, long n, long k) { + if (!m || !n) + return; + for (long i = 0; i < m; ++i) + for (long j = 0; j < n; ++j) { + C[ldc * i + j] *= ฮฒ; + } + if (!k) + return; + cub = sqrt(LV1DCACHE) / sqrt(sizeof(T) * 3); + mnpack(0, m, 0, n, 0, k); + } + + private: + void mnpack(long m0, long m, // + long n0, long n, // + long k0, long k) { + long mc = rounddown(std::min(m - m0, cub), 4); + long mp = m0 + (m - m0) / mc * mc; + long nc = rounddown(std::min(n - n0, cub), 4); + long np = n0 + (n - n0) / nc * nc; + long kc = rounddown(std::min(k - k0, cub), 4); + long kp = k0 + (k - k0) / kc * kc; + kpack(m0, mc, mp, n0, nc, np, k0, kc, k, kp); + if (m - mp) + mnpack(mp, m, n0, np, k0, k); + if (n - np) + mnpack(m0, mp, np, n, k0, k); + if (m - mp && n - np) + mnpack(mp, m, np, n, k0, k); + } + + void kpack(long m0, long mc, long m, // + long n0, long nc, long n, // + long k0, long kc, long k, // + long kp) { + rpack(m0, mc, m, n0, nc, n, k0, kc, kp); + if (k - kp) + rpack(m0, mc, m, n0, nc, n, kp, k - kp, k); + } + + void rpack(long m0, long mc, long m, // + long n0, long nc, long n, // + long k0, long kc, long k) { + if (!(mc % 4) && !(nc % 4)) + bgemm<4, 4>(m0, mc, m, n0, nc, n, k0, kc, k); + else + bgemm<1, 1>(m0, mc, m, n0, nc, n, k0, kc, k); + } + + template + void bgemm(long m0, long mc, long m, // + long n0, long nc, long n, // + long k0, long kc, long k) { + ops = (m - m0) * (n - n0) * (k - k0); + ml = (m - m0) / mc; + nl = (n - n0) / nc; + locks = new lock[ml * nl]; + there_will_be_blocks(m0, mc, m, n0, nc, n, k0, kc, k); + delete[] locks; + } + + template + void there_will_be_blocks(long m0, long mc, long m, long n0, long nc, long n, + long k0, long kc, long k) { +#pragma omp parallel for collapse(2) if (ops > THRESHOLD && mc * kc > 16) + for (long ic = m0; ic < m; ic += mc) + for (long pc = k0; pc < k; pc += kc) + gizmo(m0, mc, ic, n0, nc, k0, kc, pc, n); + } + + template + PORTABLE OPTIMIZED void gizmo(long m0, long mc, long ic, long n0, long nc, + long k0, long kc, long pc, long n) { + T Ac[mc / mr][kc][mr]; + for (long i = 0; i < mc; ++i) + for (long j = 0; j < kc; ++j) + Ac[i / mr][j][i % mr] = ฮฑ * (aT ? A[lda * (pc + j) + (ic + i)] + : A[lda * (ic + i) + (pc + j)]); + for (long jc = n0; jc < n; jc += nc) { + T Bc[nc / nr][nr][kc]; + for (long j = 0; j < nc; ++j) + for (long i = 0; i < kc; ++i) + Bc[j / nr][j % nr][i] = + bT ? B[ldb * (jc + j) + (pc + i)] : B[ldb * (pc + i) + (jc + j)]; + T Cc[nc / nr][mc / mr][nr][mr]; + memset(Cc, 0, nc * mc * sizeof(float)); + for (long jr = 0; jr < nc / nr; ++jr) + for (long ir = 0; ir < mc / mr; ++ir) + for (long pr = 0; pr < kc; ++pr) + for (long j = 0; j < nr; ++j) + for (long i = 0; i < mr; ++i) + Cc[jr][ir][j][i] += Ac[ir][pr][i] * Bc[jr][j][pr]; + const long lk = nl * ((ic - m0) / mc) + ((jc - n0) / nc); + locks[lk].acquire(); + for (long ir = 0; ir < mc; ir += mr) + for (long jr = 0; jr < nc; jr += nr) + for (long i = 0; i < mr; ++i) + for (long j = 0; j < nr; ++j) + C[ldc * (ic + ir + i) + (jc + jr + j)] += + Cc[jr / nr][ir / mr][j][i]; + locks[lk].release(); + } + } + + inline long rounddown(long x, long r) { + if (x < r) + return x; + else + return x & -r; + } + + class lock { + public: + lock() = default; + void acquire() { + while (lock_.exchange(true, std::memory_order_acquire)) { + } + } + void release() { + lock_.store(false, std::memory_order_release); + } + + private: + std::atomic_bool lock_ = false; + }; + + bool aT; + bool bT; + float ฮฑ; + const TA *A; + long lda; + const TB *B; + long ldb; + float ฮฒ; + TC *C; + long ldc; + long ops; + long nl; + long ml; + lock *locks; + long cub; +}; + +template +void sgemm(bool aT, bool bT, long m, long n, long k, float ฮฑ, const TA *A, + long lda, const TB *B, long ldb, float ฮฒ, TC *C, long ldc) { + Gemmlin g{aT, bT, ฮฑ, A, lda, B, ldb, ฮฒ, C, ldc}; + g.gemm(m, n, k); +} + +template +void show(FILE *f, long max, long m, long n, const TA *A, long lda, const TB *B, + long ldb) { + flockfile(f); + fprintf(f, " "); + for (long j = 0; j < n; ++j) { + fprintf(f, "%13ld", j); + } + fprintf(f, "\n"); + for (long i = 0; i < m; ++i) { + if (i == max) { + fprintf(f, "...\n"); + break; + } + fprintf(f, "%5ld ", i); + for (long j = 0; j < n; ++j) { + if (j == max) { + fprintf(f, " ..."); + break; + } + char ba[16], bb[16]; + sprintf(ba, "%13.7f", static_cast(A[lda * i + j])); + sprintf(bb, "%13.7f", static_cast(B[ldb * i + j])); + for (long k = 0; ba[k] && bb[k]; ++k) { + if (ba[k] != bb[k]) + fputs_unlocked("\33[31m", f); + fputc_unlocked(ba[k], f); + if (ba[k] != bb[k]) + fputs_unlocked("\33[0m", f); + } + } + fprintf(f, "\n"); + } + funlockfile(f); +} + +inline unsigned long GetDoubleBits(double f) { + union { + double f; + unsigned long i; + } u; + u.f = f; + return u.i; +} + +inline bool IsNan(double x) { + return (GetDoubleBits(x) & (-1ull >> 1)) > (0x7ffull << 52); +} + +template +double diff(long m, long n, const TA *Want, long lda, const TB *Got, long ldb) { + double s = 0; + int got_nans = 0; + int want_nans = 0; + for (long i = 0; i < m; ++i) + for (long j = 0; j < n; ++j) + if (IsNan(Want[ldb * i + j])) + ++want_nans; + else if (IsNan(Got[ldb * i + j])) + ++got_nans; + else + s += std::fabs(Want[lda * i + j] - Got[ldb * i + j]); + if (got_nans) + printf("WARNING: got %d NaNs!\n", got_nans); + if (want_nans) + printf("WARNING: want array has %d NaNs!\n", want_nans); + return s / (m * n); +} + +template +void show_error(FILE *f, long max, long m, long n, const TA *A, long lda, + const TB *B, long ldb, const char *file, int line, double sad, + double tol) { + fprintf(f, "%s:%d: sad %.17g exceeds %g\nwant\n", file, line, sad, tol); + show(f, max, m, n, A, lda, B, ldb); + fprintf(f, "got\n"); + show(f, max, m, n, B, ldb, A, lda); + fprintf(f, "\n"); +} + +template +void check(double tol, long m, long n, const TA *A, long lda, const TB *B, + long ldb, const char *file, int line) { + double sad = diff(m, n, A, lda, B, ldb); + if (sad <= tol) { + if (!is_self_testing) { + printf(" %g error\n", sad); + } + } else { + show_error(stderr, 16, m, n, A, lda, B, ldb, file, line, sad, tol); + const char *path = "/tmp/openmp_test.log"; + FILE *f = fopen(path, "w"); + if (f) { + show_error(f, 10000, m, n, A, lda, B, ldb, file, line, sad, tol); + printf("see also %s\n", path); + } + exit(1); + } +} + +#define check(tol, m, n, A, lda, B, ldb) \ + check(tol, m, n, A, lda, B, ldb, __FILE__, __LINE__) + +long micros(void) { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000; +} + +#define bench(x) \ + do { \ + long t1 = micros(); \ + for (long i = 0; i < ITERATIONS; ++i) { \ + asm volatile("" ::: "memory"); \ + x; \ + asm volatile("" ::: "memory"); \ + } \ + long t2 = micros(); \ + printf("%8" PRId64 " ยตs %s\n", (t2 - t1 + ITERATIONS - 1) / ITERATIONS, \ + #x); \ + } while (0) + +double real01(unsigned long x) { // (0,1) + return 1. / 4503599627370496. * ((x >> 12) + .5); +} + +double numba(void) { // (-1,1) + return real01(lemur64()) * 2 - 1; +} + +template +void fill(T *A, long n) { + for (long i = 0; i < n; ++i) { + A[i] = numba(); + } +} + +void test_gemm(long m, long n, long k) { + float *A = new float[m * k]; + float *At = new float[k * m]; + float *B = new float[k * n]; + float *Bt = new float[n * k]; + float *C = new float[m * n]; + float *GOLD = new float[m * n]; + float ฮฑ = 1; + float ฮฒ = 0; + fill(A, m * k); + fill(B, k * n); + dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n); + transpose(m, k, A, k, At, m); + transpose(k, n, B, n, Bt, k); + sgemm(0, 0, m, n, k, ฮฑ, A, k, B, n, ฮฒ, C, n); + check(PRECISION, m, n, GOLD, n, C, n); + sgemm(1, 0, m, n, k, ฮฑ, At, m, B, n, ฮฒ, C, n); + check(PRECISION, m, n, GOLD, n, C, n); + sgemm(0, 1, m, n, k, ฮฑ, A, k, Bt, k, ฮฒ, C, n); + check(PRECISION, m, n, GOLD, n, C, n); + sgemm(1, 1, m, n, k, ฮฑ, At, m, Bt, k, ฮฒ, C, n); + check(PRECISION, m, n, GOLD, n, C, n); + delete[] GOLD; + delete[] C; + delete[] Bt; + delete[] B; + delete[] At; + delete[] A; +} + +void check_gemm_works(void) { + static long kSizes[] = {1, 2, 3, 4, 5, 6, 7, 17, 31, 33, 63, 128, 129}; + is_self_testing = true; + long c = 0; + long N = sizeof(kSizes) / sizeof(kSizes[0]); + for (long i = 0; i < N; ++i) { + long m = kSizes[i]; + for (long j = 0; j < N; ++j) { + long n = kSizes[N - 1 - i]; + for (long k = 0; k < N; ++k) { + long K = kSizes[i]; + if (c++ % 13 == 0) { + printf("testing %2ld %2ld %2ld\r", m, n, K); + } + test_gemm(m, n, K); + } + } + } + printf("\r"); + is_self_testing = false; +} + +long m = 2333 / 3; +long k = 577 / 3; +long n = 713 / 3; + +void check_sgemm(void) { + float *A = new float[m * k]; + float *At = new float[k * m]; + float *B = new float[k * n]; + float *Bt = new float[n * k]; + float *C = new float[m * n]; + double *GOLD = new double[m * n]; + fill(A, m * k); + fill(B, k * n); + transpose(m, k, A, k, At, m); + transpose(k, n, B, n, Bt, k); + bench(dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n)); + bench(sgemm(0, 0, m, n, k, 1, A, k, B, n, 0, C, n)); + check(PRECISION, m, n, GOLD, n, C, n); + bench(sgemm(1, 0, m, n, k, 1, At, m, B, n, 0, C, n)); + check(PRECISION, m, n, GOLD, n, C, n); + bench(sgemm(0, 1, m, n, k, 1, A, k, Bt, k, 0, C, n)); + check(PRECISION, m, n, GOLD, n, C, n); + bench(sgemm(1, 1, m, n, k, 1, At, m, Bt, k, 0, C, n)); + check(PRECISION, m, n, GOLD, n, C, n); + delete[] GOLD; + delete[] C; + delete[] Bt; + delete[] B; + delete[] At; + delete[] A; +} + +int main(int argc, char *argv[]) { + check_gemm_works(); + check_sgemm(); +} diff --git a/third_party/nsync/futex.internal.h b/third_party/nsync/futex.internal.h index 1128479c5..c572a1595 100644 --- a/third_party/nsync/futex.internal.h +++ b/third_party/nsync/futex.internal.h @@ -4,8 +4,14 @@ #include "libc/dce.h" COSMOPOLITAN_C_START_ -int nsync_futex_wake_(_Atomic(int) *, int, char); -int nsync_futex_wait_(_Atomic(int) *, int, char, const struct timespec *); +#ifndef __cplusplus +#define _FUTEX_ATOMIC(x) _Atomic(x) +#else +#define _FUTEX_ATOMIC(x) x +#endif + +int nsync_futex_wake_(_FUTEX_ATOMIC(int) *, int, char); +int nsync_futex_wait_(_FUTEX_ATOMIC(int) *, int, char, const struct timespec *); COSMOPOLITAN_C_END_ #endif /* NSYNC_FUTEX_INTERNAL_H_ */ diff --git a/third_party/openmp/kmp_lock.cpp b/third_party/openmp/kmp_lock.cpp index 8f871319d..95e734536 100644 --- a/third_party/openmp/kmp_lock.cpp +++ b/third_party/openmp/kmp_lock.cpp @@ -380,9 +380,9 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) { long rc; #ifdef __COSMOPOLITAN__ - if ((rc = nsync_futex_wait_((_Atomic(int) *)&(lck->lk.poll), poll_val, false, NULL)) != 0) { + if ((rc = nsync_futex_wait_((int *)&(lck->lk.poll), poll_val, false, NULL)) != 0) { #else - if ((rc = syscall(__NR_futex, (_Atomic(int) *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, + if ((rc = syscall(__NR_futex, (int *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { #endif KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " @@ -462,7 +462,7 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n", lck, gtid)); #ifdef __COSMOPOLITAN__ - nsync_futex_wake_((_Atomic(int) *)&(lck->lk.poll), 1, false); + nsync_futex_wake_((int *)&(lck->lk.poll), 1, false); #else syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0);