Remove bad defines from early days of project

These definitions were causing issues with building LLVM. It is possible
they also caused crashes we've seen with our MacOS ARM64 OpenMP support.
This commit is contained in:
Justine Tunney 2024-07-24 12:11:21 -07:00
parent f25fbbaaeb
commit 5dd7ddb9ea
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
20 changed files with 568 additions and 140 deletions

View file

@ -9,14 +9,13 @@ int mulaw(int);
int unmulaw(int); int unmulaw(int);
void *double2byte(long, const void *, double, double) vallocesque; void *double2byte(long, const void *, double, double) vallocesque;
void *byte2double(long, const void *, double, double) vallocesque; void *byte2double(long, const void *, double, double) vallocesque;
void *dct(float[restrict hasatleast 8][8], unsigned, float, float, float, float, void *dct(float[hasatleast 8][8], unsigned, float, float, float, float, float);
float); void *dctjpeg(float[hasatleast 8][8], unsigned);
void *dctjpeg(float[restrict hasatleast 8][8], unsigned);
double det3(const double[3][3]) nosideeffect; double det3(const double[3][3]) nosideeffect;
void *inv3(double[restrict 3][3], const double[restrict 3][3], double); void *inv3(double[3][3], const double[3][3], double);
void *matmul3(double[restrict 3][3], const double[3][3], const double[3][3]); void *matmul3(double[3][3], const double[3][3], const double[3][3]);
void *vmatmul3(double[restrict 3], const double[3], const double[3][3]); void *vmatmul3(double[3], const double[3], const double[3][3]);
void *matvmul3(double[restrict 3], const double[3][3], const double[3]); void *matvmul3(double[3], const double[3][3], const double[3]);
double rgb2stdtv(double) pureconst; double rgb2stdtv(double) pureconst;
double rgb2lintv(double) pureconst; double rgb2lintv(double) pureconst;
double rgb2stdpc(double, double) pureconst; double rgb2stdpc(double, double) pureconst;

View file

@ -65,8 +65,8 @@
* *
* @cost ~100ns * @cost ~100ns
*/ */
void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0, void *dct(float M[hasatleast 8][8], unsigned stride, float c0, float c1,
float c1, float c2, float c3, float c4) { float c2, float c3, float c4) {
unsigned y, x; unsigned y, x;
for (y = 0; y < stride * 8; y += stride) { for (y = 0; y < stride * 8; y += stride) {
DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7], DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7],
@ -79,7 +79,7 @@ void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0,
return M; return M;
} }
void *dctjpeg(float M[restrict hasatleast 8][8], unsigned stride) { void *dctjpeg(float M[hasatleast 8][8], unsigned stride) {
return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f, return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f,
.707106781f); .707106781f);
} }

View file

@ -30,7 +30,7 @@
* @define 𝐀¹=𝐁 such that 𝐀×𝐁=𝐁×𝐀=𝐈 * @define 𝐀¹=𝐁 such that 𝐀×𝐁=𝐁×𝐀=𝐈
* @see det3() * @see det3()
*/ */
void *inv3(double B[restrict 3][3], const double A[restrict 3][3], double d) { void *inv3(double B[3][3], const double A[3][3], double d) {
d = d ? 1 / d : NAN; d = d ? 1 / d : NAN;
B[0][0] = (A[1][1] * A[2][2] - A[2][1] * A[1][2]) * d; B[0][0] = (A[1][1] * A[2][2] - A[2][1] * A[1][2]) * d;
B[0][1] = (A[2][1] * A[0][2] - A[0][1] * A[2][2]) * d; B[0][1] = (A[2][1] * A[0][2] - A[0][1] * A[2][2]) * d;

View file

@ -22,8 +22,7 @@
/** /**
* Multiplies 3×3 matrices. * Multiplies 3×3 matrices.
*/ */
void *matmul3(double R[restrict 3][3], const double A[3][3], void *matmul3(double R[3][3], const double A[3][3], const double B[3][3]) {
const double B[3][3]) {
int i, j, k; int i, j, k;
memset(R, 0, sizeof(double) * 3 * 3); memset(R, 0, sizeof(double) * 3 * 3);
for (i = 0; i < 3; ++i) { for (i = 0; i < 3; ++i) {

View file

@ -23,7 +23,7 @@
* *
* @see vmatmul3() for noncommutative corollary * @see vmatmul3() for noncommutative corollary
*/ */
void *matvmul3(double R[restrict 3], const double M[3][3], const double V[3]) { void *matvmul3(double R[3], const double M[3][3], const double V[3]) {
R[0] = V[0] * M[0][0] + V[1] * M[0][1] + V[2] * M[0][2]; R[0] = V[0] * M[0][0] + V[1] * M[0][1] + V[2] * M[0][2];
R[1] = V[0] * M[1][0] + V[1] * M[1][1] + V[2] * M[1][2]; R[1] = V[0] * M[1][0] + V[1] * M[1][1] + V[2] * M[1][2];
R[2] = V[0] * M[2][0] + V[1] * M[2][1] + V[2] * M[2][2]; R[2] = V[0] * M[2][0] + V[1] * M[2][1] + V[2] * M[2][2];

View file

@ -23,7 +23,7 @@
* *
* @see matvmul3() for noncommutative corollary * @see matvmul3() for noncommutative corollary
*/ */
void *vmatmul3(double R[restrict 3], const double V[3], const double M[3][3]) { void *vmatmul3(double R[3], const double V[3], const double M[3][3]) {
R[0] = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0]; R[0] = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0];
R[1] = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1]; R[1] = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1];
R[2] = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2]; R[2] = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2];

View file

@ -31,9 +31,8 @@
#include "dsp/mpeg/video.h" #include "dsp/mpeg/video.h"
#include "libc/log/check.h" #include "libc/log/check.h"
forceinline void plm_video_process_macroblock(plm_video_t *self, forceinline void plm_video_process_macroblock(plm_video_t *self, uint8_t *d,
uint8_t *restrict d, uint8_t *s, int motion_h,
uint8_t *restrict s, int motion_h,
int motion_v, bool interpolate, int motion_v, bool interpolate,
unsigned BW) { unsigned BW) {
unsigned si, di, max_address; unsigned si, di, max_address;
@ -155,17 +154,17 @@ forceinline void plm_video_process_macroblock(plm_video_t *self,
} }
} }
void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *restrict d, void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *d, uint8_t *s,
uint8_t *restrict s, int motion_h, int motion_h, int motion_v,
int motion_v, bool interpolate) { bool interpolate) {
DCHECK_ALIGNED(8, d); DCHECK_ALIGNED(8, d);
DCHECK_ALIGNED(8, s); DCHECK_ALIGNED(8, s);
plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8); plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8);
} }
void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *restrict d, void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *d, uint8_t *s,
uint8_t *restrict s, int motion_h, int motion_h, int motion_v,
int motion_v, bool interpolate) { bool interpolate) {
DCHECK_ALIGNED(16, d); DCHECK_ALIGNED(16, d);
DCHECK_ALIGNED(16, s); DCHECK_ALIGNED(16, s);
plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16); plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16);

View file

@ -51,10 +51,10 @@ typedef struct plm_video_t {
int assume_no_b_frames; int assume_no_b_frames;
} plm_video_t; } plm_video_t;
void plm_video_process_macroblock_8(plm_video_t *, uint8_t *restrict, void plm_video_process_macroblock_8(plm_video_t *, uint8_t *, uint8_t *, int,
uint8_t *restrict, int, int, bool); int, bool);
void plm_video_process_macroblock_16(plm_video_t *, uint8_t *restrict, void plm_video_process_macroblock_16(plm_video_t *, uint8_t *, uint8_t *, int,
uint8_t *restrict, int, int, bool); int, bool);
COSMOPOLITAN_C_END_ COSMOPOLITAN_C_END_
#endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */ #endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */

View file

@ -53,11 +53,11 @@ struct SamplingSolution {
static double ComputeWeight(double x) { static double ComputeWeight(double x) {
if (-1.5 < x && x < 1.5) { if (-1.5 < x && x < 1.5) {
if (-.5 < x && x < .5) { if (-.5 < x && x < .5) {
return .75 - SQR(x); return.75 - SQR(x);
} else if (x < 0) { } else if (x < 0) {
return .5 * SQR(x + 1.5); return.5 * SQR(x + 1.5);
} else { } else {
return .5 * SQR(x - 1.5); return.5 * SQR(x - 1.5);
} }
} else { } else {
return 0; return 0;
@ -149,12 +149,11 @@ static int Sharpen(int ax, int bx, int cx) {
static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw, static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw,
long sxw, const int src[syw][sxw], long dyn, long dxn, long sxw, const int src[syw][sxw], long dyn, long dxn,
long syn, long sxn, int tmp0[restrict dyn][sxn], long syn, long sxn, int tmp0[dyn][sxn],
int tmp1[restrict dyn][sxn], int tmp1[dyn][sxn], int tmp2[dyn][dxn], long yfn,
int tmp2[restrict dyn][dxn], long yfn, long xfn, long xfn, const short fyi[dyn][yfn],
const short fyi[dyn][yfn], const short fyw[dyn][yfn], const short fyw[dyn][yfn], const short fxi[dxn][xfn],
const short fxi[dxn][xfn], const short fxw[dxn][xfn], const short fxw[dxn][xfn], bool sharpen) {
bool sharpen) {
long i; long i;
int eax, dy, dx, sx; int eax, dy, dx, sx;
for (sx = 0; sx < sxn; ++sx) { for (sx = 0; sx < sxn; ++sx) {

View file

@ -106,9 +106,9 @@ void *Magkern2xY(long ys, long xs, unsigned char p[ys][xs], long yn, long xn) {
return p; return p;
} }
void *MagikarpY(long dys, long dxs, unsigned char d[restrict dys][dxs], void *MagikarpY(long dys, long dxs, unsigned char d[dys][dxs], long sys,
long sys, long sxs, const unsigned char s[sys][sxs], long yn, long sxs, const unsigned char s[sys][sxs], long yn, long xn,
long xn, const signed char K[8]) { const signed char K[8]) {
long y, x; long y, x;
for (y = 0; y < yn; ++y) { for (y = 0; y < yn; ++y) {
for (x = 0; x < xn; ++x) { for (x = 0; x < xn; ++x) {

View file

@ -135,7 +135,7 @@ void sgemm(int m, int n, int k, //
const float *B, int ldb, // const float *B, int ldb, //
float *C, int ldc) { float *C, int ldc) {
int nth = sysconf(_SC_NPROCESSORS_ONLN); int nth = sysconf(_SC_NPROCESSORS_ONLN);
#pragma omp parallel for #pragma omp parallel for schedule(static)
for (int ith = 0; ith < nth; ++ith) { for (int ith = 0; ith < nth; ++ith) {
ansiBLAS tb{k, A, lda, B, ldb, C, ldc, ith, nth}; ansiBLAS tb{k, A, lda, B, ldb, C, ldc, ith, nth};
tb.matmul(m, n); tb.matmul(m, n);

View file

@ -2,7 +2,13 @@
#define COSMOPOLITAN_LIBC_COSMO_H_ #define COSMOPOLITAN_LIBC_COSMO_H_
COSMOPOLITAN_C_START_ COSMOPOLITAN_C_START_
errno_t cosmo_once(_Atomic(unsigned) *, void (*)(void)) libcesque; #ifndef __cplusplus
#define _COSMO_ATOMIC(x) _Atomic(x)
#else
#define _COSMO_ATOMIC(x) x
#endif
errno_t cosmo_once(_COSMO_ATOMIC(unsigned) *, void (*)(void)) libcesque;
int systemvpe(const char *, char *const[], char *const[]) libcesque; int systemvpe(const char *, char *const[], char *const[]) libcesque;
char *GetProgramExecutableName(void) libcesque; char *GetProgramExecutableName(void) libcesque;
void unleaf(void) libcesque; void unleaf(void) libcesque;

View file

@ -1,7 +1,3 @@
#if __GNUC__ + 0 < 2
#define __attribute__(x)
#endif
#ifndef __cplusplus #ifndef __cplusplus
#define COSMOPOLITAN_C_START_ #define COSMOPOLITAN_C_START_
#define COSMOPOLITAN_C_END_ #define COSMOPOLITAN_C_END_
@ -10,73 +6,16 @@
#define COSMOPOLITAN_CXX_USING_ #define COSMOPOLITAN_CXX_USING_
#endif #endif
#ifndef __ia16__
#define __far
#endif
#if !defined(__GNUC__) && __cplusplus + 0 >= 201103L #if !defined(__GNUC__) && __cplusplus + 0 >= 201103L
#define typeof(x) decltype(x) #define typeof(x) decltype(x)
#elif !defined(__GNUC__) && __STDC_VERSION__ + 0 < 201112 #elif !defined(__GNUC__) && __STDC_VERSION__ + 0 < 201112
#define typeof(x) __typeof(x) #define typeof(x) __typeof(x)
#endif #endif
#ifdef __cplusplus
#if __cplusplus >= 201103L
#define _Alignof(x) alignof(x)
#endif /* C++11 */
#else /* __cplusplus */
#if __STDC_VERSION__ + 0 < 201112
#if __GNUC__ + _MSC_VER + 0
#define _Alignof(x) __alignof(x)
#else
#define _Alignof(x) /* basically all it ever did lool */ sizeof(x)
#endif /* GNU/MSVC/!ANSI */
#endif /* C11 */
#endif /* __cplusplus */
#if !defined(__cplusplus) && !defined(inline) && __STDC_VERSION__ + 0 < 199901
#if defined(__GNUC__) || defined(_MSC_VER)
#define inline __inline
#else
#define inline
#define __inline
#endif
#endif
#ifdef __chibicc__ #ifdef __chibicc__
#define __extension__ #define __extension__
#endif #endif
#if __STDC_VERSION__ + 0 < 201112
#ifdef __GNUC__
#define _Alignas(x) __attribute__((__aligned__(x)))
#elif defined(_MSC_VER)
#define _Alignas(x) __declspec(align(x))
#endif
#endif
#ifdef _MSC_VER
#define __builtin_unreachable() __assume(false)
#elif !((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 405 || \
defined(__clang__) || defined(__INTEL_COMPILER) || \
__has_builtin(__builtin_unreachable))
#define __builtin_unreachable() \
for (;;) { \
}
#endif
#if (!defined(__llvm__) && !__has_builtin(__builtin_assume))
#define __builtin_assume(x) \
do { \
if (!(x)) \
__builtin_unreachable(); \
} while (0)
#endif
#if __STDC_VERSION__ + 0 < 201112
#define _Atomic(t) volatile t
#endif
#ifdef __llvm__ #ifdef __llvm__
#define __gnu_printf__ __printf__ #define __gnu_printf__ __printf__
#define __gnu_scanf__ __scanf__ #define __gnu_scanf__ __scanf__
@ -325,16 +264,6 @@ typedef struct {
#define hasatleast #define hasatleast
#endif #endif
#if __STDC_VERSION__ + 0 < 199901L && !defined(restrict)
#if !defined(__cplusplus) && \
((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 301 || defined(_MSC_VER))
#define restrict __restrict__
#else
#define restrict
#define __restrict
#endif
#endif
#ifndef dontcallback #ifndef dontcallback
#if (__has_attribute(__leaf__) || \ #if (__has_attribute(__leaf__) || \
(!defined(__llvm__) && \ (!defined(__llvm__) && \
@ -594,11 +523,8 @@ typedef struct {
#ifdef __aarch64__ #ifdef __aarch64__
/* raise sigill (not sigtrap) like x86 does */ /* raise sigill (not sigtrap) like x86 does */
#define __builtin_trap() \ #define __builtin_trap() \
do { \ (({ __asm__("udf\t#0x666"); }), __builtin_unreachable())
__asm__("udf\t#0x666"); \
__builtin_unreachable(); \
} while (0)
#endif #endif
#endif /* _COSMO_SOURCE */ #endif /* _COSMO_SOURCE */

View file

@ -6,6 +6,12 @@ COSMOPOLITAN_C_START_
#define ZIPOS_SYNTHETIC_DIRECTORY 0 #define ZIPOS_SYNTHETIC_DIRECTORY 0
#ifndef __cplusplus
#define _ZIPOS_ATOMIC(x) _Atomic(x)
#else
#define _ZIPOS_ATOMIC(x) x
#endif
struct stat; struct stat;
struct iovec; struct iovec;
struct Zipos; struct Zipos;
@ -21,8 +27,8 @@ struct ZiposHandle {
size_t size; size_t size;
size_t mapsize; size_t mapsize;
size_t cfile; size_t cfile;
_Atomic(size_t) refs; _ZIPOS_ATOMIC(size_t) refs;
_Atomic(size_t) pos; _ZIPOS_ATOMIC(size_t) pos;
uint8_t *mem; uint8_t *mem;
uint8_t data[]; uint8_t data[];
}; };

View file

@ -26,10 +26,10 @@ COSMOPOLITAN_C_START_
void *memset(void *, int, size_t) memcpyesque; void *memset(void *, int, size_t) memcpyesque;
void *memmove(void *, const void *, size_t) memcpyesque; void *memmove(void *, const void *, size_t) memcpyesque;
void *memcpy(void *restrict, const void *restrict, size_t) memcpyesque; void *memcpy(void *, const void *, size_t) memcpyesque;
void *mempcpy(void *restrict, const void *restrict, size_t) memcpyesque; void *mempcpy(void *, const void *, size_t) memcpyesque;
char *hexpcpy(char *restrict, const void *restrict, size_t) memcpyesque; char *hexpcpy(char *, const void *, size_t) memcpyesque;
void *memccpy(void *restrict, const void *restrict, int, size_t) memcpyesque; void *memccpy(void *, const void *, int, size_t) memcpyesque;
void explicit_bzero(void *, size_t); void explicit_bzero(void *, size_t);
int memcmp(const void *, const void *, size_t) strlenesque; int memcmp(const void *, const void *, size_t) strlenesque;

View file

@ -3,6 +3,12 @@
#include "libc/calls/struct/timespec.h" #include "libc/calls/struct/timespec.h"
COSMOPOLITAN_C_START_ COSMOPOLITAN_C_START_
#ifndef __cplusplus
#define _SEM_ATOMIC(x) _Atomic(x)
#else
#define _SEM_ATOMIC(x) x
#endif
#define SEM_FAILED ((sem_t *)0) #define SEM_FAILED ((sem_t *)0)
#define SEM_MAGIC_NAMED 0xDEADBEEFu #define SEM_MAGIC_NAMED 0xDEADBEEFu
#define SEM_MAGIC_UNNAMED 0xFEEDABEEu #define SEM_MAGIC_UNNAMED 0xFEEDABEEu
@ -11,9 +17,9 @@ COSMOPOLITAN_C_START_
typedef struct { typedef struct {
union { union {
struct { struct {
_Atomic(int) sem_value; _SEM_ATOMIC(int) sem_value;
_Atomic(int) sem_waiters; _SEM_ATOMIC(int) sem_waiters;
_Atomic(int) sem_prefs; /* named only */ _SEM_ATOMIC(int) sem_prefs; /* named only */
unsigned sem_magic; unsigned sem_magic;
int64_t sem_dev; /* named only */ int64_t sem_dev; /* named only */
int64_t sem_ino; /* named only */ int64_t sem_ino; /* named only */

View file

@ -48,6 +48,12 @@ COSMOPOLITAN_C_START_
#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE} #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
#ifndef __cplusplus
#define _PTHREAD_ATOMIC(x) _Atomic(x)
#else
#define _PTHREAD_ATOMIC(x) x
#endif
typedef uintptr_t pthread_t; typedef uintptr_t pthread_t;
typedef int pthread_id_np_t; typedef int pthread_id_np_t;
typedef char pthread_condattr_t; typedef char pthread_condattr_t;
@ -57,20 +63,20 @@ typedef unsigned pthread_key_t;
typedef void (*pthread_key_dtor)(void *); typedef void (*pthread_key_dtor)(void *);
typedef struct pthread_once_s { typedef struct pthread_once_s {
_Atomic(uint32_t) _lock; _PTHREAD_ATOMIC(uint32_t) _lock;
} pthread_once_t; } pthread_once_t;
typedef struct pthread_spinlock_s { typedef struct pthread_spinlock_s {
_Atomic(int) _lock; _PTHREAD_ATOMIC(int) _lock;
} pthread_spinlock_t; } pthread_spinlock_t;
typedef struct pthread_mutex_s { typedef struct pthread_mutex_s {
uint32_t _nsync; uint32_t _nsync;
union { union {
int32_t _pid; int32_t _pid;
_Atomic(int32_t) _futex; _PTHREAD_ATOMIC(int32_t) _futex;
}; };
_Atomic(uint64_t) _word; _PTHREAD_ATOMIC(uint64_t) _word;
} pthread_mutex_t; } pthread_mutex_t;
typedef struct pthread_mutexattr_s { typedef struct pthread_mutexattr_s {
@ -85,8 +91,8 @@ typedef struct pthread_cond_s {
char _pshared; char _pshared;
}; };
}; };
_Atomic(uint32_t) _sequence; _PTHREAD_ATOMIC(uint32_t) _sequence;
_Atomic(uint32_t) _waiters; _PTHREAD_ATOMIC(uint32_t) _waiters;
} pthread_cond_t; } pthread_cond_t;
typedef struct pthread_rwlock_s { typedef struct pthread_rwlock_s {
@ -97,8 +103,8 @@ typedef struct pthread_rwlock_s {
typedef struct pthread_barrier_s { typedef struct pthread_barrier_s {
int _count; int _count;
char _pshared; char _pshared;
_Atomic(int) _counter; _PTHREAD_ATOMIC(int) _counter;
_Atomic(int) _waiters; _PTHREAD_ATOMIC(int) _waiters;
} pthread_barrier_t; } pthread_barrier_t;
typedef struct pthread_attr_s { typedef struct pthread_attr_s {

476
test/libcxx/openmp_test.cc Normal file
View file

@ -0,0 +1,476 @@
/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
vi: set et ft=cpp ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include "libc/stdio/rand.h"
#define PRECISION 2e-6
#define LV1DCACHE 49152
#define THRESHOLD 3000000
#if defined(__OPTIMIZE__) && !defined(__SANITIZE_ADDRESS__)
#define ITERATIONS 5
#else
#define ITERATIONS 1
#endif
#define OPTIMIZED __attribute__((__optimize__("-O3,-ffast-math")))
#define PORTABLE \
__target_clones("arch=znver4," \
"arch=znver3," \
"arch=sapphirerapids," \
"arch=alderlake," \
"arch=rocketlake," \
"arch=cooperlake," \
"arch=tigerlake," \
"arch=cascadelake," \
"arch=skylake-avx512," \
"arch=skylake," \
"arch=znver1," \
"arch=tremont," \
"fma," \
"avx")
static bool is_self_testing;
// m×n → n×m
template <typename TA, typename TB>
void transpose(long m, long n, const TA *A, long lda, TB *B, long ldb) {
#pragma omp parallel for collapse(2) if (m * n > THRESHOLD)
for (long i = 0; i < m; ++i)
for (long j = 0; j < n; ++j) {
B[ldb * j + i] = A[lda * i + j];
}
}
// m×k * k×n → m×n
// k×m * k×n → m×n if aᵀ
// m×k * n×k → m×n if bᵀ
// k×m * n×k → m×n if aᵀ and bᵀ
template <typename TC, typename TA, typename TB>
void dgemm(bool aᵀ, bool bᵀ, long m, long n, long k, float α, const TA *A,
long lda, const TB *B, long ldb, float β, TC *C, long ldc) {
#pragma omp parallel for collapse(2) if (m * n * k > THRESHOLD)
for (long i = 0; i < m; ++i)
for (long j = 0; j < n; ++j) {
double sum = 0;
for (long l = 0; l < k; ++l)
sum = std::fma((aᵀ ? A[lda * l + i] : A[lda * i + l]) * α,
(bᵀ ? B[ldb * j + l] : B[ldb * l + j]), sum);
C[ldc * i + j] = C[ldc * i + j] * β + sum;
}
}
template <typename T, typename TC, typename TA, typename TB>
struct Gemmlin {
public:
Gemmlin(bool aT, bool bT, float α, const TA *A, long lda, const TB *B,
long ldb, float β, TC *C, long ldc)
: aT(aT),
bT(bT),
α(α),
A(A),
lda(lda),
B(B),
ldb(ldb),
β(β),
C(C),
ldc(ldc) {
}
void gemm(long m, long n, long k) {
if (!m || !n)
return;
for (long i = 0; i < m; ++i)
for (long j = 0; j < n; ++j) {
C[ldc * i + j] *= β;
}
if (!k)
return;
cub = sqrt(LV1DCACHE) / sqrt(sizeof(T) * 3);
mnpack(0, m, 0, n, 0, k);
}
private:
void mnpack(long m0, long m, //
long n0, long n, //
long k0, long k) {
long mc = rounddown(std::min(m - m0, cub), 4);
long mp = m0 + (m - m0) / mc * mc;
long nc = rounddown(std::min(n - n0, cub), 4);
long np = n0 + (n - n0) / nc * nc;
long kc = rounddown(std::min(k - k0, cub), 4);
long kp = k0 + (k - k0) / kc * kc;
kpack(m0, mc, mp, n0, nc, np, k0, kc, k, kp);
if (m - mp)
mnpack(mp, m, n0, np, k0, k);
if (n - np)
mnpack(m0, mp, np, n, k0, k);
if (m - mp && n - np)
mnpack(mp, m, np, n, k0, k);
}
void kpack(long m0, long mc, long m, //
long n0, long nc, long n, //
long k0, long kc, long k, //
long kp) {
rpack(m0, mc, m, n0, nc, n, k0, kc, kp);
if (k - kp)
rpack(m0, mc, m, n0, nc, n, kp, k - kp, k);
}
void rpack(long m0, long mc, long m, //
long n0, long nc, long n, //
long k0, long kc, long k) {
if (!(mc % 4) && !(nc % 4))
bgemm<4, 4>(m0, mc, m, n0, nc, n, k0, kc, k);
else
bgemm<1, 1>(m0, mc, m, n0, nc, n, k0, kc, k);
}
template <int mr, int nr>
void bgemm(long m0, long mc, long m, //
long n0, long nc, long n, //
long k0, long kc, long k) {
ops = (m - m0) * (n - n0) * (k - k0);
ml = (m - m0) / mc;
nl = (n - n0) / nc;
locks = new lock[ml * nl];
there_will_be_blocks<mr, nr>(m0, mc, m, n0, nc, n, k0, kc, k);
delete[] locks;
}
template <int mr, int nr>
void there_will_be_blocks(long m0, long mc, long m, long n0, long nc, long n,
long k0, long kc, long k) {
#pragma omp parallel for collapse(2) if (ops > THRESHOLD && mc * kc > 16)
for (long ic = m0; ic < m; ic += mc)
for (long pc = k0; pc < k; pc += kc)
gizmo<mr, nr>(m0, mc, ic, n0, nc, k0, kc, pc, n);
}
template <int mr, int nr>
PORTABLE OPTIMIZED void gizmo(long m0, long mc, long ic, long n0, long nc,
long k0, long kc, long pc, long n) {
T Ac[mc / mr][kc][mr];
for (long i = 0; i < mc; ++i)
for (long j = 0; j < kc; ++j)
Ac[i / mr][j][i % mr] = α * (aT ? A[lda * (pc + j) + (ic + i)]
: A[lda * (ic + i) + (pc + j)]);
for (long jc = n0; jc < n; jc += nc) {
T Bc[nc / nr][nr][kc];
for (long j = 0; j < nc; ++j)
for (long i = 0; i < kc; ++i)
Bc[j / nr][j % nr][i] =
bT ? B[ldb * (jc + j) + (pc + i)] : B[ldb * (pc + i) + (jc + j)];
T Cc[nc / nr][mc / mr][nr][mr];
memset(Cc, 0, nc * mc * sizeof(float));
for (long jr = 0; jr < nc / nr; ++jr)
for (long ir = 0; ir < mc / mr; ++ir)
for (long pr = 0; pr < kc; ++pr)
for (long j = 0; j < nr; ++j)
for (long i = 0; i < mr; ++i)
Cc[jr][ir][j][i] += Ac[ir][pr][i] * Bc[jr][j][pr];
const long lk = nl * ((ic - m0) / mc) + ((jc - n0) / nc);
locks[lk].acquire();
for (long ir = 0; ir < mc; ir += mr)
for (long jr = 0; jr < nc; jr += nr)
for (long i = 0; i < mr; ++i)
for (long j = 0; j < nr; ++j)
C[ldc * (ic + ir + i) + (jc + jr + j)] +=
Cc[jr / nr][ir / mr][j][i];
locks[lk].release();
}
}
inline long rounddown(long x, long r) {
if (x < r)
return x;
else
return x & -r;
}
class lock {
public:
lock() = default;
void acquire() {
while (lock_.exchange(true, std::memory_order_acquire)) {
}
}
void release() {
lock_.store(false, std::memory_order_release);
}
private:
std::atomic_bool lock_ = false;
};
bool aT;
bool bT;
float α;
const TA *A;
long lda;
const TB *B;
long ldb;
float β;
TC *C;
long ldc;
long ops;
long nl;
long ml;
lock *locks;
long cub;
};
template <typename TC, typename TA, typename TB>
void sgemm(bool aT, bool bT, long m, long n, long k, float α, const TA *A,
long lda, const TB *B, long ldb, float β, TC *C, long ldc) {
Gemmlin<float, TC, TA, TB> g{aT, bT, α, A, lda, B, ldb, β, C, ldc};
g.gemm(m, n, k);
}
template <typename TA, typename TB>
void show(FILE *f, long max, long m, long n, const TA *A, long lda, const TB *B,
long ldb) {
flockfile(f);
fprintf(f, " ");
for (long j = 0; j < n; ++j) {
fprintf(f, "%13ld", j);
}
fprintf(f, "\n");
for (long i = 0; i < m; ++i) {
if (i == max) {
fprintf(f, "...\n");
break;
}
fprintf(f, "%5ld ", i);
for (long j = 0; j < n; ++j) {
if (j == max) {
fprintf(f, " ...");
break;
}
char ba[16], bb[16];
sprintf(ba, "%13.7f", static_cast<double>(A[lda * i + j]));
sprintf(bb, "%13.7f", static_cast<double>(B[ldb * i + j]));
for (long k = 0; ba[k] && bb[k]; ++k) {
if (ba[k] != bb[k])
fputs_unlocked("\33[31m", f);
fputc_unlocked(ba[k], f);
if (ba[k] != bb[k])
fputs_unlocked("\33[0m", f);
}
}
fprintf(f, "\n");
}
funlockfile(f);
}
inline unsigned long GetDoubleBits(double f) {
union {
double f;
unsigned long i;
} u;
u.f = f;
return u.i;
}
inline bool IsNan(double x) {
return (GetDoubleBits(x) & (-1ull >> 1)) > (0x7ffull << 52);
}
template <typename TA, typename TB>
double diff(long m, long n, const TA *Want, long lda, const TB *Got, long ldb) {
double s = 0;
int got_nans = 0;
int want_nans = 0;
for (long i = 0; i < m; ++i)
for (long j = 0; j < n; ++j)
if (IsNan(Want[ldb * i + j]))
++want_nans;
else if (IsNan(Got[ldb * i + j]))
++got_nans;
else
s += std::fabs(Want[lda * i + j] - Got[ldb * i + j]);
if (got_nans)
printf("WARNING: got %d NaNs!\n", got_nans);
if (want_nans)
printf("WARNING: want array has %d NaNs!\n", want_nans);
return s / (m * n);
}
template <typename TA, typename TB>
void show_error(FILE *f, long max, long m, long n, const TA *A, long lda,
const TB *B, long ldb, const char *file, int line, double sad,
double tol) {
fprintf(f, "%s:%d: sad %.17g exceeds %g\nwant\n", file, line, sad, tol);
show(f, max, m, n, A, lda, B, ldb);
fprintf(f, "got\n");
show(f, max, m, n, B, ldb, A, lda);
fprintf(f, "\n");
}
template <typename TA, typename TB>
void check(double tol, long m, long n, const TA *A, long lda, const TB *B,
long ldb, const char *file, int line) {
double sad = diff(m, n, A, lda, B, ldb);
if (sad <= tol) {
if (!is_self_testing) {
printf(" %g error\n", sad);
}
} else {
show_error(stderr, 16, m, n, A, lda, B, ldb, file, line, sad, tol);
const char *path = "/tmp/openmp_test.log";
FILE *f = fopen(path, "w");
if (f) {
show_error(f, 10000, m, n, A, lda, B, ldb, file, line, sad, tol);
printf("see also %s\n", path);
}
exit(1);
}
}
#define check(tol, m, n, A, lda, B, ldb) \
check(tol, m, n, A, lda, B, ldb, __FILE__, __LINE__)
long micros(void) {
struct timespec ts;
clock_gettime(CLOCK_REALTIME, &ts);
return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000;
}
#define bench(x) \
do { \
long t1 = micros(); \
for (long i = 0; i < ITERATIONS; ++i) { \
asm volatile("" ::: "memory"); \
x; \
asm volatile("" ::: "memory"); \
} \
long t2 = micros(); \
printf("%8" PRId64 " µs %s\n", (t2 - t1 + ITERATIONS - 1) / ITERATIONS, \
#x); \
} while (0)
double real01(unsigned long x) { // (0,1)
return 1. / 4503599627370496. * ((x >> 12) + .5);
}
double numba(void) { // (-1,1)
return real01(lemur64()) * 2 - 1;
}
template <typename T>
void fill(T *A, long n) {
for (long i = 0; i < n; ++i) {
A[i] = numba();
}
}
void test_gemm(long m, long n, long k) {
float *A = new float[m * k];
float *At = new float[k * m];
float *B = new float[k * n];
float *Bt = new float[n * k];
float *C = new float[m * n];
float *GOLD = new float[m * n];
float α = 1;
float β = 0;
fill(A, m * k);
fill(B, k * n);
dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n);
transpose(m, k, A, k, At, m);
transpose(k, n, B, n, Bt, k);
sgemm(0, 0, m, n, k, α, A, k, B, n, β, C, n);
check(PRECISION, m, n, GOLD, n, C, n);
sgemm(1, 0, m, n, k, α, At, m, B, n, β, C, n);
check(PRECISION, m, n, GOLD, n, C, n);
sgemm(0, 1, m, n, k, α, A, k, Bt, k, β, C, n);
check(PRECISION, m, n, GOLD, n, C, n);
sgemm(1, 1, m, n, k, α, At, m, Bt, k, β, C, n);
check(PRECISION, m, n, GOLD, n, C, n);
delete[] GOLD;
delete[] C;
delete[] Bt;
delete[] B;
delete[] At;
delete[] A;
}
void check_gemm_works(void) {
static long kSizes[] = {1, 2, 3, 4, 5, 6, 7, 17, 31, 33, 63, 128, 129};
is_self_testing = true;
long c = 0;
long N = sizeof(kSizes) / sizeof(kSizes[0]);
for (long i = 0; i < N; ++i) {
long m = kSizes[i];
for (long j = 0; j < N; ++j) {
long n = kSizes[N - 1 - i];
for (long k = 0; k < N; ++k) {
long K = kSizes[i];
if (c++ % 13 == 0) {
printf("testing %2ld %2ld %2ld\r", m, n, K);
}
test_gemm(m, n, K);
}
}
}
printf("\r");
is_self_testing = false;
}
long m = 2333 / 3;
long k = 577 / 3;
long n = 713 / 3;
void check_sgemm(void) {
float *A = new float[m * k];
float *At = new float[k * m];
float *B = new float[k * n];
float *Bt = new float[n * k];
float *C = new float[m * n];
double *GOLD = new double[m * n];
fill(A, m * k);
fill(B, k * n);
transpose(m, k, A, k, At, m);
transpose(k, n, B, n, Bt, k);
bench(dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n));
bench(sgemm(0, 0, m, n, k, 1, A, k, B, n, 0, C, n));
check(PRECISION, m, n, GOLD, n, C, n);
bench(sgemm(1, 0, m, n, k, 1, At, m, B, n, 0, C, n));
check(PRECISION, m, n, GOLD, n, C, n);
bench(sgemm(0, 1, m, n, k, 1, A, k, Bt, k, 0, C, n));
check(PRECISION, m, n, GOLD, n, C, n);
bench(sgemm(1, 1, m, n, k, 1, At, m, Bt, k, 0, C, n));
check(PRECISION, m, n, GOLD, n, C, n);
delete[] GOLD;
delete[] C;
delete[] Bt;
delete[] B;
delete[] At;
delete[] A;
}
int main(int argc, char *argv[]) {
check_gemm_works();
check_sgemm();
}

View file

@ -4,8 +4,14 @@
#include "libc/dce.h" #include "libc/dce.h"
COSMOPOLITAN_C_START_ COSMOPOLITAN_C_START_
int nsync_futex_wake_(_Atomic(int) *, int, char); #ifndef __cplusplus
int nsync_futex_wait_(_Atomic(int) *, int, char, const struct timespec *); #define _FUTEX_ATOMIC(x) _Atomic(x)
#else
#define _FUTEX_ATOMIC(x) x
#endif
int nsync_futex_wake_(_FUTEX_ATOMIC(int) *, int, char);
int nsync_futex_wait_(_FUTEX_ATOMIC(int) *, int, char, const struct timespec *);
COSMOPOLITAN_C_END_ COSMOPOLITAN_C_END_
#endif /* NSYNC_FUTEX_INTERNAL_H_ */ #endif /* NSYNC_FUTEX_INTERNAL_H_ */

View file

@ -380,9 +380,9 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) {
long rc; long rc;
#ifdef __COSMOPOLITAN__ #ifdef __COSMOPOLITAN__
if ((rc = nsync_futex_wait_((_Atomic(int) *)&(lck->lk.poll), poll_val, false, NULL)) != 0) { if ((rc = nsync_futex_wait_((int *)&(lck->lk.poll), poll_val, false, NULL)) != 0) {
#else #else
if ((rc = syscall(__NR_futex, (_Atomic(int) *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, if ((rc = syscall(__NR_futex, (int *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL,
NULL, 0)) != 0) { NULL, 0)) != 0) {
#endif #endif
KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) "
@ -462,7 +462,7 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) {
("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n", ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n",
lck, gtid)); lck, gtid));
#ifdef __COSMOPOLITAN__ #ifdef __COSMOPOLITAN__
nsync_futex_wake_((_Atomic(int) *)&(lck->lk.poll), 1, false); nsync_futex_wake_((int *)&(lck->lk.poll), 1, false);
#else #else
syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex),
NULL, NULL, 0); NULL, NULL, 0);