mirror of
				https://github.com/jart/cosmopolitan.git
				synced 2025-10-26 11:10:58 +00:00 
			
		
		
		
	Remove bad defines from early days of project
These definitions were causing issues with building LLVM. It is possible they also caused crashes we've seen with our MacOS ARM64 OpenMP support.
This commit is contained in:
		
							parent
							
								
									f25fbbaaeb
								
							
						
					
					
						commit
						5dd7ddb9ea
					
				
					 20 changed files with 568 additions and 140 deletions
				
			
		|  | @ -9,14 +9,13 @@ int mulaw(int); | ||||||
| int unmulaw(int); | int unmulaw(int); | ||||||
| void *double2byte(long, const void *, double, double) vallocesque; | void *double2byte(long, const void *, double, double) vallocesque; | ||||||
| void *byte2double(long, const void *, double, double) vallocesque; | void *byte2double(long, const void *, double, double) vallocesque; | ||||||
| void *dct(float[restrict hasatleast 8][8], unsigned, float, float, float, float, | void *dct(float[hasatleast 8][8], unsigned, float, float, float, float, float); | ||||||
|           float); | void *dctjpeg(float[hasatleast 8][8], unsigned); | ||||||
| void *dctjpeg(float[restrict hasatleast 8][8], unsigned); |  | ||||||
| double det3(const double[3][3]) nosideeffect; | double det3(const double[3][3]) nosideeffect; | ||||||
| void *inv3(double[restrict 3][3], const double[restrict 3][3], double); | void *inv3(double[3][3], const double[3][3], double); | ||||||
| void *matmul3(double[restrict 3][3], const double[3][3], const double[3][3]); | void *matmul3(double[3][3], const double[3][3], const double[3][3]); | ||||||
| void *vmatmul3(double[restrict 3], const double[3], const double[3][3]); | void *vmatmul3(double[3], const double[3], const double[3][3]); | ||||||
| void *matvmul3(double[restrict 3], const double[3][3], const double[3]); | void *matvmul3(double[3], const double[3][3], const double[3]); | ||||||
| double rgb2stdtv(double) pureconst; | double rgb2stdtv(double) pureconst; | ||||||
| double rgb2lintv(double) pureconst; | double rgb2lintv(double) pureconst; | ||||||
| double rgb2stdpc(double, double) pureconst; | double rgb2stdpc(double, double) pureconst; | ||||||
|  |  | ||||||
|  | @ -65,8 +65,8 @@ | ||||||
|  * |  * | ||||||
|  * @cost ~100ns |  * @cost ~100ns | ||||||
|  */ |  */ | ||||||
| void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0, | void *dct(float M[hasatleast 8][8], unsigned stride, float c0, float c1, | ||||||
|           float c1, float c2, float c3, float c4) { |           float c2, float c3, float c4) { | ||||||
|   unsigned y, x; |   unsigned y, x; | ||||||
|   for (y = 0; y < stride * 8; y += stride) { |   for (y = 0; y < stride * 8; y += stride) { | ||||||
|     DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7], |     DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7], | ||||||
|  | @ -79,7 +79,7 @@ void *dct(float M[restrict hasatleast 8][8], unsigned stride, float c0, | ||||||
|   return M; |   return M; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void *dctjpeg(float M[restrict hasatleast 8][8], unsigned stride) { | void *dctjpeg(float M[hasatleast 8][8], unsigned stride) { | ||||||
|   return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f, |   return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f, | ||||||
|              .707106781f); |              .707106781f); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -30,7 +30,7 @@ | ||||||
|  * @define 𝐀⁻¹=𝐁 such that 𝐀×𝐁=𝐁×𝐀=𝐈ₙ |  * @define 𝐀⁻¹=𝐁 such that 𝐀×𝐁=𝐁×𝐀=𝐈ₙ | ||||||
|  * @see det3() |  * @see det3() | ||||||
|  */ |  */ | ||||||
| void *inv3(double B[restrict 3][3], const double A[restrict 3][3], double d) { | void *inv3(double B[3][3], const double A[3][3], double d) { | ||||||
|   d = d ? 1 / d : NAN; |   d = d ? 1 / d : NAN; | ||||||
|   B[0][0] = (A[1][1] * A[2][2] - A[2][1] * A[1][2]) * d; |   B[0][0] = (A[1][1] * A[2][2] - A[2][1] * A[1][2]) * d; | ||||||
|   B[0][1] = (A[2][1] * A[0][2] - A[0][1] * A[2][2]) * d; |   B[0][1] = (A[2][1] * A[0][2] - A[0][1] * A[2][2]) * d; | ||||||
|  |  | ||||||
|  | @ -22,8 +22,7 @@ | ||||||
| /**
 | /**
 | ||||||
|  * Multiplies 3×3 matrices. |  * Multiplies 3×3 matrices. | ||||||
|  */ |  */ | ||||||
| void *matmul3(double R[restrict 3][3], const double A[3][3], | void *matmul3(double R[3][3], const double A[3][3], const double B[3][3]) { | ||||||
|               const double B[3][3]) { |  | ||||||
|   int i, j, k; |   int i, j, k; | ||||||
|   memset(R, 0, sizeof(double) * 3 * 3); |   memset(R, 0, sizeof(double) * 3 * 3); | ||||||
|   for (i = 0; i < 3; ++i) { |   for (i = 0; i < 3; ++i) { | ||||||
|  |  | ||||||
|  | @ -23,7 +23,7 @@ | ||||||
|  * |  * | ||||||
|  * @see vmatmul3() for noncommutative corollary |  * @see vmatmul3() for noncommutative corollary | ||||||
|  */ |  */ | ||||||
| void *matvmul3(double R[restrict 3], const double M[3][3], const double V[3]) { | void *matvmul3(double R[3], const double M[3][3], const double V[3]) { | ||||||
|   R[0] = V[0] * M[0][0] + V[1] * M[0][1] + V[2] * M[0][2]; |   R[0] = V[0] * M[0][0] + V[1] * M[0][1] + V[2] * M[0][2]; | ||||||
|   R[1] = V[0] * M[1][0] + V[1] * M[1][1] + V[2] * M[1][2]; |   R[1] = V[0] * M[1][0] + V[1] * M[1][1] + V[2] * M[1][2]; | ||||||
|   R[2] = V[0] * M[2][0] + V[1] * M[2][1] + V[2] * M[2][2]; |   R[2] = V[0] * M[2][0] + V[1] * M[2][1] + V[2] * M[2][2]; | ||||||
|  |  | ||||||
|  | @ -23,7 +23,7 @@ | ||||||
|  * |  * | ||||||
|  * @see matvmul3() for noncommutative corollary |  * @see matvmul3() for noncommutative corollary | ||||||
|  */ |  */ | ||||||
| void *vmatmul3(double R[restrict 3], const double V[3], const double M[3][3]) { | void *vmatmul3(double R[3], const double V[3], const double M[3][3]) { | ||||||
|   R[0] = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0]; |   R[0] = V[0] * M[0][0] + V[1] * M[1][0] + V[2] * M[2][0]; | ||||||
|   R[1] = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1]; |   R[1] = V[0] * M[0][1] + V[1] * M[1][1] + V[2] * M[2][1]; | ||||||
|   R[2] = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2]; |   R[2] = V[0] * M[0][2] + V[1] * M[1][2] + V[2] * M[2][2]; | ||||||
|  |  | ||||||
|  | @ -31,9 +31,8 @@ | ||||||
| #include "dsp/mpeg/video.h" | #include "dsp/mpeg/video.h" | ||||||
| #include "libc/log/check.h" | #include "libc/log/check.h" | ||||||
| 
 | 
 | ||||||
| forceinline void plm_video_process_macroblock(plm_video_t *self, | forceinline void plm_video_process_macroblock(plm_video_t *self, uint8_t *d, | ||||||
|                                               uint8_t *restrict d, |                                               uint8_t *s, int motion_h, | ||||||
|                                               uint8_t *restrict s, int motion_h, |  | ||||||
|                                               int motion_v, bool interpolate, |                                               int motion_v, bool interpolate, | ||||||
|                                               unsigned BW) { |                                               unsigned BW) { | ||||||
|   unsigned si, di, max_address; |   unsigned si, di, max_address; | ||||||
|  | @ -155,17 +154,17 @@ forceinline void plm_video_process_macroblock(plm_video_t *self, | ||||||
|   } |   } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *restrict d, | void plm_video_process_macroblock_8(plm_video_t *self, uint8_t *d, uint8_t *s, | ||||||
|                                     uint8_t *restrict s, int motion_h, |                                     int motion_h, int motion_v, | ||||||
|                                     int motion_v, bool interpolate) { |                                     bool interpolate) { | ||||||
|   DCHECK_ALIGNED(8, d); |   DCHECK_ALIGNED(8, d); | ||||||
|   DCHECK_ALIGNED(8, s); |   DCHECK_ALIGNED(8, s); | ||||||
|   plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8); |   plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 8); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *restrict d, | void plm_video_process_macroblock_16(plm_video_t *self, uint8_t *d, uint8_t *s, | ||||||
|                                      uint8_t *restrict s, int motion_h, |                                      int motion_h, int motion_v, | ||||||
|                                      int motion_v, bool interpolate) { |                                      bool interpolate) { | ||||||
|   DCHECK_ALIGNED(16, d); |   DCHECK_ALIGNED(16, d); | ||||||
|   DCHECK_ALIGNED(16, s); |   DCHECK_ALIGNED(16, s); | ||||||
|   plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16); |   plm_video_process_macroblock(self, d, s, motion_h, motion_v, interpolate, 16); | ||||||
|  |  | ||||||
|  | @ -51,10 +51,10 @@ typedef struct plm_video_t { | ||||||
|   int assume_no_b_frames; |   int assume_no_b_frames; | ||||||
| } plm_video_t; | } plm_video_t; | ||||||
| 
 | 
 | ||||||
| void plm_video_process_macroblock_8(plm_video_t *, uint8_t *restrict, | void plm_video_process_macroblock_8(plm_video_t *, uint8_t *, uint8_t *, int, | ||||||
|                                     uint8_t *restrict, int, int, bool); |                                     int, bool); | ||||||
| void plm_video_process_macroblock_16(plm_video_t *, uint8_t *restrict, | void plm_video_process_macroblock_16(plm_video_t *, uint8_t *, uint8_t *, int, | ||||||
|                                      uint8_t *restrict, int, int, bool); |                                      int, bool); | ||||||
| 
 | 
 | ||||||
| COSMOPOLITAN_C_END_ | COSMOPOLITAN_C_END_ | ||||||
| #endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */ | #endif /* COSMOPOLITAN_DSP_MPEG_VIDEO_H_ */ | ||||||
|  |  | ||||||
|  | @ -149,12 +149,11 @@ static int Sharpen(int ax, int bx, int cx) { | ||||||
| 
 | 
 | ||||||
| static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw, | static void GyaradosImpl(long dyw, long dxw, int dst[dyw][dxw], long syw, | ||||||
|                          long sxw, const int src[syw][sxw], long dyn, long dxn, |                          long sxw, const int src[syw][sxw], long dyn, long dxn, | ||||||
|                          long syn, long sxn, int tmp0[restrict dyn][sxn], |                          long syn, long sxn, int tmp0[dyn][sxn], | ||||||
|                          int tmp1[restrict dyn][sxn], |                          int tmp1[dyn][sxn], int tmp2[dyn][dxn], long yfn, | ||||||
|                          int tmp2[restrict dyn][dxn], long yfn, long xfn, |                          long xfn, const short fyi[dyn][yfn], | ||||||
|                          const short fyi[dyn][yfn], const short fyw[dyn][yfn], |                          const short fyw[dyn][yfn], const short fxi[dxn][xfn], | ||||||
|                          const short fxi[dxn][xfn], const short fxw[dxn][xfn], |                          const short fxw[dxn][xfn], bool sharpen) { | ||||||
|                          bool sharpen) { |  | ||||||
|   long i; |   long i; | ||||||
|   int eax, dy, dx, sx; |   int eax, dy, dx, sx; | ||||||
|   for (sx = 0; sx < sxn; ++sx) { |   for (sx = 0; sx < sxn; ++sx) { | ||||||
|  |  | ||||||
|  | @ -106,9 +106,9 @@ void *Magkern2xY(long ys, long xs, unsigned char p[ys][xs], long yn, long xn) { | ||||||
|   return p; |   return p; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void *MagikarpY(long dys, long dxs, unsigned char d[restrict dys][dxs], | void *MagikarpY(long dys, long dxs, unsigned char d[dys][dxs], long sys, | ||||||
|                 long sys, long sxs, const unsigned char s[sys][sxs], long yn, |                 long sxs, const unsigned char s[sys][sxs], long yn, long xn, | ||||||
|                 long xn, const signed char K[8]) { |                 const signed char K[8]) { | ||||||
|   long y, x; |   long y, x; | ||||||
|   for (y = 0; y < yn; ++y) { |   for (y = 0; y < yn; ++y) { | ||||||
|     for (x = 0; x < xn; ++x) { |     for (x = 0; x < xn; ++x) { | ||||||
|  |  | ||||||
|  | @ -135,7 +135,7 @@ void sgemm(int m, int n, int k,      // | ||||||
|            const float *B, int ldb,  //
 |            const float *B, int ldb,  //
 | ||||||
|            float *C, int ldc) { |            float *C, int ldc) { | ||||||
|   int nth = sysconf(_SC_NPROCESSORS_ONLN); |   int nth = sysconf(_SC_NPROCESSORS_ONLN); | ||||||
| #pragma omp parallel for | #pragma omp parallel for schedule(static) | ||||||
|   for (int ith = 0; ith < nth; ++ith) { |   for (int ith = 0; ith < nth; ++ith) { | ||||||
|     ansiBLAS tb{k, A, lda, B, ldb, C, ldc, ith, nth}; |     ansiBLAS tb{k, A, lda, B, ldb, C, ldc, ith, nth}; | ||||||
|     tb.matmul(m, n); |     tb.matmul(m, n); | ||||||
|  |  | ||||||
|  | @ -2,7 +2,13 @@ | ||||||
| #define COSMOPOLITAN_LIBC_COSMO_H_ | #define COSMOPOLITAN_LIBC_COSMO_H_ | ||||||
| COSMOPOLITAN_C_START_ | COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
| errno_t cosmo_once(_Atomic(unsigned) *, void (*)(void)) libcesque; | #ifndef __cplusplus | ||||||
|  | #define _COSMO_ATOMIC(x) _Atomic(x) | ||||||
|  | #else | ||||||
|  | #define _COSMO_ATOMIC(x) x | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | errno_t cosmo_once(_COSMO_ATOMIC(unsigned) *, void (*)(void)) libcesque; | ||||||
| int systemvpe(const char *, char *const[], char *const[]) libcesque; | int systemvpe(const char *, char *const[], char *const[]) libcesque; | ||||||
| char *GetProgramExecutableName(void) libcesque; | char *GetProgramExecutableName(void) libcesque; | ||||||
| void unleaf(void) libcesque; | void unleaf(void) libcesque; | ||||||
|  |  | ||||||
|  | @ -1,7 +1,3 @@ | ||||||
| #if __GNUC__ + 0 < 2
 |  | ||||||
| #define __attribute__(x)
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #ifndef __cplusplus
 | #ifndef __cplusplus
 | ||||||
| #define COSMOPOLITAN_C_START_
 | #define COSMOPOLITAN_C_START_
 | ||||||
| #define COSMOPOLITAN_C_END_
 | #define COSMOPOLITAN_C_END_
 | ||||||
|  | @ -10,73 +6,16 @@ | ||||||
| #define COSMOPOLITAN_CXX_USING_
 | #define COSMOPOLITAN_CXX_USING_
 | ||||||
| #endif
 | #endif
 | ||||||
| 
 | 
 | ||||||
| #ifndef __ia16__
 |  | ||||||
| #define __far
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #if !defined(__GNUC__) && __cplusplus + 0 >= 201103L
 | #if !defined(__GNUC__) && __cplusplus + 0 >= 201103L
 | ||||||
| #define typeof(x) decltype(x)
 | #define typeof(x) decltype(x)
 | ||||||
| #elif !defined(__GNUC__) && __STDC_VERSION__ + 0 < 201112
 | #elif !defined(__GNUC__) && __STDC_VERSION__ + 0 < 201112
 | ||||||
| #define typeof(x) __typeof(x)
 | #define typeof(x) __typeof(x)
 | ||||||
| #endif
 | #endif
 | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus
 |  | ||||||
| #if __cplusplus >= 201103L
 |  | ||||||
| #define _Alignof(x) alignof(x)
 |  | ||||||
| #endif /* C++11 */
 |  | ||||||
| #else  /* __cplusplus */
 |  | ||||||
| #if __STDC_VERSION__ + 0 < 201112
 |  | ||||||
| #if __GNUC__ + _MSC_VER + 0
 |  | ||||||
| #define _Alignof(x) __alignof(x)
 |  | ||||||
| #else
 |  | ||||||
| #define _Alignof(x) /* basically all it ever did lool */ sizeof(x)
 |  | ||||||
| #endif /* GNU/MSVC/!ANSI */
 |  | ||||||
| #endif /* C11 */
 |  | ||||||
| #endif /* __cplusplus */
 |  | ||||||
| 
 |  | ||||||
| #if !defined(__cplusplus) && !defined(inline) && __STDC_VERSION__ + 0 < 199901
 |  | ||||||
| #if defined(__GNUC__) || defined(_MSC_VER)
 |  | ||||||
| #define inline __inline
 |  | ||||||
| #else
 |  | ||||||
| #define inline
 |  | ||||||
| #define __inline
 |  | ||||||
| #endif
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #ifdef __chibicc__
 | #ifdef __chibicc__
 | ||||||
| #define __extension__
 | #define __extension__
 | ||||||
| #endif
 | #endif
 | ||||||
| 
 | 
 | ||||||
| #if __STDC_VERSION__ + 0 < 201112
 |  | ||||||
| #ifdef __GNUC__
 |  | ||||||
| #define _Alignas(x) __attribute__((__aligned__(x)))
 |  | ||||||
| #elif defined(_MSC_VER)
 |  | ||||||
| #define _Alignas(x) __declspec(align(x))
 |  | ||||||
| #endif
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #ifdef _MSC_VER
 |  | ||||||
| #define __builtin_unreachable() __assume(false)
 |  | ||||||
| #elif !((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 405 || \
 |  | ||||||
|         defined(__clang__) || defined(__INTEL_COMPILER) ||    \ |  | ||||||
|         __has_builtin(__builtin_unreachable)) |  | ||||||
| #define __builtin_unreachable() \
 |  | ||||||
|   for (;;) {                    \ |  | ||||||
|   } |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #if (!defined(__llvm__) && !__has_builtin(__builtin_assume))
 |  | ||||||
| #define __builtin_assume(x)    \
 |  | ||||||
|   do {                         \ |  | ||||||
|     if (!(x))                  \ |  | ||||||
|       __builtin_unreachable(); \ |  | ||||||
|   } while (0) |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #if __STDC_VERSION__ + 0 < 201112
 |  | ||||||
| #define _Atomic(t) volatile t
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #ifdef __llvm__
 | #ifdef __llvm__
 | ||||||
| #define __gnu_printf__ __printf__
 | #define __gnu_printf__ __printf__
 | ||||||
| #define __gnu_scanf__  __scanf__
 | #define __gnu_scanf__  __scanf__
 | ||||||
|  | @ -325,16 +264,6 @@ typedef struct { | ||||||
| #define hasatleast
 | #define hasatleast
 | ||||||
| #endif
 | #endif
 | ||||||
| 
 | 
 | ||||||
| #if __STDC_VERSION__ + 0 < 199901L && !defined(restrict)
 |  | ||||||
| #if !defined(__cplusplus) && \
 |  | ||||||
|     ((__GNUC__ + 0) * 100 + (__GNUC_MINOR__ + 0) >= 301 || defined(_MSC_VER)) |  | ||||||
| #define restrict __restrict__
 |  | ||||||
| #else
 |  | ||||||
| #define restrict
 |  | ||||||
| #define __restrict
 |  | ||||||
| #endif
 |  | ||||||
| #endif
 |  | ||||||
| 
 |  | ||||||
| #ifndef dontcallback
 | #ifndef dontcallback
 | ||||||
| #if (__has_attribute(__leaf__) || \
 | #if (__has_attribute(__leaf__) || \
 | ||||||
|      (!defined(__llvm__) &&       \ |      (!defined(__llvm__) &&       \ | ||||||
|  | @ -595,10 +524,7 @@ typedef struct { | ||||||
| #ifdef __aarch64__
 | #ifdef __aarch64__
 | ||||||
| /* raise sigill (not sigtrap) like x86 does */ | /* raise sigill (not sigtrap) like x86 does */ | ||||||
| #define __builtin_trap() \
 | #define __builtin_trap() \
 | ||||||
|   do {                       \ |   (({ __asm__("udf\t#0x666"); }), __builtin_unreachable()) | ||||||
|     __asm__("udf\t#0x666");  \ |  | ||||||
|     __builtin_unreachable(); \ |  | ||||||
|   } while (0) |  | ||||||
| #endif
 | #endif
 | ||||||
| 
 | 
 | ||||||
| #endif /* _COSMO_SOURCE */
 | #endif /* _COSMO_SOURCE */
 | ||||||
|  |  | ||||||
|  | @ -6,6 +6,12 @@ COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
| #define ZIPOS_SYNTHETIC_DIRECTORY 0 | #define ZIPOS_SYNTHETIC_DIRECTORY 0 | ||||||
| 
 | 
 | ||||||
|  | #ifndef __cplusplus | ||||||
|  | #define _ZIPOS_ATOMIC(x) _Atomic(x) | ||||||
|  | #else | ||||||
|  | #define _ZIPOS_ATOMIC(x) x | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| struct stat; | struct stat; | ||||||
| struct iovec; | struct iovec; | ||||||
| struct Zipos; | struct Zipos; | ||||||
|  | @ -21,8 +27,8 @@ struct ZiposHandle { | ||||||
|   size_t size; |   size_t size; | ||||||
|   size_t mapsize; |   size_t mapsize; | ||||||
|   size_t cfile; |   size_t cfile; | ||||||
|   _Atomic(size_t) refs; |   _ZIPOS_ATOMIC(size_t) refs; | ||||||
|   _Atomic(size_t) pos; |   _ZIPOS_ATOMIC(size_t) pos; | ||||||
|   uint8_t *mem; |   uint8_t *mem; | ||||||
|   uint8_t data[]; |   uint8_t data[]; | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -26,10 +26,10 @@ COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
| void *memset(void *, int, size_t) memcpyesque; | void *memset(void *, int, size_t) memcpyesque; | ||||||
| void *memmove(void *, const void *, size_t) memcpyesque; | void *memmove(void *, const void *, size_t) memcpyesque; | ||||||
| void *memcpy(void *restrict, const void *restrict, size_t) memcpyesque; | void *memcpy(void *, const void *, size_t) memcpyesque; | ||||||
| void *mempcpy(void *restrict, const void *restrict, size_t) memcpyesque; | void *mempcpy(void *, const void *, size_t) memcpyesque; | ||||||
| char *hexpcpy(char *restrict, const void *restrict, size_t) memcpyesque; | char *hexpcpy(char *, const void *, size_t) memcpyesque; | ||||||
| void *memccpy(void *restrict, const void *restrict, int, size_t) memcpyesque; | void *memccpy(void *, const void *, int, size_t) memcpyesque; | ||||||
| void explicit_bzero(void *, size_t); | void explicit_bzero(void *, size_t); | ||||||
| 
 | 
 | ||||||
| int memcmp(const void *, const void *, size_t) strlenesque; | int memcmp(const void *, const void *, size_t) strlenesque; | ||||||
|  |  | ||||||
|  | @ -3,6 +3,12 @@ | ||||||
| #include "libc/calls/struct/timespec.h" | #include "libc/calls/struct/timespec.h" | ||||||
| COSMOPOLITAN_C_START_ | COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
|  | #ifndef __cplusplus | ||||||
|  | #define _SEM_ATOMIC(x) _Atomic(x) | ||||||
|  | #else | ||||||
|  | #define _SEM_ATOMIC(x) x | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| #define SEM_FAILED        ((sem_t *)0) | #define SEM_FAILED        ((sem_t *)0) | ||||||
| #define SEM_MAGIC_NAMED   0xDEADBEEFu | #define SEM_MAGIC_NAMED   0xDEADBEEFu | ||||||
| #define SEM_MAGIC_UNNAMED 0xFEEDABEEu | #define SEM_MAGIC_UNNAMED 0xFEEDABEEu | ||||||
|  | @ -11,9 +17,9 @@ COSMOPOLITAN_C_START_ | ||||||
| typedef struct { | typedef struct { | ||||||
|   union { |   union { | ||||||
|     struct { |     struct { | ||||||
|       _Atomic(int) sem_value; |       _SEM_ATOMIC(int) sem_value; | ||||||
|       _Atomic(int) sem_waiters; |       _SEM_ATOMIC(int) sem_waiters; | ||||||
|       _Atomic(int) sem_prefs; /* named only */ |       _SEM_ATOMIC(int) sem_prefs; /* named only */ | ||||||
|       unsigned sem_magic; |       unsigned sem_magic; | ||||||
|       int64_t sem_dev;     /* named only */ |       int64_t sem_dev;     /* named only */ | ||||||
|       int64_t sem_ino;     /* named only */ |       int64_t sem_ino;     /* named only */ | ||||||
|  |  | ||||||
|  | @ -48,6 +48,12 @@ COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
| #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE} | #define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE} | ||||||
| 
 | 
 | ||||||
|  | #ifndef __cplusplus | ||||||
|  | #define _PTHREAD_ATOMIC(x) _Atomic(x) | ||||||
|  | #else | ||||||
|  | #define _PTHREAD_ATOMIC(x) x | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
| typedef uintptr_t pthread_t; | typedef uintptr_t pthread_t; | ||||||
| typedef int pthread_id_np_t; | typedef int pthread_id_np_t; | ||||||
| typedef char pthread_condattr_t; | typedef char pthread_condattr_t; | ||||||
|  | @ -57,20 +63,20 @@ typedef unsigned pthread_key_t; | ||||||
| typedef void (*pthread_key_dtor)(void *); | typedef void (*pthread_key_dtor)(void *); | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_once_s { | typedef struct pthread_once_s { | ||||||
|   _Atomic(uint32_t) _lock; |   _PTHREAD_ATOMIC(uint32_t) _lock; | ||||||
| } pthread_once_t; | } pthread_once_t; | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_spinlock_s { | typedef struct pthread_spinlock_s { | ||||||
|   _Atomic(int) _lock; |   _PTHREAD_ATOMIC(int) _lock; | ||||||
| } pthread_spinlock_t; | } pthread_spinlock_t; | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_mutex_s { | typedef struct pthread_mutex_s { | ||||||
|   uint32_t _nsync; |   uint32_t _nsync; | ||||||
|   union { |   union { | ||||||
|     int32_t _pid; |     int32_t _pid; | ||||||
|     _Atomic(int32_t) _futex; |     _PTHREAD_ATOMIC(int32_t) _futex; | ||||||
|   }; |   }; | ||||||
|   _Atomic(uint64_t) _word; |   _PTHREAD_ATOMIC(uint64_t) _word; | ||||||
| } pthread_mutex_t; | } pthread_mutex_t; | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_mutexattr_s { | typedef struct pthread_mutexattr_s { | ||||||
|  | @ -85,8 +91,8 @@ typedef struct pthread_cond_s { | ||||||
|       char _pshared; |       char _pshared; | ||||||
|     }; |     }; | ||||||
|   }; |   }; | ||||||
|   _Atomic(uint32_t) _sequence; |   _PTHREAD_ATOMIC(uint32_t) _sequence; | ||||||
|   _Atomic(uint32_t) _waiters; |   _PTHREAD_ATOMIC(uint32_t) _waiters; | ||||||
| } pthread_cond_t; | } pthread_cond_t; | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_rwlock_s { | typedef struct pthread_rwlock_s { | ||||||
|  | @ -97,8 +103,8 @@ typedef struct pthread_rwlock_s { | ||||||
| typedef struct pthread_barrier_s { | typedef struct pthread_barrier_s { | ||||||
|   int _count; |   int _count; | ||||||
|   char _pshared; |   char _pshared; | ||||||
|   _Atomic(int) _counter; |   _PTHREAD_ATOMIC(int) _counter; | ||||||
|   _Atomic(int) _waiters; |   _PTHREAD_ATOMIC(int) _waiters; | ||||||
| } pthread_barrier_t; | } pthread_barrier_t; | ||||||
| 
 | 
 | ||||||
| typedef struct pthread_attr_s { | typedef struct pthread_attr_s { | ||||||
|  |  | ||||||
							
								
								
									
										476
									
								
								test/libcxx/openmp_test.cc
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										476
									
								
								test/libcxx/openmp_test.cc
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,476 @@ | ||||||
|  | /*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
 | ||||||
|  | │ vi: set et ft=cpp ts=2 sts=2 sw=2 fenc=utf-8                             :vi │ | ||||||
|  | ╞══════════════════════════════════════════════════════════════════════════════╡ | ||||||
|  | │ Copyright 2024 Justine Alexandra Roberts Tunney                              │ | ||||||
|  | │                                                                              │ | ||||||
|  | │ Permission to use, copy, modify, and/or distribute this software for         │ | ||||||
|  | │ any purpose with or without fee is hereby granted, provided that the         │ | ||||||
|  | │ above copyright notice and this permission notice appear in all copies.      │ | ||||||
|  | │                                                                              │ | ||||||
|  | │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │ | ||||||
|  | │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │ | ||||||
|  | │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │ | ||||||
|  | │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │ | ||||||
|  | │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │ | ||||||
|  | │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │ | ||||||
|  | │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │ | ||||||
|  | │ PERFORMANCE OF THIS SOFTWARE.                                                │ | ||||||
|  | ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||||
|  | #include <algorithm> | ||||||
|  | #include <atomic> | ||||||
|  | #include <cmath> | ||||||
|  | #include <cstdio> | ||||||
|  | #include <cstring> | ||||||
|  | #include <ctime> | ||||||
|  | #include "libc/stdio/rand.h" | ||||||
|  | 
 | ||||||
|  | #define PRECISION 2e-6 | ||||||
|  | #define LV1DCACHE 49152 | ||||||
|  | #define THRESHOLD 3000000 | ||||||
|  | 
 | ||||||
|  | #if defined(__OPTIMIZE__) && !defined(__SANITIZE_ADDRESS__) | ||||||
|  | #define ITERATIONS 5 | ||||||
|  | #else | ||||||
|  | #define ITERATIONS 1 | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | #define OPTIMIZED __attribute__((__optimize__("-O3,-ffast-math"))) | ||||||
|  | #define PORTABLE                         \ | ||||||
|  |   __target_clones("arch=znver4,"         \ | ||||||
|  |                   "arch=znver3,"         \ | ||||||
|  |                   "arch=sapphirerapids," \ | ||||||
|  |                   "arch=alderlake,"      \ | ||||||
|  |                   "arch=rocketlake,"     \ | ||||||
|  |                   "arch=cooperlake,"     \ | ||||||
|  |                   "arch=tigerlake,"      \ | ||||||
|  |                   "arch=cascadelake,"    \ | ||||||
|  |                   "arch=skylake-avx512," \ | ||||||
|  |                   "arch=skylake,"        \ | ||||||
|  |                   "arch=znver1,"         \ | ||||||
|  |                   "arch=tremont,"        \ | ||||||
|  |                   "fma,"                 \ | ||||||
|  |                   "avx") | ||||||
|  | 
 | ||||||
|  | static bool is_self_testing; | ||||||
|  | 
 | ||||||
|  | // m×n → n×m
 | ||||||
|  | template <typename TA, typename TB> | ||||||
|  | void transpose(long m, long n, const TA *A, long lda, TB *B, long ldb) { | ||||||
|  | #pragma omp parallel for collapse(2) if (m * n > THRESHOLD) | ||||||
|  |   for (long i = 0; i < m; ++i) | ||||||
|  |     for (long j = 0; j < n; ++j) { | ||||||
|  |       B[ldb * j + i] = A[lda * i + j]; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // m×k * k×n → m×n
 | ||||||
|  | // k×m * k×n → m×n if aᵀ
 | ||||||
|  | // m×k * n×k → m×n if bᵀ
 | ||||||
|  | // k×m * n×k → m×n if aᵀ and bᵀ
 | ||||||
|  | template <typename TC, typename TA, typename TB> | ||||||
|  | void dgemm(bool aᵀ, bool bᵀ, long m, long n, long k, float α, const TA *A, | ||||||
|  |            long lda, const TB *B, long ldb, float β, TC *C, long ldc) { | ||||||
|  | #pragma omp parallel for collapse(2) if (m * n * k > THRESHOLD) | ||||||
|  |   for (long i = 0; i < m; ++i) | ||||||
|  |     for (long j = 0; j < n; ++j) { | ||||||
|  |       double sum = 0; | ||||||
|  |       for (long l = 0; l < k; ++l) | ||||||
|  |         sum = std::fma((aᵀ ? A[lda * l + i] : A[lda * i + l]) * α, | ||||||
|  |                        (bᵀ ? B[ldb * j + l] : B[ldb * l + j]), sum); | ||||||
|  |       C[ldc * i + j] = C[ldc * i + j] * β + sum; | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename T, typename TC, typename TA, typename TB> | ||||||
|  | struct Gemmlin { | ||||||
|  |  public: | ||||||
|  |   Gemmlin(bool aT, bool bT, float α, const TA *A, long lda, const TB *B, | ||||||
|  |           long ldb, float β, TC *C, long ldc) | ||||||
|  |       : aT(aT), | ||||||
|  |         bT(bT), | ||||||
|  |         α(α), | ||||||
|  |         A(A), | ||||||
|  |         lda(lda), | ||||||
|  |         B(B), | ||||||
|  |         ldb(ldb), | ||||||
|  |         β(β), | ||||||
|  |         C(C), | ||||||
|  |         ldc(ldc) { | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   void gemm(long m, long n, long k) { | ||||||
|  |     if (!m || !n) | ||||||
|  |       return; | ||||||
|  |     for (long i = 0; i < m; ++i) | ||||||
|  |       for (long j = 0; j < n; ++j) { | ||||||
|  |         C[ldc * i + j] *= β; | ||||||
|  |       } | ||||||
|  |     if (!k) | ||||||
|  |       return; | ||||||
|  |     cub = sqrt(LV1DCACHE) / sqrt(sizeof(T) * 3); | ||||||
|  |     mnpack(0, m, 0, n, 0, k); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |  private: | ||||||
|  |   void mnpack(long m0, long m,  //
 | ||||||
|  |               long n0, long n,  //
 | ||||||
|  |               long k0, long k) { | ||||||
|  |     long mc = rounddown(std::min(m - m0, cub), 4); | ||||||
|  |     long mp = m0 + (m - m0) / mc * mc; | ||||||
|  |     long nc = rounddown(std::min(n - n0, cub), 4); | ||||||
|  |     long np = n0 + (n - n0) / nc * nc; | ||||||
|  |     long kc = rounddown(std::min(k - k0, cub), 4); | ||||||
|  |     long kp = k0 + (k - k0) / kc * kc; | ||||||
|  |     kpack(m0, mc, mp, n0, nc, np, k0, kc, k, kp); | ||||||
|  |     if (m - mp) | ||||||
|  |       mnpack(mp, m, n0, np, k0, k); | ||||||
|  |     if (n - np) | ||||||
|  |       mnpack(m0, mp, np, n, k0, k); | ||||||
|  |     if (m - mp && n - np) | ||||||
|  |       mnpack(mp, m, np, n, k0, k); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   void kpack(long m0, long mc, long m,  //
 | ||||||
|  |              long n0, long nc, long n,  //
 | ||||||
|  |              long k0, long kc, long k,  //
 | ||||||
|  |              long kp) { | ||||||
|  |     rpack(m0, mc, m, n0, nc, n, k0, kc, kp); | ||||||
|  |     if (k - kp) | ||||||
|  |       rpack(m0, mc, m, n0, nc, n, kp, k - kp, k); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   void rpack(long m0, long mc, long m,  //
 | ||||||
|  |              long n0, long nc, long n,  //
 | ||||||
|  |              long k0, long kc, long k) { | ||||||
|  |     if (!(mc % 4) && !(nc % 4)) | ||||||
|  |       bgemm<4, 4>(m0, mc, m, n0, nc, n, k0, kc, k); | ||||||
|  |     else | ||||||
|  |       bgemm<1, 1>(m0, mc, m, n0, nc, n, k0, kc, k); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   template <int mr, int nr> | ||||||
|  |   void bgemm(long m0, long mc, long m,  //
 | ||||||
|  |              long n0, long nc, long n,  //
 | ||||||
|  |              long k0, long kc, long k) { | ||||||
|  |     ops = (m - m0) * (n - n0) * (k - k0); | ||||||
|  |     ml = (m - m0) / mc; | ||||||
|  |     nl = (n - n0) / nc; | ||||||
|  |     locks = new lock[ml * nl]; | ||||||
|  |     there_will_be_blocks<mr, nr>(m0, mc, m, n0, nc, n, k0, kc, k); | ||||||
|  |     delete[] locks; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   template <int mr, int nr> | ||||||
|  |   void there_will_be_blocks(long m0, long mc, long m, long n0, long nc, long n, | ||||||
|  |                             long k0, long kc, long k) { | ||||||
|  | #pragma omp parallel for collapse(2) if (ops > THRESHOLD && mc * kc > 16) | ||||||
|  |     for (long ic = m0; ic < m; ic += mc) | ||||||
|  |       for (long pc = k0; pc < k; pc += kc) | ||||||
|  |         gizmo<mr, nr>(m0, mc, ic, n0, nc, k0, kc, pc, n); | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   template <int mr, int nr> | ||||||
|  |   PORTABLE OPTIMIZED void gizmo(long m0, long mc, long ic, long n0, long nc, | ||||||
|  |                                 long k0, long kc, long pc, long n) { | ||||||
|  |     T Ac[mc / mr][kc][mr]; | ||||||
|  |     for (long i = 0; i < mc; ++i) | ||||||
|  |       for (long j = 0; j < kc; ++j) | ||||||
|  |         Ac[i / mr][j][i % mr] = α * (aT ? A[lda * (pc + j) + (ic + i)] | ||||||
|  |                                         : A[lda * (ic + i) + (pc + j)]); | ||||||
|  |     for (long jc = n0; jc < n; jc += nc) { | ||||||
|  |       T Bc[nc / nr][nr][kc]; | ||||||
|  |       for (long j = 0; j < nc; ++j) | ||||||
|  |         for (long i = 0; i < kc; ++i) | ||||||
|  |           Bc[j / nr][j % nr][i] = | ||||||
|  |               bT ? B[ldb * (jc + j) + (pc + i)] : B[ldb * (pc + i) + (jc + j)]; | ||||||
|  |       T Cc[nc / nr][mc / mr][nr][mr]; | ||||||
|  |       memset(Cc, 0, nc * mc * sizeof(float)); | ||||||
|  |       for (long jr = 0; jr < nc / nr; ++jr) | ||||||
|  |         for (long ir = 0; ir < mc / mr; ++ir) | ||||||
|  |           for (long pr = 0; pr < kc; ++pr) | ||||||
|  |             for (long j = 0; j < nr; ++j) | ||||||
|  |               for (long i = 0; i < mr; ++i) | ||||||
|  |                 Cc[jr][ir][j][i] += Ac[ir][pr][i] * Bc[jr][j][pr]; | ||||||
|  |       const long lk = nl * ((ic - m0) / mc) + ((jc - n0) / nc); | ||||||
|  |       locks[lk].acquire(); | ||||||
|  |       for (long ir = 0; ir < mc; ir += mr) | ||||||
|  |         for (long jr = 0; jr < nc; jr += nr) | ||||||
|  |           for (long i = 0; i < mr; ++i) | ||||||
|  |             for (long j = 0; j < nr; ++j) | ||||||
|  |               C[ldc * (ic + ir + i) + (jc + jr + j)] += | ||||||
|  |                   Cc[jr / nr][ir / mr][j][i]; | ||||||
|  |       locks[lk].release(); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   inline long rounddown(long x, long r) { | ||||||
|  |     if (x < r) | ||||||
|  |       return x; | ||||||
|  |     else | ||||||
|  |       return x & -r; | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  |   class lock { | ||||||
|  |    public: | ||||||
|  |     lock() = default; | ||||||
|  |     void acquire() { | ||||||
|  |       while (lock_.exchange(true, std::memory_order_acquire)) { | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     void release() { | ||||||
|  |       lock_.store(false, std::memory_order_release); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |    private: | ||||||
|  |     std::atomic_bool lock_ = false; | ||||||
|  |   }; | ||||||
|  | 
 | ||||||
|  |   bool aT; | ||||||
|  |   bool bT; | ||||||
|  |   float α; | ||||||
|  |   const TA *A; | ||||||
|  |   long lda; | ||||||
|  |   const TB *B; | ||||||
|  |   long ldb; | ||||||
|  |   float β; | ||||||
|  |   TC *C; | ||||||
|  |   long ldc; | ||||||
|  |   long ops; | ||||||
|  |   long nl; | ||||||
|  |   long ml; | ||||||
|  |   lock *locks; | ||||||
|  |   long cub; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | template <typename TC, typename TA, typename TB> | ||||||
|  | void sgemm(bool aT, bool bT, long m, long n, long k, float α, const TA *A, | ||||||
|  |            long lda, const TB *B, long ldb, float β, TC *C, long ldc) { | ||||||
|  |   Gemmlin<float, TC, TA, TB> g{aT, bT, α, A, lda, B, ldb, β, C, ldc}; | ||||||
|  |   g.gemm(m, n, k); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename TA, typename TB> | ||||||
|  | void show(FILE *f, long max, long m, long n, const TA *A, long lda, const TB *B, | ||||||
|  |           long ldb) { | ||||||
|  |   flockfile(f); | ||||||
|  |   fprintf(f, "      "); | ||||||
|  |   for (long j = 0; j < n; ++j) { | ||||||
|  |     fprintf(f, "%13ld", j); | ||||||
|  |   } | ||||||
|  |   fprintf(f, "\n"); | ||||||
|  |   for (long i = 0; i < m; ++i) { | ||||||
|  |     if (i == max) { | ||||||
|  |       fprintf(f, "...\n"); | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |     fprintf(f, "%5ld ", i); | ||||||
|  |     for (long j = 0; j < n; ++j) { | ||||||
|  |       if (j == max) { | ||||||
|  |         fprintf(f, " ..."); | ||||||
|  |         break; | ||||||
|  |       } | ||||||
|  |       char ba[16], bb[16]; | ||||||
|  |       sprintf(ba, "%13.7f", static_cast<double>(A[lda * i + j])); | ||||||
|  |       sprintf(bb, "%13.7f", static_cast<double>(B[ldb * i + j])); | ||||||
|  |       for (long k = 0; ba[k] && bb[k]; ++k) { | ||||||
|  |         if (ba[k] != bb[k]) | ||||||
|  |           fputs_unlocked("\33[31m", f); | ||||||
|  |         fputc_unlocked(ba[k], f); | ||||||
|  |         if (ba[k] != bb[k]) | ||||||
|  |           fputs_unlocked("\33[0m", f); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |     fprintf(f, "\n"); | ||||||
|  |   } | ||||||
|  |   funlockfile(f); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | inline unsigned long GetDoubleBits(double f) { | ||||||
|  |   union { | ||||||
|  |     double f; | ||||||
|  |     unsigned long i; | ||||||
|  |   } u; | ||||||
|  |   u.f = f; | ||||||
|  |   return u.i; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | inline bool IsNan(double x) { | ||||||
|  |   return (GetDoubleBits(x) & (-1ull >> 1)) > (0x7ffull << 52); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename TA, typename TB> | ||||||
|  | double diff(long m, long n, const TA *Want, long lda, const TB *Got, long ldb) { | ||||||
|  |   double s = 0; | ||||||
|  |   int got_nans = 0; | ||||||
|  |   int want_nans = 0; | ||||||
|  |   for (long i = 0; i < m; ++i) | ||||||
|  |     for (long j = 0; j < n; ++j) | ||||||
|  |       if (IsNan(Want[ldb * i + j])) | ||||||
|  |         ++want_nans; | ||||||
|  |       else if (IsNan(Got[ldb * i + j])) | ||||||
|  |         ++got_nans; | ||||||
|  |       else | ||||||
|  |         s += std::fabs(Want[lda * i + j] - Got[ldb * i + j]); | ||||||
|  |   if (got_nans) | ||||||
|  |     printf("WARNING: got %d NaNs!\n", got_nans); | ||||||
|  |   if (want_nans) | ||||||
|  |     printf("WARNING: want array has %d NaNs!\n", want_nans); | ||||||
|  |   return s / (m * n); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename TA, typename TB> | ||||||
|  | void show_error(FILE *f, long max, long m, long n, const TA *A, long lda, | ||||||
|  |                 const TB *B, long ldb, const char *file, int line, double sad, | ||||||
|  |                 double tol) { | ||||||
|  |   fprintf(f, "%s:%d: sad %.17g exceeds %g\nwant\n", file, line, sad, tol); | ||||||
|  |   show(f, max, m, n, A, lda, B, ldb); | ||||||
|  |   fprintf(f, "got\n"); | ||||||
|  |   show(f, max, m, n, B, ldb, A, lda); | ||||||
|  |   fprintf(f, "\n"); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename TA, typename TB> | ||||||
|  | void check(double tol, long m, long n, const TA *A, long lda, const TB *B, | ||||||
|  |            long ldb, const char *file, int line) { | ||||||
|  |   double sad = diff(m, n, A, lda, B, ldb); | ||||||
|  |   if (sad <= tol) { | ||||||
|  |     if (!is_self_testing) { | ||||||
|  |       printf("         %g error\n", sad); | ||||||
|  |     } | ||||||
|  |   } else { | ||||||
|  |     show_error(stderr, 16, m, n, A, lda, B, ldb, file, line, sad, tol); | ||||||
|  |     const char *path = "/tmp/openmp_test.log"; | ||||||
|  |     FILE *f = fopen(path, "w"); | ||||||
|  |     if (f) { | ||||||
|  |       show_error(f, 10000, m, n, A, lda, B, ldb, file, line, sad, tol); | ||||||
|  |       printf("see also %s\n", path); | ||||||
|  |     } | ||||||
|  |     exit(1); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #define check(tol, m, n, A, lda, B, ldb) \ | ||||||
|  |   check(tol, m, n, A, lda, B, ldb, __FILE__, __LINE__) | ||||||
|  | 
 | ||||||
|  | long micros(void) { | ||||||
|  |   struct timespec ts; | ||||||
|  |   clock_gettime(CLOCK_REALTIME, &ts); | ||||||
|  |   return ts.tv_sec * 1000000 + (ts.tv_nsec + 999) / 1000; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | #define bench(x)                                                            \ | ||||||
|  |   do {                                                                      \ | ||||||
|  |     long t1 = micros();                                                     \ | ||||||
|  |     for (long i = 0; i < ITERATIONS; ++i) {                                 \ | ||||||
|  |       asm volatile("" ::: "memory");                                        \ | ||||||
|  |       x;                                                                    \ | ||||||
|  |       asm volatile("" ::: "memory");                                        \ | ||||||
|  |     }                                                                       \ | ||||||
|  |     long t2 = micros();                                                     \ | ||||||
|  |     printf("%8" PRId64 " µs %s\n", (t2 - t1 + ITERATIONS - 1) / ITERATIONS, \ | ||||||
|  |            #x);                                                             \ | ||||||
|  |   } while (0) | ||||||
|  | 
 | ||||||
|  | double real01(unsigned long x) {  // (0,1)
 | ||||||
|  |   return 1. / 4503599627370496. * ((x >> 12) + .5); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | double numba(void) {  // (-1,1)
 | ||||||
|  |   return real01(lemur64()) * 2 - 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | template <typename T> | ||||||
|  | void fill(T *A, long n) { | ||||||
|  |   for (long i = 0; i < n; ++i) { | ||||||
|  |     A[i] = numba(); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void test_gemm(long m, long n, long k) { | ||||||
|  |   float *A = new float[m * k]; | ||||||
|  |   float *At = new float[k * m]; | ||||||
|  |   float *B = new float[k * n]; | ||||||
|  |   float *Bt = new float[n * k]; | ||||||
|  |   float *C = new float[m * n]; | ||||||
|  |   float *GOLD = new float[m * n]; | ||||||
|  |   float α = 1; | ||||||
|  |   float β = 0; | ||||||
|  |   fill(A, m * k); | ||||||
|  |   fill(B, k * n); | ||||||
|  |   dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n); | ||||||
|  |   transpose(m, k, A, k, At, m); | ||||||
|  |   transpose(k, n, B, n, Bt, k); | ||||||
|  |   sgemm(0, 0, m, n, k, α, A, k, B, n, β, C, n); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   sgemm(1, 0, m, n, k, α, At, m, B, n, β, C, n); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   sgemm(0, 1, m, n, k, α, A, k, Bt, k, β, C, n); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   sgemm(1, 1, m, n, k, α, At, m, Bt, k, β, C, n); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   delete[] GOLD; | ||||||
|  |   delete[] C; | ||||||
|  |   delete[] Bt; | ||||||
|  |   delete[] B; | ||||||
|  |   delete[] At; | ||||||
|  |   delete[] A; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | void check_gemm_works(void) { | ||||||
|  |   static long kSizes[] = {1, 2, 3, 4, 5, 6, 7, 17, 31, 33, 63, 128, 129}; | ||||||
|  |   is_self_testing = true; | ||||||
|  |   long c = 0; | ||||||
|  |   long N = sizeof(kSizes) / sizeof(kSizes[0]); | ||||||
|  |   for (long i = 0; i < N; ++i) { | ||||||
|  |     long m = kSizes[i]; | ||||||
|  |     for (long j = 0; j < N; ++j) { | ||||||
|  |       long n = kSizes[N - 1 - i]; | ||||||
|  |       for (long k = 0; k < N; ++k) { | ||||||
|  |         long K = kSizes[i]; | ||||||
|  |         if (c++ % 13 == 0) { | ||||||
|  |           printf("testing %2ld %2ld %2ld\r", m, n, K); | ||||||
|  |         } | ||||||
|  |         test_gemm(m, n, K); | ||||||
|  |       } | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   printf("\r"); | ||||||
|  |   is_self_testing = false; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | long m = 2333 / 3; | ||||||
|  | long k = 577 / 3; | ||||||
|  | long n = 713 / 3; | ||||||
|  | 
 | ||||||
|  | void check_sgemm(void) { | ||||||
|  |   float *A = new float[m * k]; | ||||||
|  |   float *At = new float[k * m]; | ||||||
|  |   float *B = new float[k * n]; | ||||||
|  |   float *Bt = new float[n * k]; | ||||||
|  |   float *C = new float[m * n]; | ||||||
|  |   double *GOLD = new double[m * n]; | ||||||
|  |   fill(A, m * k); | ||||||
|  |   fill(B, k * n); | ||||||
|  |   transpose(m, k, A, k, At, m); | ||||||
|  |   transpose(k, n, B, n, Bt, k); | ||||||
|  |   bench(dgemm(0, 0, m, n, k, 1, A, k, B, n, 0, GOLD, n)); | ||||||
|  |   bench(sgemm(0, 0, m, n, k, 1, A, k, B, n, 0, C, n)); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   bench(sgemm(1, 0, m, n, k, 1, At, m, B, n, 0, C, n)); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   bench(sgemm(0, 1, m, n, k, 1, A, k, Bt, k, 0, C, n)); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   bench(sgemm(1, 1, m, n, k, 1, At, m, Bt, k, 0, C, n)); | ||||||
|  |   check(PRECISION, m, n, GOLD, n, C, n); | ||||||
|  |   delete[] GOLD; | ||||||
|  |   delete[] C; | ||||||
|  |   delete[] Bt; | ||||||
|  |   delete[] B; | ||||||
|  |   delete[] At; | ||||||
|  |   delete[] A; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | int main(int argc, char *argv[]) { | ||||||
|  |   check_gemm_works(); | ||||||
|  |   check_sgemm(); | ||||||
|  | } | ||||||
							
								
								
									
										10
									
								
								third_party/nsync/futex.internal.h
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										10
									
								
								third_party/nsync/futex.internal.h
									
										
									
									
										vendored
									
									
								
							|  | @ -4,8 +4,14 @@ | ||||||
| #include "libc/dce.h" | #include "libc/dce.h" | ||||||
| COSMOPOLITAN_C_START_ | COSMOPOLITAN_C_START_ | ||||||
| 
 | 
 | ||||||
| int nsync_futex_wake_(_Atomic(int) *, int, char); | #ifndef __cplusplus | ||||||
| int nsync_futex_wait_(_Atomic(int) *, int, char, const struct timespec *); | #define _FUTEX_ATOMIC(x) _Atomic(x) | ||||||
|  | #else | ||||||
|  | #define _FUTEX_ATOMIC(x) x | ||||||
|  | #endif | ||||||
|  | 
 | ||||||
|  | int nsync_futex_wake_(_FUTEX_ATOMIC(int) *, int, char); | ||||||
|  | int nsync_futex_wait_(_FUTEX_ATOMIC(int) *, int, char, const struct timespec *); | ||||||
| 
 | 
 | ||||||
| COSMOPOLITAN_C_END_ | COSMOPOLITAN_C_END_ | ||||||
| #endif /* NSYNC_FUTEX_INTERNAL_H_ */ | #endif /* NSYNC_FUTEX_INTERNAL_H_ */ | ||||||
|  |  | ||||||
							
								
								
									
										6
									
								
								third_party/openmp/kmp_lock.cpp
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								third_party/openmp/kmp_lock.cpp
									
										
									
									
										vendored
									
									
								
							|  | @ -380,9 +380,9 @@ __kmp_acquire_futex_lock_timed_template(kmp_futex_lock_t *lck, kmp_int32 gtid) { | ||||||
| 
 | 
 | ||||||
|     long rc; |     long rc; | ||||||
| #ifdef __COSMOPOLITAN__ | #ifdef __COSMOPOLITAN__ | ||||||
|     if ((rc = nsync_futex_wait_((_Atomic(int) *)&(lck->lk.poll), poll_val, false, NULL)) != 0) { |     if ((rc = nsync_futex_wait_((int *)&(lck->lk.poll), poll_val, false, NULL)) != 0) { | ||||||
| #else | #else | ||||||
|     if ((rc = syscall(__NR_futex, (_Atomic(int) *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, |     if ((rc = syscall(__NR_futex, (int *)&(lck->lk.poll), FUTEX_WAIT, poll_val, NULL, | ||||||
|                       NULL, 0)) != 0) { |                       NULL, 0)) != 0) { | ||||||
| #endif | #endif | ||||||
|       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " |       KA_TRACE(1000, ("__kmp_acquire_futex_lock: lck:%p, T#%d futex_wait(0x%x) " | ||||||
|  | @ -462,7 +462,7 @@ int __kmp_release_futex_lock(kmp_futex_lock_t *lck, kmp_int32 gtid) { | ||||||
|              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n", |              ("__kmp_release_futex_lock: lck:%p, T#%d futex_wake 1 thread\n", | ||||||
|               lck, gtid)); |               lck, gtid)); | ||||||
| #ifdef __COSMOPOLITAN__ | #ifdef __COSMOPOLITAN__ | ||||||
|     nsync_futex_wake_((_Atomic(int) *)&(lck->lk.poll), 1, false); |     nsync_futex_wake_((int *)&(lck->lk.poll), 1, false); | ||||||
| #else | #else | ||||||
|     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), |     syscall(__NR_futex, &(lck->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), | ||||||
|             NULL, NULL, 0); |             NULL, NULL, 0); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue