Make improvements

- Let OpenMP be usable via cosmocc
- Let libunwind be usable via cosmocc
- Make X86_HAVE(AVXVNNI) work correctly
- Avoid using MAP_GROWSDOWN on qemu-aarch64
- Introduce in6addr_any and in6addr_loopback
- Have thread stacks use MAP_GROWSDOWN by default
- Ask OpenMP to not use filesystem to manage threads
- Make NI_MAXHOST and NI_MAXSERV available w/o _GNU_SOURCE
This commit is contained in:
Justine Tunney 2024-01-29 15:45:10 -08:00
parent 5f8e9f14c1
commit 369aebfc48
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
36 changed files with 416 additions and 80 deletions

View file

@@ -68,7 +68,7 @@ void matmul(long m, long n, long k, const T *A, long sa, const T *B, long sb,
 }
 template <long BM, long BN, typename T>
-void sgemmk(long k, const T *A, long sa, const T *B, long sb, T *C, long sc) {
+void gemmk(long k, const T *A, long sa, const T *B, long sb, T *C, long sc) {
 T S[BM][BN] = {0};
 for (long l = 0; l < k; ++l) {
 for (long i = 0; i < BM; ++i) {
@@ -86,12 +86,12 @@ void sgemmk(long k, const T *A, long sa, const T *B, long sb, T *C, long sc) {
 // (m×k)ᵀ * k×n → m×n
 template <long BM, long BN, typename T>
-void sgemm(long m, long n, long k, const T *A, long sa, const T *B, long sb,
-T *C, long sc) {
+void gemm(long m, long n, long k, const T *A, long sa, const T *B, long sb,
+T *C, long sc) {
 #pragma omp parallel for collapse(2)
 for (long i = 0; i < m; i += BM) {
 for (long j = 0; j < n; j += BN) {
-sgemmk<BM, BN>(k, A + i, sa, B + j, sb, C + sc * i + j, sc);
+gemmk<BM, BN>(k, A + i, sa, B + j, sb, C + sc * i + j, sc);
 }
 }
 }
@@ -221,7 +221,7 @@ void check_transposed_blocking_gemm_is_ok(void) {
 bench(matmul(m, n, k, A, k, B, n, C, n));
 float *At = new float[k * m];
 bench(transpose(m, k, A, k, At, m));
-bench((sgemm<8, 4>(m, n, k, At, m, B, n, D, n)));
+bench((gemm<8, 4>(m, n, k, At, m, B, n, D, n)));
 check(FLAWLESS, m, n, C, n, D, n);
 delete[] At;
 delete[] D;