cosmopolitan/test/libc/thread/footek_test.c
Justine Tunney 3c61a541bd
Introduce pthread_condattr_setclock()
This is one of the few POSIX APIs that was missing. It lets you choose a
monotonic clock for your condition variables. This might improve perf on
some platforms. It might also grant more flexibility with NTP configs. I
know Qt is one project that believes it needs this. To introduce this, I
needed to change some the *NSYNC APIs, to support passing a clock param.
There's also new benchmarks, demonstrating Cosmopolitan's supremacy over
many libc implementations when it comes to mutex performance. Cygwin has
an alarmingly bad pthread_mutex_t implementation. It is so bad that they
would have been significantly better off if they'd used naive spinlocks.
2024-09-02 23:45:42 -07:00

355 lines
9.6 KiB
C

#define USE POSIX
#define ITERATIONS 50000
#define THREADS 10
// #define ITERATIONS 100000
// #define THREADS 30
#define SPIN 1
#define FUTEX 2
#define POSIX 3
#ifdef __COSMOPOLITAN__
#include <cosmo.h>
#include "third_party/nsync/futex.internal.h"
#endif
#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#ifdef __linux__
#include <linux/futex.h>
#include <sys/syscall.h>
static inline long nsync_futex_wait_(atomic_int *uaddr, int val, char pshare,
int clock,
const struct timespec *timeout) {
return syscall(SYS_futex, uaddr, pshare ? FUTEX_WAIT : FUTEX_WAIT_PRIVATE,
val, timeout, NULL, 0);
}
static inline long nsync_futex_wake_(atomic_int *uaddr, int num_to_wake,
char pshare) {
return syscall(SYS_futex, uaddr, pshare ? FUTEX_WAKE : FUTEX_WAKE_PRIVATE,
num_to_wake, NULL, NULL, 0);
}
#endif
// x86 fleet
// with spin lock
// 30 threads / 100000 iterations
//
// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX
// 1,570,224 us real
// 42,690,880 us user
// 1,999 us sys
//
// footek_test on rhel7.test. 423 µs 2'638 µs 912'241 µs
// 897,815 us real
// 1,763,705 us user
// 9,696 us sys
//
// footek_test on xnu.test. 98'468 µs 5'242 µs 5'191'724 µs
// 4,225,726 us real
// 16,679,456 us user
// 16,265 us sys
//
// footek_test on freebsd.test. 690 µs 3'011 µs 2'925'997 µs
// 2,916,033 us real
// 17,236,103 us user
// 0 us sys
//
// footek_test on netbsd.test. 1'151 µs 2'634 µs 1'014'867 µs
// 790,332 us real
// 2,359,967 us user
// 0 us sys
//
// footek_test on openbsd.test. 557 µs 3'020 µs 2'554'648 µs
// 2,332,724 us real
// 9,150,000 us user
// 10,000 us sys
//
// footek_test on win10.test. 962 µs 9'698 µs 2'751'905 µs
// 2,528,863 us real
// 56,546,875 us user
// 1,671,875 us sys
// x86 fleet
// with futexes
// 30 threads / 100000 iterations
//
// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX
// 100,746 us real
// 234,451 us user
// 2,638,333 us sys
//
// footek_test on rhel7.test. 376 µs 2'259 µs 153'024 µs
// 146,015 us real
// 169,427 us user
// 68,939 us sys
//
// footek_test on xnu.test. 11'143 µs 9'159 µs 164'865 µs
// 144,917 us real
// 383,317 us user
// 191,203 us sys
//
// footek_test on freebsd.test. 394 µs 2'165 µs 256'227 µs
// 244,286 us real
// 405,395 us user
// 956,122 us sys
//
// footek_test on netbsd.test. 502 µs 2'020 µs 261'895 µs
// 209,095 us real
// 616,634 us user
// 9,945 us sys
//
// footek_test on openbsd.test. 457 µs 2'737 µs 396'342 µs
// 344,876 us real
// 50,000 us user
// 1,240,000 us sys
//
// footek_test on win10.test. 462 µs 59'528 µs 1'348'265 µs
// 1,193,906 us real
// 17,546,875 us user
// 3,000,000 us sys
// x86 fleet
// with posix
// 30 threads / 100000 iterations
//
// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (glibc)
// 111,560 us real
// 153,985 us user
// 2,988,121 us sys
//
// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (musl)
// 392,765 us real
// 1,885,558 us user
// 9,667,865 us sys
//
// footek_test on Linux 6.8 AMD Ryzen Threadripper PRO 7995WX (cosmo)
// 40,965 us real
// 47,168 us user
// 25,398 us sys
//
// footek_test on rhel7.test. 683 µs 1'340 µs 105'977 µs
// 101,934 us real
// 104,771 us user
// 4,068 us sys
//
// footek_test on xnu.test. 2'054 µs 5'352 µs 210'306 µs
// 181,540 us real
// 216,236 us user
// 127,344 us sys
//
// footek_test on freebsd.test. (cosmo)
// 126,803 us real
// 3,100 us user
// 176,744 us sys
//
// footek_test on freebsd.test. (freebsd libc)
// 219,073 us real
// 158,103 us user
// 1,146,252 us sys
//
// footek_test on netbsd.test. 350 µs 3'570 µs 262'186 µs
// 199,882 us real
// 138,178 us user
// 329,501 us sys
//
// footek_test on openbsd.test. (cosmo)
// 138,619 us real
// 30,000 us user
// 110,000 us sys
//
// footek_test on openbsd.test. (openbsd libc)
// 385,431 us real
// 80,000 us user
// 1,350,000 us sys
//
// footek_test on win10.test. (cosmo)
// 156,382 us real
// 312,500 us user
// 31,250 us sys
//
// footek_test on win10.test. (cygwin)
// 9,334,610 us real
// 1,562,000 us user
// 6,093,000 us sys
// arm fleet
// with spin lock
// 30 threads / 100000 iterations
//
// footek_test on studio.test. 961 µs 12'907 µs 1'287'983 µs
// 1,282,084 us real
// 29,359,582 us user
// 34,553 us sys
//
// footek_test on pi.test. 459 µs 16'376 µs 4'095'512 µs
// 4,070,988 us real
// 16,203,990 us user
// 7,999 us sys
//
// footek_test on freebsdarm.test. 502 µs 16'446 µs 7'051'545 µs
// 7,012,493 us real
// 27,936,725 us user
// 7,871 us sys
// arm fleet
// with futexes
// 30 threads / 100000 iterations
//
// footek_test on studio.test. 585 µs 13'597 µs 57'473 µs
// 46,481 us real
// 68,745 us user
// 586,871 us sys
//
// footek_test on pi5.test. 335 µs 13'034 µs 432'358 µs
// 389,619 us real
// 839,848 us user
// 679,112 us sys
//
// footek_test on pi.test. 479 µs 16'539 µs 476'395 µs
// 463,799 us real
// 1,259,267 us user
// 547,681 us sys
//
// footek_test on freebsdarm.test. 364 µs 16'898 µs 1'288'594 µs
// 1,256,134 us real
// 3,770,473 us user
// 1,214,755 us sys
// arm fleet
// with posix
// 30 threads / 100000 iterations
//
// footek_test on Apple M2 Ultra (Apple Libc)
// 45,443 us real
// 30,201 us user
// 864,650 us sys
//
// footek_test on Apple M2 Ultra (Cosmo Libc)
// 65,118 us real
// 77,891 us user
// 1,023,575 us sys
//
// footek_test on pi5.test. 407 µs 12'661 µs 198'133 µs
// 152,791 us real
// 143,678 us user
// 14,736 us sys
//
// footek_test on studio.test. 463 µs 13'286 µs 234'742 µs
// 227,916 us real
// 294,162 us user
// 155,062 us sys
//
// footek_test on pi.test. 374 µs 15'720 µs 249'245 µs
// 233,504 us real
// 301,072 us user
// 187,153 us sys
//
// footek_test on freebsdarm.test. 328 µs 16'614 µs 918'647 µs
// 877,124 us real
// 1,377,338 us user
// 798,230 us sys
#define MUTEX_LOCKED(word) ((word) & 8)
#define MUTEX_WAITING(word) ((word) & 16)
#define MUTEX_LOCK(word) ((word) | 8)
#define MUTEX_SET_WAITING(word) ((word) | 16)
#define MUTEX_UNLOCK(word) ((word) & ~(8 | 16))
void lock(atomic_int *futex) {
int word, cs;
for (int i = 0; i < 4; ++i) {
word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
}
if (word == 1)
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
#if USE == FUTEX
nsync_futex_wait_(futex, 2, 0, 0, 0);
#endif
pthread_setcancelstate(cs, 0);
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
}
}
void unlock(atomic_int *futex) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
#if USE == FUTEX
nsync_futex_wake_(futex, 1, 0);
#endif
}
}
int g_chores;
atomic_int g_lock;
pthread_mutex_t g_locker = PTHREAD_MUTEX_INITIALIZER;
void *worker(void *arg) {
for (int i = 0; i < ITERATIONS; ++i) {
#if USE == POSIX
pthread_mutex_lock(&g_locker);
++g_chores;
pthread_mutex_unlock(&g_locker);
#else
lock(&g_lock);
++g_chores;
unlock(&g_lock);
#endif
}
return 0;
}
struct timeval tub(struct timeval a, struct timeval b) {
a.tv_sec -= b.tv_sec;
if (a.tv_usec < b.tv_usec) {
a.tv_usec += 1000000;
a.tv_sec--;
}
a.tv_usec -= b.tv_usec;
return a;
}
long tomicros(struct timeval x) {
return x.tv_sec * 1000000ul + x.tv_usec;
}
int main() {
struct timeval start;
gettimeofday(&start, 0);
pthread_t th[THREADS];
for (int i = 0; i < THREADS; ++i)
pthread_create(&th[i], 0, worker, 0);
for (int i = 0; i < THREADS; ++i)
pthread_join(th[i], 0);
assert(g_chores == THREADS * ITERATIONS);
struct rusage ru;
struct timeval end;
gettimeofday(&end, 0);
getrusage(RUSAGE_SELF, &ru);
printf("%16ld us real\n"
"%16ld us user\n"
"%16ld us sys\n",
tomicros(tub(end, start)), //
tomicros(ru.ru_utime), //
tomicros(ru.ru_stime));
#ifdef __COSMOPOLITAN__
CheckForMemoryLeaks();
#endif
}