Make atomics faster on aarch64

This change implements the compiler runtime for ARM v8.1 ISE atomics and
gets rid of the mandatory -mno-outline-atomics flag. It can dramatically
speed things up, on newer ARM CPUs, as indicated by the changed lines in
test/libc/thread/footek_test.c. In llamafile dispatching on hwcap atomic
also shaved microseconds off synchronization barriers.
This commit is contained in:
Justine Tunney 2024-08-16 11:05:37 -07:00
parent de0cde8def
commit 11d9fb521d
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
12 changed files with 2053 additions and 71 deletions

View file

@ -10,26 +10,26 @@
#include <unistd.h>
#include "third_party/nsync/futex.internal.h"
// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER THIS IT CAN
// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
// arm fleet
// with futexes
// 30 threads / 100000 iterations
//
// 242,604 us real
// 4,222,946 us user
// 1,079,229 us sys
// footek_test on studio.test. 630 µs 17'415 µs 256'782 µs
// 1,362,557 us real
// 3,232,978 us user
// 2,104,824 us sys
// footek_test on pi.test. 611 µs 21'708 µs 1'385'129 µs
// 1,346,482 us real
// 3,370,513 us user
// 1,992,383 us sys
// footek_test on freebsdarm.test. 427 µs 19'967 µs 1'393'476 µs
// 54,183 us real
// 84,723 us user
// 741,667 us sys
// footek_test on studio.test. 609 µs 14'106 µs 65'607 µs
// 406,588 us real
// 884,696 us user
// 720,567 us sys
// footek_test on pi5.test. 334 µs 13'398 µs 408'450 µs
// 1,253,808 us real
// 3,608,426 us user
// 1,378,765 us sys
// footek_test on freebsdarm.test. 367 µs 16'466 µs 1'287'915 µs
// 1,316,058 us real
// 3,286,528 us user
// 1,738,756 us sys
// footek_test on pi.test. 450 µs 16'787 µs 1'338'420 µs
// arm fleet
// without futexes
@ -106,9 +106,14 @@
// 16,265 us sys
// footek_test on xnu.test. 98'468 µs 5'242 µs 5'191'724 µs
#define USE_FUTEX 1
#define THREADS 30
#define ITERATIONS 30000
#define SPIN 1
#define FUTEX 2
#define NSYNC 3
#define USE NSYNC
#define THREADS 10
#define ITERATIONS 50000
#define MUTEX_LOCKED(word) ((word) & 8)
#define MUTEX_WAITING(word) ((word) & 16)
@ -130,7 +135,7 @@ void lock(atomic_int *futex) {
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
#if USE_FUTEX
#if USE == FUTEX
nsync_futex_wait_(futex, 2, 0, 0);
#endif
pthread_setcancelstate(cs, 0);
@ -142,7 +147,7 @@ void unlock(atomic_int *futex) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
#if USE_FUTEX
#if USE == FUTEX
nsync_futex_wake_(futex, 1, 0);
#endif
}
@ -154,9 +159,15 @@ pthread_mutex_t g_locker;
void *worker(void *arg) {
for (int i = 0; i < ITERATIONS; ++i) {
#if USE == NSYNC
pthread_mutex_lock(&g_locker);
++g_chores;
pthread_mutex_unlock(&g_locker);
#else
lock(&g_lock);
++g_chores;
unlock(&g_lock);
#endif
}
return 0;
}
@ -186,51 +197,52 @@ int main() {
CheckForMemoryLeaks();
}
// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
// x86 fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 186,976 us real
// 43,609 us user
// 205,585 us sys
// footek_test on freebsd.test. 410 µs 2'054 µs 195'339 µs
// 238,902 us real
// 235,743 us user
// 97,881 us sys
// footek_test on rhel7.test. 343 µs 2'339 µs 246'926 µs
// 201,285 us real
// 249,612 us user
// 141,230 us sys
// footek_test on xnu.test. 1'960 µs 5'350 µs 265'758 µs
// 303,363 us real
// 60,000 us user
// 410,000 us sys
// footek_test on openbsd.test. 545 µs 3'023 µs 326'200 µs
// 386,085 us real
// 586,455 us user
// 466,991 us sys
// footek_test on netbsd.test. 344 µs 2'421 µs 413'440 µs
// 245,010 us real
// 177,702 us real
// 183,488 us user
// 54,921 us sys
// footek_test on rhel7.test. 304 µs 2'225 µs 185'809 µs
// 191,346 us real
// 43,746 us user
// 257,012 us sys
// footek_test on freebsd.test. 405 µs 2'186 µs 200'568 µs
// 194,344 us real
// 228,235 us user
// 143,203 us sys
// footek_test on xnu.test. 33'207 µs 5'164 µs 220'693 µs
// 199,882 us real
// 138,178 us user
// 329,501 us sys
// footek_test on netbsd.test. 350 µs 3'570 µs 262'186 µs
// 291,255 us real
// 70,000 us user
// 440,000 us sys
// footek_test on openbsd.test. 628 µs 3'232 µs 342'136 µs
// 250,072 us real
// 437,500 us user
// 140,625 us sys
// footek_test on win10.test. 300 µs 18'574 µs 441'225 µs
// 93,750 us sys
// footek_test on win10.test. 996 µs 10'949 µs 398'435 µs
// arm fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 87,132 us real
// 183,517 us user
// 20,020 us sys
// footek_test on studio.test. 560 µs 12'418 µs 92'825 µs
// 679,374 us real
// 957,678 us user
// 605,078 us sys
// footek_test on pi.test. 462 µs 16'574 µs 702'833 µs
// 902,343 us real
// 1,459,706 us user
// 781,140 us sys
// footek_test on freebsdarm.test. 400 µs 16'261 µs 970'022 µs
// 88,681 us real
// 163,500 us user
// 22,183 us sys
// footek_test on studio.test. 651 µs 15'086 µs 98'632 µs
// 157,701 us real
// 215,597 us user
// 46,436 us sys
// footek_test on pi5.test. 296 µs 13'222 µs 159'805 µs
// 699,863 us real
// 1,027,981 us user
// 648,353 us sys
// footek_test on pi.test. 419 µs 16'716 µs 721'851 µs
// 843,858 us real
// 1,432,362 us user
// 696,613 us sys
// footek_test on freebsdarm.test. 349 µs 16'613 µs 876'863 µs