Make atomics faster on aarch64

This change implements the compiler runtime for ARMv8.1 LSE atomics and
removes the previously mandatory -mno-outline-atomics flag. It can
dramatically speed things up on newer ARM CPUs, as indicated by the
changed lines in test/libc/thread/footek_test.c. In llamafile, dispatching
on the atomics hwcap also shaved microseconds off synchronization barriers.
This commit is contained in:
Justine Tunney 2024-08-16 11:05:37 -07:00
parent de0cde8def
commit 11d9fb521d
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
12 changed files with 2053 additions and 71 deletions

View file

@ -132,7 +132,7 @@ endif
ifneq ($(findstring aarch64,$(MODE)),)
ARCH = aarch64
HOSTS ?= pi studio freebsdarm
HOSTS ?= pi pi5 studio freebsdarm
else
ARCH = x86_64
HOSTS ?= freebsd rhel7 xnu openbsd netbsd win10

View file

@ -310,7 +310,7 @@ SECTIONS {
. = ALIGN(__privileged_end > __privileged_start ? CONSTANT(COMMONPAGESIZE) : 0);
/*END: morphable code */
__privileged_start = .;
*(.privileged)
*(.privileged .privileged.*)
__privileged_end = .;
KEEP(*(.ape.pad.text))

View file

@ -115,14 +115,10 @@ ifeq ($(ARCH), aarch64)
# - Cosmopolitan Libc uses x28 for thread-local storage because Apple
# forbids us from using tpidr_el0 too.
#
# - Cosmopolitan currently lacks an implementation of the runtime
# libraries needed by the -moutline-atomics flag
#
DEFAULT_COPTS += \
-ffixed-x18 \
-ffixed-x28 \
-fsigned-char \
-mno-outline-atomics
-fsigned-char
endif
MATHEMATICAL = \

File diff suppressed because it is too large Load diff

32
libc/intrin/armlse.c Normal file
View file

@ -0,0 +1,32 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/getauxval.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/sysv/consts/hwcap.h"
#ifdef __aarch64__
bool __aarch64_have_lse_atomics;
static __attribute__((__constructor__(1))) void __aarch64_atomics_init(void) {
struct AuxiliaryValue x = __getauxval(AT_HWCAP);
__aarch64_have_lse_atomics = !!(x.value & HWCAP_ATOMICS);
}
#endif /* __aarch64__ */

24
libc/intrin/atomic.c Normal file
View file

@ -0,0 +1,24 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/atomic.h"
/**
 * Performs a weak compare-and-swap on `*p`.
 *
 * If `*p` holds the value in `*e`, then `w` is stored into `*p` using
 * acquire-release ordering. Otherwise the value observed in `*p` is
 * written back to `*e` using relaxed ordering. Being the weak variant,
 * it is permitted to fail even when the values match, so callers that
 * need the store to happen should retry in a loop.
 *
 * @param p is the atomic word to update
 * @param e points to the expected value, updated on failure
 * @param w is the value to store on success
 * @return true if the exchange took place
 */
bool dog(_Atomic(long) *p, long *e, long w) {
  bool swapped = atomic_compare_exchange_weak_explicit(
      p, e, w, memory_order_acq_rel, memory_order_relaxed);
  return swapped;
}

View file

@ -354,7 +354,6 @@ static errno_t _pthread_cancel_everyone(void) {
*/
errno_t pthread_cancel(pthread_t thread) {
struct PosixThread *arg;
unassert(thread);
if ((arg = (struct PosixThread *)thread)) {
return _pthread_cancel_single(arg);
} else {

View file

@ -10,26 +10,26 @@
#include <unistd.h>
#include "third_party/nsync/futex.internal.h"
// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER THIS CAN
// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
// arm fleet
// with futexes
// 30 threads / 100000 iterations
//
// 242,604 us real
// 4,222,946 us user
// 1,079,229 us sys
// footek_test on studio.test. 630 µs 17'415 µs 256'782 µs
// 1,362,557 us real
// 3,232,978 us user
// 2,104,824 us sys
// footek_test on pi.test. 611 µs 21'708 µs 1'385'129 µs
// 1,346,482 us real
// 3,370,513 us user
// 1,992,383 us sys
// footek_test on freebsdarm.test. 427 µs 19'967 µs 1'393'476 µs
// 54,183 us real
// 84,723 us user
// 741,667 us sys
// footek_test on studio.test. 609 µs 14'106 µs 65'607 µs
// 406,588 us real
// 884,696 us user
// 720,567 us sys
// footek_test on pi5.test. 334 µs 13'398 µs 408'450 µs
// 1,253,808 us real
// 3,608,426 us user
// 1,378,765 us sys
// footek_test on freebsdarm.test. 367 µs 16'466 µs 1'287'915 µs
// 1,316,058 us real
// 3,286,528 us user
// 1,738,756 us sys
// footek_test on pi.test. 450 µs 16'787 µs 1'338'420 µs
// arm fleet
// without futexes
@ -106,9 +106,14 @@
// 16,265 us sys
// footek_test on xnu.test. 98'468 µs 5'242 µs 5'191'724 µs
#define USE_FUTEX 1
#define THREADS 30
#define ITERATIONS 30000
#define SPIN 1
#define FUTEX 2
#define NSYNC 3
#define USE NSYNC
#define THREADS 10
#define ITERATIONS 50000
#define MUTEX_LOCKED(word) ((word) & 8)
#define MUTEX_WAITING(word) ((word) & 16)
@ -130,7 +135,7 @@ void lock(atomic_int *futex) {
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
#if USE_FUTEX
#if USE == FUTEX
nsync_futex_wait_(futex, 2, 0, 0);
#endif
pthread_setcancelstate(cs, 0);
@ -142,7 +147,7 @@ void unlock(atomic_int *futex) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
#if USE_FUTEX
#if USE == FUTEX
nsync_futex_wake_(futex, 1, 0);
#endif
}
@ -154,9 +159,15 @@ pthread_mutex_t g_locker;
/**
 * Thread body that increments `g_chores` once per pass, for
 * `ITERATIONS` passes in total.
 *
 * Each increment is guarded by a lock chosen at compile time: when
 * `USE` equals `NSYNC` the `g_locker` pthread mutex is used, otherwise
 * the `lock()` / `unlock()` pair is applied to `g_lock`.
 *
 * @param arg is unused
 * @return always 0
 */
void *worker(void *arg) {
  for (int left = ITERATIONS; left > 0; --left) {
#if USE != NSYNC
    lock(&g_lock);
    ++g_chores;
    unlock(&g_lock);
#else
    pthread_mutex_lock(&g_locker);
    ++g_chores;
    pthread_mutex_unlock(&g_locker);
#endif
  }
  return 0;
}
@ -186,51 +197,52 @@ int main() {
CheckForMemoryLeaks();
}
// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
// x86 fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 186,976 us real
// 43,609 us user
// 205,585 us sys
// footek_test on freebsd.test. 410 µs 2'054 µs 195'339 µs
// 238,902 us real
// 235,743 us user
// 97,881 us sys
// footek_test on rhel7.test. 343 µs 2'339 µs 246'926 µs
// 201,285 us real
// 249,612 us user
// 141,230 us sys
// footek_test on xnu.test. 1'960 µs 5'350 µs 265'758 µs
// 303,363 us real
// 60,000 us user
// 410,000 us sys
// footek_test on openbsd.test. 545 µs 3'023 µs 326'200 µs
// 386,085 us real
// 586,455 us user
// 466,991 us sys
// footek_test on netbsd.test. 344 µs 2'421 µs 413'440 µs
// 245,010 us real
// 177,702 us real
// 183,488 us user
// 54,921 us sys
// footek_test on rhel7.test. 304 µs 2'225 µs 185'809 µs
// 191,346 us real
// 43,746 us user
// 257,012 us sys
// footek_test on freebsd.test. 405 µs 2'186 µs 200'568 µs
// 194,344 us real
// 228,235 us user
// 143,203 us sys
// footek_test on xnu.test. 33'207 µs 5'164 µs 220'693 µs
// 199,882 us real
// 138,178 us user
// 329,501 us sys
// footek_test on netbsd.test. 350 µs 3'570 µs 262'186 µs
// 291,255 us real
// 70,000 us user
// 440,000 us sys
// footek_test on openbsd.test. 628 µs 3'232 µs 342'136 µs
// 250,072 us real
// 437,500 us user
// 140,625 us sys
// footek_test on win10.test. 300 µs 18'574 µs 441'225 µs
// 93,750 us sys
// footek_test on win10.test. 996 µs 10'949 µs 398'435 µs
// arm fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 87,132 us real
// 183,517 us user
// 20,020 us sys
// footek_test on studio.test. 560 µs 12'418 µs 92'825 µs
// 679,374 us real
// 957,678 us user
// 605,078 us sys
// footek_test on pi.test. 462 µs 16'574 µs 702'833 µs
// 902,343 us real
// 1,459,706 us user
// 781,140 us sys
// footek_test on freebsdarm.test. 400 µs 16'261 µs 970'022 µs
// 88,681 us real
// 163,500 us user
// 22,183 us sys
// footek_test on studio.test. 651 µs 15'086 µs 98'632 µs
// 157,701 us real
// 215,597 us user
// 46,436 us sys
// footek_test on pi5.test. 296 µs 13'222 µs 159'805 µs
// 699,863 us real
// 1,027,981 us user
// 648,353 us sys
// footek_test on pi.test. 419 µs 16'716 µs 721'851 µs
// 843,858 us real
// 1,432,362 us user
// 696,613 us sys
// footek_test on freebsdarm.test. 349 µs 16'613 µs 876'863 µs

View file

@ -245,7 +245,7 @@ static void CheckPrivilegedCrossReferences(void) {
if (~shdr->sh_flags & SHF_EXECINSTR)
continue; // data reference
if ((secname = GetElfString(elf, esize, secstrs, shdr->sh_name)) &&
strcmp(".privileged", secname)) {
!startswith(secname, ".privileged")) {
tinyprint(2, epath,
": code in .privileged section "
"references symbol '",

View file

@ -343,7 +343,7 @@ LDLIBS_X86_64="-lcosmo"
CRT_AARCH64="$LIB_AARCH64/crt.o"
CPPFLAGS_AARCH64="$CPPFLAGS -fsigned-char"
CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28"
LDFLAGS_AARCH64="$LDFLAGS -L$LIB_AARCH64 -L$BIN/../aarch64-linux-cosmo/lib -Wl,-T,$LIB_AARCH64/aarch64.lds -Wl,-z,common-page-size=16384 -Wl,-z,max-page-size=16384"
LDLIBS_AARCH64="-lcosmo"

View file

@ -131,7 +131,7 @@ elif [ x"$ARCH" = x"aarch64" ]; then
OBJCOPYFLAGS="-S"
PAGESZ=16384
CPPFLAGS="$CPPFLAGS -fsigned-char"
CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28"
LDFLAGS="$LDFLAGS -Wl,-T,$LIB/aarch64.lds"
else
fatal_error "$ARCH: unsupported architecture"

View file

@ -79,7 +79,7 @@ o/$(MODE)/tool/hello/hello-pe.ape: \
# elf2pe can generate binaries that don't have dll imports
o/$(MODE)/tool/hello/life-pe.dbg: \
o/$(MODE)/tool/hello/life-pe.o
@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain #-Ttext-segment=0x140000000
@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain -Ttext-segment=0x140000000
o/$(MODE)/tool/hello/life-pe.ape: \
o/$(MODE)/tool/hello/life-pe.dbg \
o/$(MODE)/tool/build/elf2pe