mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 06:53:33 +00:00
Make atomics faster on aarch64
This change implements the compiler runtime for ARM v8.1 LSE atomics and gets rid of the mandatory -mno-outline-atomics flag. It can dramatically speed things up on newer ARM CPUs, as indicated by the changed lines in test/libc/thread/footek_test.c. In llamafile, dispatching on the hwcap atomics bit also shaved microseconds off synchronization barriers.
This commit is contained in:
parent
de0cde8def
commit
11d9fb521d
12 changed files with 2053 additions and 71 deletions
2
Makefile
2
Makefile
|
@ -132,7 +132,7 @@ endif
|
|||
|
||||
ifneq ($(findstring aarch64,$(MODE)),)
|
||||
ARCH = aarch64
|
||||
HOSTS ?= pi studio freebsdarm
|
||||
HOSTS ?= pi pi5 studio freebsdarm
|
||||
else
|
||||
ARCH = x86_64
|
||||
HOSTS ?= freebsd rhel7 xnu openbsd netbsd win10
|
||||
|
|
|
@ -310,7 +310,7 @@ SECTIONS {
|
|||
. = ALIGN(__privileged_end > __privileged_start ? CONSTANT(COMMONPAGESIZE) : 0);
|
||||
/*END: morphable code */
|
||||
__privileged_start = .;
|
||||
*(.privileged)
|
||||
*(.privileged .privileged.*)
|
||||
__privileged_end = .;
|
||||
|
||||
KEEP(*(.ape.pad.text))
|
||||
|
|
|
@ -115,14 +115,10 @@ ifeq ($(ARCH), aarch64)
|
|||
# - Cosmopolitan Libc uses x28 for thread-local storage because Apple
|
||||
# forbids us from using tpidr_el0 too.
|
||||
#
|
||||
# - Cosmopolitan currently lacks an implementation of the runtime
|
||||
# libraries needed by the -moutline-atomics flag
|
||||
#
|
||||
DEFAULT_COPTS += \
|
||||
-ffixed-x18 \
|
||||
-ffixed-x28 \
|
||||
-fsigned-char \
|
||||
-mno-outline-atomics
|
||||
-fsigned-char
|
||||
endif
|
||||
|
||||
MATHEMATICAL = \
|
||||
|
|
1919
libc/intrin/aarch64/atomics.S
Normal file
1919
libc/intrin/aarch64/atomics.S
Normal file
File diff suppressed because it is too large
Load diff
32
libc/intrin/armlse.c
Normal file
32
libc/intrin/armlse.c
Normal file
|
@ -0,0 +1,32 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2024 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/getauxval.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/sysv/consts/auxv.h"
|
||||
#include "libc/sysv/consts/hwcap.h"
|
||||
#ifdef __aarch64__
|
||||
|
||||
bool __aarch64_have_lse_atomics;
|
||||
|
||||
// Startup hook, registered as a constructor with priority 1.
// Queries the AT_HWCAP auxiliary value via __getauxval() and stores in
// __aarch64_have_lse_atomics whether the HWCAP_ATOMICS bit is set in it.
// NOTE(review): presumably this flag is what the outline-atomics helpers in
// libc/intrin/aarch64/atomics.S test at runtime; that file is not visible
// here, so confirm before relying on it.
static __attribute__((__constructor__(1))) void __aarch64_atomics_init(void) {
|
||||
struct AuxiliaryValue x = __getauxval(AT_HWCAP);
|
||||
__aarch64_have_lse_atomics = !!(x.value & HWCAP_ATOMICS);  // normalize to 0/1
|
||||
}
|
||||
|
||||
#endif /* __aarch64__ */
|
24
libc/intrin/atomic.c
Normal file
24
libc/intrin/atomic.c
Normal file
|
@ -0,0 +1,24 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2024 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/intrin/atomic.h"
|
||||
|
||||
// Thin wrapper around a weak compare-and-exchange on a long.
//
// Compares *p with *e; if they are equal, w is stored into *p, otherwise the
// value currently in *p is written back to *e. Being the "weak" variant, it
// is allowed to fail spuriously even when *p == *e.
//
// Memory ordering is acq_rel on success and relaxed on failure.
//
// Returns the result of the exchange: true if w was stored, false otherwise.
bool dog(_Atomic(long) *p, long *e, long w) {
|
||||
return atomic_compare_exchange_weak_explicit(p, e, w, memory_order_acq_rel,
|
||||
memory_order_relaxed);
|
||||
}
|
|
@ -354,7 +354,6 @@ static errno_t _pthread_cancel_everyone(void) {
|
|||
*/
|
||||
errno_t pthread_cancel(pthread_t thread) {
|
||||
struct PosixThread *arg;
|
||||
unassert(thread);
|
||||
if ((arg = (struct PosixThread *)thread)) {
|
||||
return _pthread_cancel_single(arg);
|
||||
} else {
|
||||
|
|
|
@ -10,26 +10,26 @@
|
|||
#include <unistd.h>
|
||||
#include "third_party/nsync/futex.internal.h"
|
||||
|
||||
// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
|
||||
// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER IT CAN
|
||||
// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
|
||||
|
||||
// arm fleet
|
||||
// with futexes
|
||||
// 30 threads / 100000 iterations
|
||||
//
|
||||
// 242,604 us real
|
||||
// 4,222,946 us user
|
||||
// 1,079,229 us sys
|
||||
// footek_test on studio.test. 630 µs 17'415 µs 256'782 µs
|
||||
// 1,362,557 us real
|
||||
// 3,232,978 us user
|
||||
// 2,104,824 us sys
|
||||
// footek_test on pi.test. 611 µs 21'708 µs 1'385'129 µs
|
||||
// 1,346,482 us real
|
||||
// 3,370,513 us user
|
||||
// 1,992,383 us sys
|
||||
// footek_test on freebsdarm.test. 427 µs 19'967 µs 1'393'476 µs
|
||||
// 54,183 us real
|
||||
// 84,723 us user
|
||||
// 741,667 us sys
|
||||
// footek_test on studio.test. 609 µs 14'106 µs 65'607 µs
|
||||
// 406,588 us real
|
||||
// 884,696 us user
|
||||
// 720,567 us sys
|
||||
// footek_test on pi5.test. 334 µs 13'398 µs 408'450 µs
|
||||
// 1,253,808 us real
|
||||
// 3,608,426 us user
|
||||
// 1,378,765 us sys
|
||||
// footek_test on freebsdarm.test. 367 µs 16'466 µs 1'287'915 µs
|
||||
// 1,316,058 us real
|
||||
// 3,286,528 us user
|
||||
// 1,738,756 us sys
|
||||
// footek_test on pi.test. 450 µs 16'787 µs 1'338'420 µs
|
||||
|
||||
// arm fleet
|
||||
// without futexes
|
||||
|
@ -106,9 +106,14 @@
|
|||
// 16,265 us sys
|
||||
// footek_test on xnu.test. 98'468 µs 5'242 µs 5'191'724 µs
|
||||
|
||||
#define USE_FUTEX 1
|
||||
#define THREADS 30
|
||||
#define ITERATIONS 30000
|
||||
#define SPIN 1
|
||||
#define FUTEX 2
|
||||
#define NSYNC 3
|
||||
|
||||
#define USE NSYNC
|
||||
|
||||
#define THREADS 10
|
||||
#define ITERATIONS 50000
|
||||
|
||||
#define MUTEX_LOCKED(word) ((word) & 8)
|
||||
#define MUTEX_WAITING(word) ((word) & 16)
|
||||
|
@ -130,7 +135,7 @@ void lock(atomic_int *futex) {
|
|||
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
|
||||
while (word > 0) {
|
||||
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
|
||||
#if USE_FUTEX
|
||||
#if USE == FUTEX
|
||||
nsync_futex_wait_(futex, 2, 0, 0);
|
||||
#endif
|
||||
pthread_setcancelstate(cs, 0);
|
||||
|
@ -142,7 +147,7 @@ void unlock(atomic_int *futex) {
|
|||
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
|
||||
if (word == 2) {
|
||||
atomic_store_explicit(futex, 0, memory_order_release);
|
||||
#if USE_FUTEX
|
||||
#if USE == FUTEX
|
||||
nsync_futex_wake_(futex, 1, 0);
|
||||
#endif
|
||||
}
|
||||
|
@ -154,9 +159,15 @@ pthread_mutex_t g_locker;
|
|||
|
||||
// Thread entry point for the benchmark: increments g_chores ITERATIONS
// times, taking a lock around every increment.
//
// Which lock is used is chosen at compile time by the USE macro: when
// USE == NSYNC the increment is guarded by the pthread mutex g_locker,
// otherwise by the lock()/unlock() pair operating on g_lock.
//
// The arg parameter is not used. Always returns 0.
void *worker(void *arg) {
|
||||
for (int i = 0; i < ITERATIONS; ++i) {
|
||||
#if USE == NSYNC
|
||||
pthread_mutex_lock(&g_locker);
|
||||
++g_chores;
|
||||
pthread_mutex_unlock(&g_locker);
|
||||
#else
|
||||
lock(&g_lock);
|
||||
|
||||
++g_chores;
|
||||
|
||||
unlock(&g_lock);
|
||||
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -186,51 +197,52 @@ int main() {
|
|||
CheckForMemoryLeaks();
|
||||
}
|
||||
|
||||
// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
|
||||
// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
|
||||
|
||||
// x86 fleet
|
||||
// with pthread_mutex_t
|
||||
// 30 threads / 100000 iterations
|
||||
//
|
||||
// 186,976 us real
|
||||
// 43,609 us user
|
||||
// 205,585 us sys
|
||||
// footek_test on freebsd.test. 410 µs 2'054 µs 195'339 µs
|
||||
// 238,902 us real
|
||||
// 235,743 us user
|
||||
// 97,881 us sys
|
||||
// footek_test on rhel7.test. 343 µs 2'339 µs 246'926 µs
|
||||
// 201,285 us real
|
||||
// 249,612 us user
|
||||
// 141,230 us sys
|
||||
// footek_test on xnu.test. 1'960 µs 5'350 µs 265'758 µs
|
||||
// 303,363 us real
|
||||
// 60,000 us user
|
||||
// 410,000 us sys
|
||||
// footek_test on openbsd.test. 545 µs 3'023 µs 326'200 µs
|
||||
// 386,085 us real
|
||||
// 586,455 us user
|
||||
// 466,991 us sys
|
||||
// footek_test on netbsd.test. 344 µs 2'421 µs 413'440 µs
|
||||
// 245,010 us real
|
||||
// 177,702 us real
|
||||
// 183,488 us user
|
||||
// 54,921 us sys
|
||||
// footek_test on rhel7.test. 304 µs 2'225 µs 185'809 µs
|
||||
// 191,346 us real
|
||||
// 43,746 us user
|
||||
// 257,012 us sys
|
||||
// footek_test on freebsd.test. 405 µs 2'186 µs 200'568 µs
|
||||
// 194,344 us real
|
||||
// 228,235 us user
|
||||
// 143,203 us sys
|
||||
// footek_test on xnu.test. 33'207 µs 5'164 µs 220'693 µs
|
||||
// 199,882 us real
|
||||
// 138,178 us user
|
||||
// 329,501 us sys
|
||||
// footek_test on netbsd.test. 350 µs 3'570 µs 262'186 µs
|
||||
// 291,255 us real
|
||||
// 70,000 us user
|
||||
// 440,000 us sys
|
||||
// footek_test on openbsd.test. 628 µs 3'232 µs 342'136 µs
|
||||
// 250,072 us real
|
||||
// 437,500 us user
|
||||
// 140,625 us sys
|
||||
// footek_test on win10.test. 300 µs 18'574 µs 441'225 µs
|
||||
// 93,750 us sys
|
||||
// footek_test on win10.test. 996 µs 10'949 µs 398'435 µs
|
||||
|
||||
// arm fleet
|
||||
// with pthread_mutex_t
|
||||
// 30 threads / 100000 iterations
|
||||
//
|
||||
// 87,132 us real
|
||||
// 183,517 us user
|
||||
// 20,020 us sys
|
||||
// footek_test on studio.test. 560 µs 12'418 µs 92'825 µs
|
||||
// 679,374 us real
|
||||
// 957,678 us user
|
||||
// 605,078 us sys
|
||||
// footek_test on pi.test. 462 µs 16'574 µs 702'833 µs
|
||||
// 902,343 us real
|
||||
// 1,459,706 us user
|
||||
// 781,140 us sys
|
||||
// footek_test on freebsdarm.test. 400 µs 16'261 µs 970'022 µs
|
||||
// 88,681 us real
|
||||
// 163,500 us user
|
||||
// 22,183 us sys
|
||||
// footek_test on studio.test. 651 µs 15'086 µs 98'632 µs
|
||||
// 157,701 us real
|
||||
// 215,597 us user
|
||||
// 46,436 us sys
|
||||
// footek_test on pi5.test. 296 µs 13'222 µs 159'805 µs
|
||||
// 699,863 us real
|
||||
// 1,027,981 us user
|
||||
// 648,353 us sys
|
||||
// footek_test on pi.test. 419 µs 16'716 µs 721'851 µs
|
||||
// 843,858 us real
|
||||
// 1,432,362 us user
|
||||
// 696,613 us sys
|
||||
// footek_test on freebsdarm.test. 349 µs 16'613 µs 876'863 µs
|
||||
|
|
|
@ -245,7 +245,7 @@ static void CheckPrivilegedCrossReferences(void) {
|
|||
if (~shdr->sh_flags & SHF_EXECINSTR)
|
||||
continue; // data reference
|
||||
if ((secname = GetElfString(elf, esize, secstrs, shdr->sh_name)) &&
|
||||
strcmp(".privileged", secname)) {
|
||||
!startswith(secname, ".privileged")) {
|
||||
tinyprint(2, epath,
|
||||
": code in .privileged section "
|
||||
"references symbol '",
|
||||
|
|
|
@ -343,7 +343,7 @@ LDLIBS_X86_64="-lcosmo"
|
|||
|
||||
CRT_AARCH64="$LIB_AARCH64/crt.o"
|
||||
CPPFLAGS_AARCH64="$CPPFLAGS -fsigned-char"
|
||||
CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
|
||||
CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28"
|
||||
LDFLAGS_AARCH64="$LDFLAGS -L$LIB_AARCH64 -L$BIN/../aarch64-linux-cosmo/lib -Wl,-T,$LIB_AARCH64/aarch64.lds -Wl,-z,common-page-size=16384 -Wl,-z,max-page-size=16384"
|
||||
LDLIBS_AARCH64="-lcosmo"
|
||||
|
||||
|
|
|
@ -131,7 +131,7 @@ elif [ x"$ARCH" = x"aarch64" ]; then
|
|||
OBJCOPYFLAGS="-S"
|
||||
PAGESZ=16384
|
||||
CPPFLAGS="$CPPFLAGS -fsigned-char"
|
||||
CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
|
||||
CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28"
|
||||
LDFLAGS="$LDFLAGS -Wl,-T,$LIB/aarch64.lds"
|
||||
else
|
||||
fatal_error "$ARCH: unsupported architecture"
|
||||
|
|
|
@ -79,7 +79,7 @@ o/$(MODE)/tool/hello/hello-pe.ape: \
|
|||
# elf2pe can generate binaries that don't have dll imports
|
||||
o/$(MODE)/tool/hello/life-pe.dbg: \
|
||||
o/$(MODE)/tool/hello/life-pe.o
|
||||
@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain #-Ttext-segment=0x140000000
|
||||
@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain -Ttext-segment=0x140000000
|
||||
o/$(MODE)/tool/hello/life-pe.ape: \
|
||||
o/$(MODE)/tool/hello/life-pe.dbg \
|
||||
o/$(MODE)/tool/build/elf2pe
|
||||
|
|
Loading…
Reference in a new issue