diff --git a/Makefile b/Makefile
index dcc3278ff..c0c55c837 100644
--- a/Makefile
+++ b/Makefile
@@ -132,7 +132,7 @@ endif
 ifneq ($(findstring aarch64,$(MODE)),)
 ARCH = aarch64
-HOSTS ?= pi studio freebsdarm
+HOSTS ?= pi pi5 studio freebsdarm
 else
 ARCH = x86_64
 HOSTS ?= freebsd rhel7 xnu openbsd netbsd win10
diff --git a/ape/ape.lds b/ape/ape.lds
index 4e6db724a..4bf0f0fd8 100644
--- a/ape/ape.lds
+++ b/ape/ape.lds
@@ -310,7 +310,7 @@ SECTIONS {
   . = ALIGN(__privileged_end > __privileged_start ? CONSTANT(COMMONPAGESIZE) : 0);
   /*END: morphable code */
   __privileged_start = .;
-  *(.privileged)
+  *(.privileged .privileged.*)
   __privileged_end = .;
   KEEP(*(.ape.pad.text))
diff --git a/build/definitions.mk b/build/definitions.mk
index 774983244..703a5c381 100644
--- a/build/definitions.mk
+++ b/build/definitions.mk
@@ -115,14 +115,10 @@ ifeq ($(ARCH), aarch64)
 # - Cosmopolitan Libc uses x28 for thread-local storage because Apple
 #   forbids us from using tpidr_el0 too.
 #
-# - Cosmopolitan currently lacks an implementation of the runtime
-#   libraries needed by the -moutline-atomics flag
-#
 DEFAULT_COPTS += \
 	-ffixed-x18 \
 	-ffixed-x28 \
-	-fsigned-char \
-	-mno-outline-atomics
+	-fsigned-char
 endif
 
 MATHEMATICAL = \
diff --git a/libc/intrin/aarch64/atomics.S b/libc/intrin/aarch64/atomics.S
new file mode 100644
index 000000000..17bc04fc3
--- /dev/null
+++ b/libc/intrin/aarch64/atomics.S
@@ -0,0 +1,1919 @@
+// Copyright 2024 Justine Alexandra Roberts Tunney
+//
+// Permission to use, copy, modify, and/or distribute this software for
+// any purpose with or without fee is hereby granted, provided that the
+// above copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+// PERFORMANCE OF THIS SOFTWARE.
+
+#include "libc/macros.h"
+
+// aarch64 atomics compiler runtime
+//
+// armv8.1 introduced atomic instructions that go considerably faster
+// than load/store exclusive loops. you can pass the -mno-outline-atomics
+// flag to the compiler to avoid this runtime, but the code it generates
+// will then be slower on armv8.1+ hardware.
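+//
+// for example, given an int pointer p, the expression
+//
+//     atomic_fetch_add_explicit(p, 1, memory_order_acq_rel)
+//
+// is lowered by the compiler into a call like
+//
+//     __aarch64_ldadd4_acq_rel(1, p)
+//
+// with the operand in w0 and the address in x1. that helper executes a
+// single ldaddal instruction on armv8.1+ machines and otherwise falls
+// back to a load-exclusive / store-exclusive retry loop.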
+
+.arch	armv8-a+lse
+
+.macro	.prvfn	name
+	.privileged
+	.balign	16
+\name:
+.endm
+
+.macro	.begfn	name
+	.section .text.\name,"ax",%progbits
+	.balign	16
+	.ftrace1
+\name:
+	.ftrace2
+.endm
+
+.macro	jnatom	label
+	adrp	x16,__aarch64_have_lse_atomics
+	ldrb	w16,[x16,:lo12:__aarch64_have_lse_atomics]
+	cbz	w16,\label
+.endm
+
+
+.begfn	__aarch64_swp1_relax
+	jnatom	1f
+	swpb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	stxrb	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp1_relax,globl
+
+.begfn	__aarch64_swp1_acq
+	jnatom	1f
+	swpab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	stxrb	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp1_acq,globl
+
+.begfn	__aarch64_swp1_rel
+	jnatom	1f
+	swplb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	stlxrb	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp1_rel,globl
+
+.begfn	__aarch64_swp1_acq_rel
+	jnatom	1f
+	swpalb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	stlxrb	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp1_acq_rel,globl
+
+.begfn	__aarch64_swp1_sync
+	jnatom	1f
+	swpab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	stxrb	w17,w16,[x1]
+	cbnz	w17,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_swp1_sync,globl
+
+
+.begfn	__aarch64_cas1_relax
+	jnatom	1f
+	casb	w0,w1,[x2]
+	ret
+1:	uxtb	w16,w0
+0:	ldxrb	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxrb	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas1_relax,globl
+
+.begfn	__aarch64_cas1_acq
+	jnatom	1f
+	casab	w0,w1,[x2]
+	ret
+1:	uxtb	w16,w0
+0:	ldaxrb	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxrb	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas1_acq,globl
+
+.begfn	__aarch64_cas1_rel
+	jnatom	1f
+	caslb	w0,w1,[x2]
+	ret
+1:	uxtb	w16,w0
+0:	ldxrb	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrb	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas1_rel,globl
+
+.begfn	__aarch64_cas1_acq_rel
+	jnatom	1f
+	casalb	w0,w1,[x2]
+	ret
+1:	uxtb	w16,w0
+0:	ldaxrb	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrb	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas1_acq_rel,globl
+
+.begfn	__aarch64_cas1_sync
+	jnatom	1f
+	casalb	w0,w1,[x2]
+	ret
+1:	uxtb	w16,w0
+0:	ldxrb	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrb	w17,w1,[x2]
+	cbnz	w17,0b
+1:	dmb	ish
+	ret
+.endfn	__aarch64_cas1_sync,globl
+
+
+.begfn	__aarch64_ldadd1_relax
+	jnatom	1f
+	ldaddb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	add	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd1_relax,globl
+
+.begfn	__aarch64_ldadd1_acq
+	jnatom	1f
+	ldaddab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	add	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd1_acq,globl
+
+.begfn	__aarch64_ldadd1_rel
+	jnatom	1f
+	ldaddlb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	add	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd1_rel,globl
+
+.begfn	__aarch64_ldadd1_acq_rel
+	jnatom	1f
+	ldaddalb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	add	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd1_acq_rel,globl
+
+.begfn	__aarch64_ldadd1_sync
+	jnatom	1f
+	ldaddalb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	add	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldadd1_sync,globl
+
+
+.begfn	__aarch64_ldset1_relax
+	jnatom	1f
+	ldsetb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	orr	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset1_relax,globl
+
+.begfn	__aarch64_ldset1_acq
+	jnatom	1f
+	ldsetab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	orr	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset1_acq,globl
+
+.begfn	__aarch64_ldset1_rel
+	jnatom	1f
+	ldsetlb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	orr	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset1_rel,globl
+
+.begfn	__aarch64_ldset1_acq_rel
+	jnatom	1f
+	ldsetalb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	orr	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset1_acq_rel,globl
+
+.begfn	__aarch64_ldset1_sync
+	jnatom	1f
+	ldsetalb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	orr	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldset1_sync,globl
+
+
+.begfn	__aarch64_ldclr1_relax
+	jnatom	1f
+	ldclrb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	bic	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr1_relax,globl
+
+.begfn	__aarch64_ldclr1_acq
+	jnatom	1f
+	ldclrab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	bic	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr1_acq,globl
+
+.begfn	__aarch64_ldclr1_rel
+	jnatom	1f
+	ldclrlb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	bic	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr1_rel,globl
+
+.begfn	__aarch64_ldclr1_acq_rel
+	jnatom	1f
+	ldclralb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	bic	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr1_acq_rel,globl
+
+.begfn	__aarch64_ldclr1_sync
+	jnatom	1f
+	ldclralb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	bic	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldclr1_sync,globl
+
+
+.begfn	__aarch64_ldeor1_relax
+	jnatom	1f
+	ldeorb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	eor	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor1_relax,globl
+
+.begfn	__aarch64_ldeor1_acq
+	jnatom	1f
+	ldeorab	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	eor	w17,w0,w16
+	stxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor1_acq,globl
+
+.begfn	__aarch64_ldeor1_rel
+	jnatom	1f
+	ldeorlb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	eor	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor1_rel,globl
+
+.begfn	__aarch64_ldeor1_acq_rel
+	jnatom	1f
+	ldeoralb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrb	w0,[x1]
+	eor	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor1_acq_rel,globl
+
+.begfn	__aarch64_ldeor1_sync
+	jnatom	1f
+	ldeoralb	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrb	w0,[x1]
+	eor	w17,w0,w16
+	stlxrb	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldeor1_sync,globl
+
+
+.begfn	__aarch64_swp2_relax
+	jnatom	1f
+	swph	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	stxrh	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp2_relax,globl
+
+.begfn	__aarch64_swp2_acq
+	jnatom	1f
+	swpah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	stxrh	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp2_acq,globl
+
+.begfn	__aarch64_swp2_rel
+	jnatom	1f
+	swplh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	stlxrh	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp2_rel,globl
+
+.begfn	__aarch64_swp2_acq_rel
+	jnatom	1f
+	swpalh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	stlxrh	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp2_acq_rel,globl
+
+.begfn	__aarch64_swp2_sync
+	jnatom	1f
+	swpah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	stxrh	w17,w16,[x1]
+	cbnz	w17,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_swp2_sync,globl
+
+
+.begfn	__aarch64_cas2_relax
+	jnatom	1f
+	cash	w0,w1,[x2]
+	ret
+1:	uxth	w16,w0
+0:	ldxrh	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxrh	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas2_relax,globl
+
+.begfn	__aarch64_cas2_acq
+	jnatom	1f
+	casah	w0,w1,[x2]
+	ret
+1:	uxth	w16,w0
+0:	ldaxrh	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxrh	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas2_acq,globl
+
+.begfn	__aarch64_cas2_rel
+	jnatom	1f
+	caslh	w0,w1,[x2]
+	ret
+1:	uxth	w16,w0
+0:	ldxrh	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrh	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas2_rel,globl
+
+.begfn	__aarch64_cas2_acq_rel
+	jnatom	1f
+	casalh	w0,w1,[x2]
+	ret
+1:	uxth	w16,w0
+0:	ldaxrh	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrh	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas2_acq_rel,globl
+
+.begfn	__aarch64_cas2_sync
+	jnatom	1f
+	casalh	w0,w1,[x2]
+	ret
+1:	uxth	w16,w0
+0:	ldxrh	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxrh	w17,w1,[x2]
+	cbnz	w17,0b
+1:	dmb	ish
+	ret
+.endfn	__aarch64_cas2_sync,globl
+
+
+.begfn	__aarch64_ldadd2_relax
+	jnatom	1f
+	ldaddh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	add	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd2_relax,globl
+
+.begfn	__aarch64_ldadd2_acq
+	jnatom	1f
+	ldaddah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	add	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd2_acq,globl
+
+.begfn	__aarch64_ldadd2_rel
+	jnatom	1f
+	ldaddlh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	add	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd2_rel,globl
+
+.begfn	__aarch64_ldadd2_acq_rel
+	jnatom	1f
+	ldaddalh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	add	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd2_acq_rel,globl
+
+.begfn	__aarch64_ldadd2_sync
+	jnatom	1f
+	ldaddalh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	add	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldadd2_sync,globl
+
+
+.begfn	__aarch64_ldset2_relax
+	jnatom	1f
+	ldseth	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	orr	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset2_relax,globl
+
+.begfn	__aarch64_ldset2_acq
+	jnatom	1f
+	ldsetah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	orr	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset2_acq,globl
+
+.begfn	__aarch64_ldset2_rel
+	jnatom	1f
+	ldsetlh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	orr	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset2_rel,globl
+
+.begfn	__aarch64_ldset2_acq_rel
+	jnatom	1f
+	ldsetalh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	orr	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset2_acq_rel,globl
+
+.begfn	__aarch64_ldset2_sync
+	jnatom	1f
+	ldsetalh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	orr	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldset2_sync,globl
+
+
+.begfn	__aarch64_ldclr2_relax
+	jnatom	1f
+	ldclrh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	bic	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr2_relax,globl
+
+.begfn	__aarch64_ldclr2_acq
+	jnatom	1f
+	ldclrah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	bic	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr2_acq,globl
+
+.begfn	__aarch64_ldclr2_rel
+	jnatom	1f
+	ldclrlh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	bic	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr2_rel,globl
+
+.begfn	__aarch64_ldclr2_acq_rel
+	jnatom	1f
+	ldclralh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	bic	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr2_acq_rel,globl
+
+.begfn	__aarch64_ldclr2_sync
+	jnatom	1f
+	ldclralh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	bic	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldclr2_sync,globl
+
+
+.begfn	__aarch64_ldeor2_relax
+	jnatom	1f
+	ldeorh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	eor	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor2_relax,globl
+
+.begfn	__aarch64_ldeor2_acq
+	jnatom	1f
+	ldeorah	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	eor	w17,w0,w16
+	stxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor2_acq,globl
+
+.begfn	__aarch64_ldeor2_rel
+	jnatom	1f
+	ldeorlh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	eor	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor2_rel,globl
+
+.begfn	__aarch64_ldeor2_acq_rel
+	jnatom	1f
+	ldeoralh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxrh	w0,[x1]
+	eor	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor2_acq_rel,globl
+
+.begfn	__aarch64_ldeor2_sync
+	jnatom	1f
+	ldeoralh	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxrh	w0,[x1]
+	eor	w17,w0,w16
+	stlxrh	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldeor2_sync,globl
+
+
+.begfn	__aarch64_swp4_relax
+	jnatom	1f
+	swp	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	stxr	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp4_relax,globl
+
+.begfn	__aarch64_swp4_acq
+	jnatom	1f
+	swpa	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	stxr	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp4_acq,globl
+
+.begfn	__aarch64_swp4_rel
+	jnatom	1f
+	swpl	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	stlxr	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp4_rel,globl
+
+.begfn	__aarch64_swp4_acq_rel
+	jnatom	1f
+	swpal	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	stlxr	w17,w16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp4_acq_rel,globl
+
+.begfn	__aarch64_swp4_sync
+	jnatom	1f
+	swpa	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	stxr	w17,w16,[x1]
+	cbnz	w17,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_swp4_sync,globl
+
+
+.begfn	__aarch64_cas4_relax
+	jnatom	1f
+	cas	w0,w1,[x2]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxr	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas4_relax,globl
+
+.begfn	__aarch64_cas4_acq
+	jnatom	1f
+	casa	w0,w1,[x2]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stxr	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas4_acq,globl
+
+.begfn	__aarch64_cas4_rel
+	jnatom	1f
+	casl	w0,w1,[x2]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxr	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas4_rel,globl
+
+.begfn	__aarch64_cas4_acq_rel
+	jnatom	1f
+	casal	w0,w1,[x2]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxr	w17,w1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas4_acq_rel,globl
+
+.begfn	__aarch64_cas4_sync
+	jnatom	1f
+	casal	w0,w1,[x2]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x2]
+	cmp	w0,w16
+	bne	1f
+	stlxr	w17,w1,[x2]
+	cbnz	w17,0b
+1:	dmb	ish
+	ret
+.endfn	__aarch64_cas4_sync,globl
+
+
+.begfn	__aarch64_ldadd4_relax
+	jnatom	1f
+	ldadd	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	add	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd4_relax,globl
+
+.begfn	__aarch64_ldadd4_acq
+	jnatom	1f
+	ldadda	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	add	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd4_acq,globl
+
+.begfn	__aarch64_ldadd4_rel
+	jnatom	1f
+	ldaddl	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	add	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd4_rel,globl
+
+.begfn	__aarch64_ldadd4_acq_rel
+	jnatom	1f
+	ldaddal	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	add	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd4_acq_rel,globl
+
+.begfn	__aarch64_ldadd4_sync
+	jnatom	1f
+	ldaddal	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	add	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldadd4_sync,globl
+
+
+.begfn	__aarch64_ldset4_relax
+	jnatom	1f
+	ldset	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	orr	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset4_relax,globl
+
+.begfn	__aarch64_ldset4_acq
+	jnatom	1f
+	ldseta	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	orr	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset4_acq,globl
+
+.begfn	__aarch64_ldset4_rel
+	jnatom	1f
+	ldsetl	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	orr	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset4_rel,globl
+
+.begfn	__aarch64_ldset4_acq_rel
+	jnatom	1f
+	ldsetal	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	orr	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset4_acq_rel,globl
+
+.begfn	__aarch64_ldset4_sync
+	jnatom	1f
+	ldsetal	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	orr	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldset4_sync,globl
+
+
+.begfn	__aarch64_ldclr4_relax
+	jnatom	1f
+	ldclr	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	bic	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr4_relax,globl
+
+.begfn	__aarch64_ldclr4_acq
+	jnatom	1f
+	ldclra	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	bic	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr4_acq,globl
+
+.begfn	__aarch64_ldclr4_rel
+	jnatom	1f
+	ldclrl	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	bic	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr4_rel,globl
+
+.begfn	__aarch64_ldclr4_acq_rel
+	jnatom	1f
+	ldclral	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	bic	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr4_acq_rel,globl
+
+.begfn	__aarch64_ldclr4_sync
+	jnatom	1f
+	ldclral	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	bic	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldclr4_sync,globl
+
+
+.begfn	__aarch64_ldeor4_relax
+	jnatom	1f
+	ldeor	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	eor	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor4_relax,globl
+
+.begfn	__aarch64_ldeor4_acq
+	jnatom	1f
+	ldeora	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	eor	w17,w0,w16
+	stxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor4_acq,globl
+
+.begfn	__aarch64_ldeor4_rel
+	jnatom	1f
+	ldeorl	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	eor	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor4_rel,globl
+
+.begfn	__aarch64_ldeor4_acq_rel
+	jnatom	1f
+	ldeoral	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldaxr	w0,[x1]
+	eor	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor4_acq_rel,globl
+
+.begfn	__aarch64_ldeor4_sync
+	jnatom	1f
+	ldeoral	w0,w0,[x1]
+	ret
+1:	mov	w16,w0
+0:	ldxr	w0,[x1]
+	eor	w17,w0,w16
+	stlxr	w15,w17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldeor4_sync,globl
+
+
+.begfn	__aarch64_swp8_relax
+	jnatom	1f
+	swp	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp8_relax,globl
+
+.begfn	__aarch64_swp8_acq
+	jnatom	1f
+	swpa	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp8_acq,globl
+
+.begfn	__aarch64_swp8_rel
+	jnatom	1f
+	swpl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stlxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp8_rel,globl
+
+.begfn	__aarch64_swp8_acq_rel
+	jnatom	1f
+	swpal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	stlxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp8_acq_rel,globl
+
+.begfn	__aarch64_swp8_sync
+	jnatom	1f
+	swpa	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_swp8_sync,globl
+
+
+.prvfn	__aarch64_cas8_relax
+	jnatom	1f
+	cas	x0,x1,[x2]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x2]
+	cmp	x0,x16
+	bne	1f
+	stxr	w17,x1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas8_relax,globl
+
+.prvfn	__aarch64_cas8_acq
+	jnatom	1f
+	casa	x0,x1,[x2]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x2]
+	cmp	x0,x16
+	bne	1f
+	stxr	w17,x1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas8_acq,globl
+
+.prvfn	__aarch64_cas8_rel
+	jnatom	1f
+	casl	x0,x1,[x2]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x2]
+	cmp	x0,x16
+	bne	1f
+	stlxr	w17,x1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas8_rel,globl
+
+.begfn	__aarch64_cas8_acq_rel
+	jnatom	1f
+	casal	x0,x1,[x2]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x2]
+	cmp	x0,x16
+	bne	1f
+	stlxr	w17,x1,[x2]
+	cbnz	w17,0b
+1:	ret
+.endfn	__aarch64_cas8_acq_rel,globl
+
+.begfn	__aarch64_cas8_sync
+	jnatom	1f
+	casal	x0,x1,[x2]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x2]
+	cmp	x0,x16
+	bne	1f
+	stlxr	w17,x1,[x2]
+	cbnz	w17,0b
+1:	dmb	ish
+	ret
+.endfn	__aarch64_cas8_sync,globl
+
+
+.begfn	__aarch64_ldadd8_relax
+	jnatom	1f
+	ldadd	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd8_relax,globl
+
+.begfn	__aarch64_ldadd8_acq
+	jnatom	1f
+	ldadda	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	add	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd8_acq,globl
+
+.begfn	__aarch64_ldadd8_rel
+	jnatom	1f
+	ldaddl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd8_rel,globl
+
+.begfn	__aarch64_ldadd8_acq_rel
+	jnatom	1f
+	ldaddal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd8_acq_rel,globl
+
+.begfn	__aarch64_ldadd8_sync
+	jnatom	1f
+	ldaddal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldadd8_sync,globl
+
+
+.begfn	__aarch64_ldset8_relax
+	jnatom	1f
+	ldset	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset8_relax,globl
+
+.begfn	__aarch64_ldset8_acq
+	jnatom	1f
+	ldseta	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	orr	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset8_acq,globl
+
+.begfn	__aarch64_ldset8_rel
+	jnatom	1f
+	ldsetl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset8_rel,globl
+
+.begfn	__aarch64_ldset8_acq_rel
+	jnatom	1f
+	ldsetal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset8_acq_rel,globl
+
+.begfn	__aarch64_ldset8_sync
+	jnatom	1f
+	ldsetal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldset8_sync,globl
+
+
+.begfn	__aarch64_ldclr8_relax
+	jnatom	1f
+	ldclr	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr8_relax,globl
+
+.begfn	__aarch64_ldclr8_acq
+	jnatom	1f
+	ldclra	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	bic	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr8_acq,globl
+
+.begfn	__aarch64_ldclr8_rel
+	jnatom	1f
+	ldclrl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr8_rel,globl
+
+.begfn	__aarch64_ldclr8_acq_rel
+	jnatom	1f
+	ldclral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr8_acq_rel,globl
+
+.begfn	__aarch64_ldclr8_sync
+	jnatom	1f
+	ldclral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldclr8_sync,globl
+
+
+.begfn	__aarch64_ldeor8_relax
+	jnatom	1f
+	ldeor	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor8_relax,globl
+
+.begfn	__aarch64_ldeor8_acq
+	jnatom	1f
+	ldeora	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	eor	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor8_acq,globl
+
+.begfn	__aarch64_ldeor8_rel
+	jnatom	1f
+	ldeorl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor8_rel,globl
+
+.begfn	__aarch64_ldeor8_acq_rel
+	jnatom	1f
+	ldeoral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor8_acq_rel,globl
+
+.begfn	__aarch64_ldeor8_sync
+	jnatom	1f
+	ldeoral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldeor8_sync,globl
+
+
+.begfn	__aarch64_swp16_relax
+	jnatom	1f
+	swp	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp16_relax,globl
+
+.begfn	__aarch64_swp16_acq
+	jnatom	1f
+	swpa	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp16_acq,globl
+
+.begfn	__aarch64_swp16_rel
+	jnatom	1f
+	swpl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stlxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp16_rel,globl
+
+.begfn	__aarch64_swp16_acq_rel
+	jnatom	1f
+	swpal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	stlxr	w17,x16,[x1]
+	cbnz	w17,0b
+	ret
+.endfn	__aarch64_swp16_acq_rel,globl
+
+.begfn	__aarch64_swp16_sync
+	jnatom	1f
+	swpa	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	stxr	w17,x16,[x1]
+	cbnz	w17,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_swp16_sync,globl
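+
+
+// note on the 16-byte compare-and-swap fallback below: pre-lse, a bare
+// ldxp isn't guaranteed to read both halves as one single-copy-atomic
+// 128-bit access, so instead of branching out early on a mismatch like
+// the smaller sizes do, csel keeps the freshly loaded value and stxp
+// always runs, completing the exclusive pair either way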
+
+
+.begfn	__aarch64_cas16_relax
+	jnatom	1f
+	casp	x0,x1,x2,x3,[x4]
+	ret
+1:	mov	x16,x0
+	mov	x17,x1
+0:	ldxp	x0,x1,[x4]
+	cmp	x0,x16
+	ccmp	x1,x17,#0,eq
+	csel	x15,x2,x0,eq
+	csel	x14,x3,x1,eq
+	stxp	w13,x15,x14,[x4]
+	cbnz	w13,0b
+	ret
+.endfn	__aarch64_cas16_relax,globl
+
+.begfn	__aarch64_cas16_acq
+	jnatom	1f
+	caspa	x0,x1,x2,x3,[x4]
+	ret
+1:	mov	x16,x0
+	mov	x17,x1
+0:	ldaxp	x0,x1,[x4]
+	cmp	x0,x16
+	ccmp	x1,x17,#0,eq
+	csel	x15,x2,x0,eq
+	csel	x14,x3,x1,eq
+	stxp	w13,x15,x14,[x4]
+	cbnz	w13,0b
+	ret
+.endfn	__aarch64_cas16_acq,globl
+
+.begfn	__aarch64_cas16_rel
+	jnatom	1f
+	caspl	x0,x1,x2,x3,[x4]
+	ret
+1:	mov	x16,x0
+	mov	x17,x1
+0:	ldxp	x0,x1,[x4]
+	cmp	x0,x16
+	ccmp	x1,x17,#0,eq
+	csel	x15,x2,x0,eq
+	csel	x14,x3,x1,eq
+	stlxp	w13,x15,x14,[x4]
+	cbnz	w13,0b
+	ret
+.endfn	__aarch64_cas16_rel,globl
+
+.begfn	__aarch64_cas16_acq_rel
+	jnatom	1f
+	caspal	x0,x1,x2,x3,[x4]
+	ret
+1:	mov	x16,x0
+	mov	x17,x1
+0:	ldaxp	x0,x1,[x4]
+	cmp	x0,x16
+	ccmp	x1,x17,#0,eq
+	csel	x15,x2,x0,eq
+	csel	x14,x3,x1,eq
+	stlxp	w13,x15,x14,[x4]
+	cbnz	w13,0b
+	ret
+.endfn	__aarch64_cas16_acq_rel,globl
+
+.begfn	__aarch64_cas16_sync
+	jnatom	1f
+	caspal	x0,x1,x2,x3,[x4]
+	ret
+1:	mov	x16,x0
+	mov	x17,x1
+0:	ldxp	x0,x1,[x4]
+	cmp	x0,x16
+	ccmp	x1,x17,#0,eq
+	csel	x15,x2,x0,eq
+	csel	x14,x3,x1,eq
+	stlxp	w13,x15,x14,[x4]
+	cbnz	w13,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_cas16_sync,globl
+
+
+.begfn	__aarch64_ldadd16_relax
+	jnatom	1f
+	ldadd	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd16_relax,globl
+
+.begfn	__aarch64_ldadd16_acq
+	jnatom	1f
+	ldadda	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	add	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd16_acq,globl
+
+.begfn	__aarch64_ldadd16_rel
+	jnatom	1f
+	ldaddl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd16_rel,globl
+
+.begfn	__aarch64_ldadd16_acq_rel
+	jnatom	1f
+	ldaddal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldadd16_acq_rel,globl
+
+.begfn	__aarch64_ldadd16_sync
+	jnatom	1f
+	ldaddal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	add	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldadd16_sync,globl
+
+
+.begfn	__aarch64_ldset16_relax
+	jnatom	1f
+	ldset	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset16_relax,globl
+
+.begfn	__aarch64_ldset16_acq
+	jnatom	1f
+	ldseta	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	orr	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset16_acq,globl
+
+.begfn	__aarch64_ldset16_rel
+	jnatom	1f
+	ldsetl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset16_rel,globl
+
+.begfn	__aarch64_ldset16_acq_rel
+	jnatom	1f
+	ldsetal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldset16_acq_rel,globl
+
+.begfn	__aarch64_ldset16_sync
+	jnatom	1f
+	ldsetal	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	orr	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldset16_sync,globl
+
+
+.begfn	__aarch64_ldclr16_relax
+	jnatom	1f
+	ldclr	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr16_relax,globl
+
+.begfn	__aarch64_ldclr16_acq
+	jnatom	1f
+	ldclra	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	bic	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr16_acq,globl
+
+.begfn	__aarch64_ldclr16_rel
+	jnatom	1f
+	ldclrl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr16_rel,globl
+
+.begfn	__aarch64_ldclr16_acq_rel
+	jnatom	1f
+	ldclral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldclr16_acq_rel,globl
+
+.begfn	__aarch64_ldclr16_sync
+	jnatom	1f
+	ldclral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	bic	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldclr16_sync,globl
+
+
+.begfn	__aarch64_ldeor16_relax
+	jnatom	1f
+	ldeor	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor16_relax,globl
+
+.begfn	__aarch64_ldeor16_acq
+	jnatom	1f
+	ldeora	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	eor	x17,x0,x16
+	stxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor16_acq,globl
+
+.begfn	__aarch64_ldeor16_rel
+	jnatom	1f
+	ldeorl	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor16_rel,globl
+
+.begfn	__aarch64_ldeor16_acq_rel
+	jnatom	1f
+	ldeoral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldaxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	ret
+.endfn	__aarch64_ldeor16_acq_rel,globl
+
+.begfn	__aarch64_ldeor16_sync
+	jnatom	1f
+	ldeoral	x0,x0,[x1]
+	ret
+1:	mov	x16,x0
+0:	ldxr	x0,[x1]
+	eor	x17,x0,x16
+	stlxr	w15,x17,[x1]
+	cbnz	w15,0b
+	dmb	ish
+	ret
+.endfn	__aarch64_ldeor16_sync,globl
diff --git a/libc/intrin/armlse.c b/libc/intrin/armlse.c
new file mode 100644
index 000000000..b05bf0709
--- /dev/null
+++ b/libc/intrin/armlse.c
@@ -0,0 +1,32 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/getauxval.h"
+#include "libc/runtime/runtime.h"
+#include "libc/sysv/consts/auxv.h"
+#include "libc/sysv/consts/hwcap.h"
+#ifdef __aarch64__
+
+bool __aarch64_have_lse_atomics;
+
+static __attribute__((__constructor__(1))) void __aarch64_atomics_init(void) {
+  struct AuxiliaryValue x = __getauxval(AT_HWCAP);
+  __aarch64_have_lse_atomics = !!(x.value & HWCAP_ATOMICS);
+}
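+
+// note: the helpers in atomics.S test this flag on every call, so the
+// constructor runs at priority 1 to publish the answer before ordinary
+// constructors get a chance to perform atomic operations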
+
+#endif /* __aarch64__ */
diff --git a/libc/intrin/atomic.c b/libc/intrin/atomic.c
new file mode 100644
index 000000000..f46f74f49
--- /dev/null
+++ b/libc/intrin/atomic.c
@@ -0,0 +1,24 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/intrin/atomic.h"
+
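+// with outlined atomics, this weak acq_rel compare-exchange on a long
+// should compile into a call to __aarch64_cas8_acq_rel on aarch64,
+// which makes this file a handy litmus test for the runtime above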
+bool dog(_Atomic(long) *p, long *e, long w) {
+  return atomic_compare_exchange_weak_explicit(p, e, w, memory_order_acq_rel,
+                                               memory_order_relaxed);
+}
diff --git a/libc/thread/pthread_cancel.c b/libc/thread/pthread_cancel.c
index bd6a1c6ea..5ddbea0db 100644
--- a/libc/thread/pthread_cancel.c
+++ b/libc/thread/pthread_cancel.c
@@ -354,7 +354,6 @@ static errno_t _pthread_cancel_everyone(void) {
  */
 errno_t pthread_cancel(pthread_t thread) {
   struct PosixThread *arg;
-  unassert(thread);
   if ((arg = (struct PosixThread *)thread)) {
     return _pthread_cancel_single(arg);
   } else {
diff --git a/test/libc/thread/footek_test.c b/test/libc/thread/footek_test.c
index 98e07e5e9..acaae0727 100644
--- a/test/libc/thread/footek_test.c
+++ b/test/libc/thread/footek_test.c
@@ -10,26 +10,26 @@
 #include
 #include "third_party/nsync/futex.internal.h"
 
-// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
-// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER THIS IT CAN
-// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
-
 // arm fleet
 // with futexes
 // 30 threads / 100000 iterations
 //
-// 242,604 us real
-// 4,222,946 us user
-// 1,079,229 us sys
-// footek_test on studio.test.        630 µs        17'415 µs       256'782 µs
-// 1,362,557 us real
-// 3,232,978 us user
-// 2,104,824 us sys
-// footek_test on pi.test.            611 µs        21'708 µs     1'385'129 µs
-// 1,346,482 us real
-// 3,370,513 us user
-// 1,992,383 us sys
-// footek_test on freebsdarm.test.    427 µs        19'967 µs     1'393'476 µs
+// 54,183 us real
+// 84,723 us user
+// 741,667 us sys
+// footek_test on studio.test.        609 µs        14'106 µs        65'607 µs
+// 406,588 us real
+// 884,696 us user
+// 720,567 us sys
+// footek_test on pi5.test.           334 µs        13'398 µs       408'450 µs
+// 1,253,808 us real
+// 3,608,426 us user
+// 1,378,765 us sys
+// footek_test on freebsdarm.test.    367 µs        16'466 µs     1'287'915 µs
+// 1,316,058 us real
+// 3,286,528 us user
+// 1,738,756 us sys
+// footek_test on pi.test.            450 µs        16'787 µs     1'338'420 µs
 
 // arm fleet
 // without futexes
@@ -106,9 +106,14 @@
 //     16,265 us sys
 // footek_test on xnu.test.        98'468 µs         5'242 µs     5'191'724 µs
 
-#define USE_FUTEX 1
-#define THREADS 30
-#define ITERATIONS 30000
+#define SPIN 1
+#define FUTEX 2
+#define NSYNC 3
+
+#define USE NSYNC
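+
+// SPIN burns cpu in lock()'s exchange loop, FUTEX parks contended
+// waiters in the kernel via nsync_futex_wait_(), and NSYNC uses the
+// *nsync-based pthread_mutex_t that cosmopolitan libc ships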
+
+#define THREADS 10
+#define ITERATIONS 50000
 
 #define MUTEX_LOCKED(word) ((word) & 8)
 #define MUTEX_WAITING(word) ((word) & 16)
@@ -130,7 +135,7 @@ void lock(atomic_int *futex) {
   word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
   while (word > 0) {
     pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
-#if USE_FUTEX
+#if USE == FUTEX
     nsync_futex_wait_(futex, 2, 0, 0);
 #endif
     pthread_setcancelstate(cs, 0);
@@ -142,7 +147,7 @@ void unlock(atomic_int *futex) {
   int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
   if (word == 2) {
     atomic_store_explicit(futex, 0, memory_order_release);
-#if USE_FUTEX
+#if USE == FUTEX
     nsync_futex_wake_(futex, 1, 0);
 #endif
   }
@@ -154,9 +159,15 @@ pthread_mutex_t g_locker;
 
 void *worker(void *arg) {
   for (int i = 0; i < ITERATIONS; ++i) {
+#if USE == NSYNC
+    pthread_mutex_lock(&g_locker);
+    ++g_chores;
+    pthread_mutex_unlock(&g_locker);
+#else
    lock(&g_lock);
    ++g_chores;
    unlock(&g_lock);
+#endif
  }
  return 0;
 }
@@ -186,51 +197,52 @@ int main() {
   CheckForMemoryLeaks();
 }
 
-// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
-// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
-
 // x86 fleet
 // with pthread_mutex_t
 // 30 threads / 100000 iterations
 //
-// 186,976 us real
-// 43,609 us user
-// 205,585 us sys
-// footek_test on freebsd.test.       410 µs         2'054 µs       195'339 µs
-// 238,902 us real
-// 235,743 us user
-// 97,881 us sys
-// footek_test on rhel7.test.         343 µs         2'339 µs       246'926 µs
-// 201,285 us real
-// 249,612 us user
-// 141,230 us sys
-// footek_test on xnu.test.         1'960 µs         5'350 µs       265'758 µs
-// 303,363 us real
-// 60,000 us user
-// 410,000 us sys
-// footek_test on openbsd.test.       545 µs         3'023 µs       326'200 µs
-// 386,085 us real
-// 586,455 us user
-// 466,991 us sys
-// footek_test on netbsd.test.        344 µs         2'421 µs       413'440 µs
-// 245,010 us real
+// 177,702 us real
+// 183,488 us user
+// 54,921 us sys
+// footek_test on rhel7.test.         304 µs         2'225 µs       185'809 µs
+// 191,346 us real
+// 43,746 us user
+// 257,012 us sys
+// footek_test on freebsd.test.       405 µs         2'186 µs       200'568 µs
+// 194,344 us real
+// 228,235 us user
+// 143,203 us sys
+// footek_test on xnu.test.        33'207 µs         5'164 µs       220'693 µs
+// 199,882 us real
+// 138,178 us user
+// 329,501 us sys
+// footek_test on netbsd.test.        350 µs         3'570 µs       262'186 µs
+// 291,255 us real
+// 70,000 us user
+// 440,000 us sys
+// footek_test on openbsd.test.       628 µs         3'232 µs       342'136 µs
+// 250,072 us real
 // 437,500 us user
-// 140,625 us sys
-// footek_test on win10.test.         300 µs        18'574 µs       441'225 µs
+// 93,750 us sys
+// footek_test on win10.test.         996 µs        10'949 µs       398'435 µs
 
 // arm fleet
 // with pthread_mutex_t
 // 30 threads / 100000 iterations
 //
-// 87,132 us real
-// 183,517 us user
-// 20,020 us sys
-// footek_test on studio.test.        560 µs        12'418 µs        92'825 µs
-// 679,374 us real
-// 957,678 us user
-// 605,078 us sys
-// footek_test on pi.test.            462 µs        16'574 µs       702'833 µs
-// 902,343 us real
-// 1,459,706 us user
-// 781,140 us sys
-// footek_test on freebsdarm.test.    400 µs        16'261 µs       970'022 µs
+// 88,681 us real
+// 163,500 us user
+// 22,183 us sys
+// footek_test on studio.test.        651 µs        15'086 µs        98'632 µs
+// 157,701 us real
+// 215,597 us user
+// 46,436 us sys
+// footek_test on pi5.test.           296 µs        13'222 µs       159'805 µs
+// 699,863 us real
+// 1,027,981 us user
+// 648,353 us sys
+// footek_test on pi.test.            419 µs        16'716 µs       721'851 µs
+// 843,858 us real
+// 1,432,362 us user
+// 696,613 us sys
+// footek_test on freebsdarm.test.    349 µs        16'613 µs       876'863 µs
diff --git a/tool/build/fixupobj.c b/tool/build/fixupobj.c
index 09d1625b6..f2adb73ca 100644
--- a/tool/build/fixupobj.c
+++ b/tool/build/fixupobj.c
@@ -245,7 +245,7 @@ static void CheckPrivilegedCrossReferences(void) {
     if (~shdr->sh_flags & SHF_EXECINSTR) continue;  // data reference
     if ((secname = GetElfString(elf, esize, secstrs, shdr->sh_name)) &&
-        strcmp(".privileged", secname)) {
+        !startswith(secname, ".privileged")) {
       tinyprint(2, epath,
                 ": code in .privileged section "
                 "references symbol '",
diff --git a/tool/cosmocc/bin/cosmocc b/tool/cosmocc/bin/cosmocc
index 59364e18c..8ee4cf364 100755
--- a/tool/cosmocc/bin/cosmocc
+++ b/tool/cosmocc/bin/cosmocc
@@ -343,7 +343,7 @@ LDLIBS_X86_64="-lcosmo"
 
 CRT_AARCH64="$LIB_AARCH64/crt.o"
 CPPFLAGS_AARCH64="$CPPFLAGS -fsigned-char"
-CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
+CFLAGS_AARCH64="$CFLAGS -ffixed-x18 -ffixed-x28"
 LDFLAGS_AARCH64="$LDFLAGS -L$LIB_AARCH64 -L$BIN/../aarch64-linux-cosmo/lib -Wl,-T,$LIB_AARCH64/aarch64.lds -Wl,-z,common-page-size=16384 -Wl,-z,max-page-size=16384"
 LDLIBS_AARCH64="-lcosmo"
diff --git a/tool/cosmocc/bin/cosmocross b/tool/cosmocc/bin/cosmocross
index ced49c754..65aa487ea 100755
--- a/tool/cosmocc/bin/cosmocross
+++ b/tool/cosmocc/bin/cosmocross
@@ -131,7 +131,7 @@ elif [ x"$ARCH" = x"aarch64" ]; then
   OBJCOPYFLAGS="-S"
   PAGESZ=16384
   CPPFLAGS="$CPPFLAGS -fsigned-char"
-  CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28 -mno-outline-atomics"
+  CFLAGS="$CFLAGS -ffixed-x18 -ffixed-x28"
   LDFLAGS="$LDFLAGS -Wl,-T,$LIB/aarch64.lds"
 else
   fatal_error "$ARCH: unsupported architecture"
diff --git a/tool/hello/BUILD.mk b/tool/hello/BUILD.mk
index bb2cbb1cd..2a899b671 100644
--- a/tool/hello/BUILD.mk
+++ b/tool/hello/BUILD.mk
@@ -79,7 +79,7 @@ o/$(MODE)/tool/hello/hello-pe.ape: \
 # elf2pe can generate binaries that don't have dll imports
 o/$(MODE)/tool/hello/life-pe.dbg: \
 		o/$(MODE)/tool/hello/life-pe.o
-	@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain #-Ttext-segment=0x140000000
+	@$(COMPILE) -ALINK.elf $(LINK) $(LINKARGS) $(OUTPUT_OPTION) -q -e WinMain -Ttext-segment=0x140000000
 o/$(MODE)/tool/hello/life-pe.ape: \
 		o/$(MODE)/tool/hello/life-pe.dbg \
 		o/$(MODE)/tool/build/elf2pe