From 3774b28d8f3b9e8a946beb9550bee85e5454fc9f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 18 Mar 2024 20:50:04 -0400 Subject: [PATCH 01/20] locking/qspinlock: Always evaluate lockevent* non-event parameter once The 'inc' parameter of lockevent_add() and the cond parameter of lockevent_cond_inc() are only evaluated when CONFIG_LOCK_EVENT_COUNTS is on. That can cause problems if those parameters are expressions with side effects, like a "++". Fix this by evaluating those non-event parameters once even if CONFIG_LOCK_EVENT_COUNTS is off. This will also eliminate the need for the __maybe_unused attribute on the wait_early local variable in pv_wait_node(). Suggested-by: Ingo Molnar Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20240319005004.1692705-1-longman@redhat.com --- kernel/locking/lock_events.h | 4 ++-- kernel/locking/qspinlock_paravirt.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index a6016b91803d..d2345e9c0190 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -53,8 +53,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #else /* CONFIG_LOCK_EVENT_COUNTS */ #define lockevent_inc(ev) -#define lockevent_add(ev, c) -#define lockevent_cond_inc(ev, c) +#define lockevent_add(ev, c) do { (void)(c); } while (0) +#define lockevent_cond_inc(ev, c) do { (void)(c); } while (0) #endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ae2b12f68b90..169950fe1aad 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -294,7 +294,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - bool __maybe_unused wait_early; + bool wait_early; int loop; for (;;) { From 91a1d97ef482c1e4c9d4c1c656a53b0f6b16d0ed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Mar 2024 19:01:03 +0100 Subject: [PATCH 02/20] jump_label,module: Don't alloc static_key_mod for __ro_after_init keys When a static_key is marked ro_after_init, its state will never change (after init), therefore jump_label_update() will never need to iterate the entries, and thus module load won't actually need to track this -- avoiding the static_key::next write. Therefore, mark these keys such that jump_label_add_module() might recognise them and avoid the modification. Use the special state: 'static_key_linked(key) && !static_key_mod(key)' to denote such keys. jump_label_add_module() does not exist under CONFIG_JUMP_LABEL=n, so the newly-introduced jump_label_init_ro() can be defined as a nop for that configuration.
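[ Editorial sketch, not part of the patch: the "sealed" state described above, written in terms of the existing jump_label helpers that the commit message names. The patch itself open-codes the same test as static_key_sealed(); the function name here is hypothetical. ]

	static bool key_is_sealed(struct static_key *key)
	{
		/* LINKED bit set, but no static_key_mod list attached. */
		return static_key_linked(key) && !static_key_mod(key);
	}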
[ mingo: Renamed jump_label_ro() to jump_label_init_ro() ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-2-vschneid@redhat.com --- include/asm-generic/sections.h | 5 ++++ include/linux/jump_label.h | 3 ++ init/main.c | 1 + kernel/jump_label.c | 53 ++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index db13bb620f52..c768de6f19a9 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr) addr < (unsigned long)__end_rodata; } +static inline bool is_kernel_ro_after_init(unsigned long addr) +{ + return addr >= (unsigned long)__start_ro_after_init && + addr < (unsigned long)__end_ro_after_init; +} /** * is_kernel_inittext - checks if the pointer address is located in the * .init.text section diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f0a949b7c973..f5a2727ca4a9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); +extern void jump_label_init_ro(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } +static __always_inline void jump_label_init_ro(void) { } + static __always_inline bool static_key_false(struct static_key *key) { if (unlikely_notrace(static_key_count(key) > 0)) diff --git a/init/main.c b/init/main.c index 2ca52474d0c3..6c3f251d6ef8 100644 --- a/init/main.c +++ b/init/main.c @@ -1408,6 +1408,7 @@ static void mark_readonly(void) * insecure pages which are W+X. 
*/ flush_module_init_free_work(); + jump_label_init_ro(); mark_rodata_ro(); debug_checkwx(); rodata_test(); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index d9c822bbffb8..3218fa5688b9 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -530,6 +530,45 @@ void __init jump_label_init(void) cpus_read_unlock(); } +static inline bool static_key_sealed(struct static_key *key) +{ + return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); +} + +static inline void static_key_seal(struct static_key *key) +{ + unsigned long type = key->type & JUMP_TYPE_TRUE; + key->type = JUMP_TYPE_LINKED | type; +} + +void jump_label_init_ro(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + if (WARN_ON_ONCE(!static_key_initialized)) + return; + + cpus_read_lock(); + jump_label_lock(); + + for (iter = iter_start; iter < iter_stop; iter++) { + struct static_key *iterk = jump_entry_key(iter); + + if (!is_kernel_ro_after_init((unsigned long)iterk)) + continue; + + if (static_key_sealed(iterk)) + continue; + + static_key_seal(iterk); + } + + jump_label_unlock(); + cpus_read_unlock(); +} + #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -650,6 +689,15 @@ static int jump_label_add_module(struct module *mod) static_key_set_entries(key, iter); continue; } + + /* + * If the key was sealed at init, then there's no need to keep a + * reference to its module entries - just patch them now and be + * done with it. + */ + if (static_key_sealed(key)) + goto do_poke; + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; @@ -675,6 +723,7 @@ static int jump_label_add_module(struct module *mod) static_key_set_linked(key); /* Only update if we've changed from our initial state */ +do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } @@ -699,6 +748,10 @@ static void jump_label_del_module(struct module *mod) if (within_module((unsigned long)key, mod)) continue; + /* No @jlm allocated because key was sealed at init. */ + if (static_key_sealed(key)) + continue; + /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; From 4eab44a8ae98df53e59f6af3c860142177235507 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Mar 2024 19:01:04 +0100 Subject: [PATCH 03/20] context_tracking: Make context_tracking_key __ro_after_init context_tracking_key is only ever enabled in __init ct_cpu_tracker_user(), so mark it as __ro_after_init. 
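[ Editorial sketch, not part of the patch: the usage pattern that this and the next two patches rely on -- a key declared with the _RO variant and only ever flipped from __init code, so that jump_label_init_ro() can seal it before mark_rodata_ro(). The key, probe and initcall names below are hypothetical. ]

	static DEFINE_STATIC_KEY_FALSE_RO(my_feature_key);

	static int __init my_feature_init(void)
	{
		if (my_feature_supported())	/* hypothetical detection */
			static_branch_enable(&my_feature_key);
		return 0;
	}
	early_initcall(my_feature_init);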
Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-3-vschneid@redhat.com --- kernel/context_tracking.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 70ae70d03823..24b1e1143260 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -432,7 +432,7 @@ static __always_inline void ct_kernel_enter(bool user, int offset) { } #define CREATE_TRACE_POINTS #include -DEFINE_STATIC_KEY_FALSE(context_tracking_key); +DEFINE_STATIC_KEY_FALSE_RO(context_tracking_key); EXPORT_SYMBOL_GPL(context_tracking_key); static noinstr bool context_tracking_recursion_enter(void) From ddd8afacc4f65a01204d7a36b8fd96c908e9b72c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Mar 2024 19:01:05 +0100 Subject: [PATCH 04/20] x86/kvm: Make kvm_async_pf_enabled __ro_after_init kvm_async_pf_enabled is only ever enabled in __init kvm_guest_init(), so mark it as __ro_after_init. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Sean Christopherson Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-4-vschneid@redhat.com --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4cadfd606e8e..31a48bae2392 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -44,7 +44,7 @@ #include #include -DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); +DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled); static int kvmapf = 1; From 79a4567b2e8ae4d0282602a24f76f5e2382f5b98 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Mar 2024 19:01:06 +0100 Subject: [PATCH 05/20] x86/tsc: Make __use_tsc __ro_after_init __use_tsc is only ever enabled in __init tsc_enable_sched_clock(), so mark it as __ro_after_init. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20240313180106.2917308-5-vschneid@redhat.com --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5a69a49acc96..0f7624ed1d1d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL(tsc_khz); static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; -static DEFINE_STATIC_KEY_FALSE(__use_tsc); +static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; From 929ad065ba2967be238dfdc0895b79fda62c7f16 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:56 +0200 Subject: [PATCH 06/20] locking/atomic/x86: Correct the definition of __arch_try_cmpxchg128() Correct the definition of __arch_try_cmpxchg128(), introduced by: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") Fixes: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-2-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 44b08b53ab32..c1d6cd58f809 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -62,7 +62,7 @@ static __always_inline u128 arch_cmpxchg128_local(volatile u128 *ptr, u128 old, asm volatile(_lock "cmpxchg16b %[ptr]" \ CC_SET(e) \ : CC_OUT(e) (ret), \ - [ptr] "+m" (*ptr), \ + [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high) \ : "memory"); \ From 7016cc5def44b9dcb28089efae4412fa0d6c78c2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:57 +0200 Subject: [PATCH 07/20] locking/atomic/x86: Modernize x86_32 arch_{,try_}_cmpxchg64{,_local}() Commit: b23e139d0b66 ("arch: Introduce arch_{,try_}_cmpxchg128{,_local}()") introduced arch_{,try_}_cmpxchg128{,_local}() for x86_64 targets. Modernize existing x86_32 arch_{,try_}_cmpxchg64{,_local}() definitions to follow the same structure as the definitions introduced by the above commit. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-3-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 183 +++++++++++++++++------------- 1 file changed, 102 insertions(+), 81 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index b5731c51f0f4..fe40d0681ea8 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -3,103 +3,124 @@ #define _ASM_X86_CMPXCHG_32_H /* - * Note: if you use set64_bit(), __cmpxchg64(), or their variants, + * Note: if you use __cmpxchg64(), or their variants, * you need to test for the feature in boot_cpu_data. 
*/ +union __u64_halves { + u64 full; + struct { + u32 low, high; + }; +}; + +#define __arch_cmpxchg64(_ptr, _old, _new, _lock) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + o.full; \ +}) + + +static __always_inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64(ptr, old, new, LOCK_PREFIX); +} + +static __always_inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64(ptr, old, new,); +} + +#define __arch_try_cmpxchg64(_ptr, _oldp, _new, _lock) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(_lock "cmpxchg8b %[ptr]" \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); +} + #ifdef CONFIG_X86_CMPXCHG64 -#define arch_cmpxchg64(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_cmpxchg64_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ - (unsigned long long)(n))) -#define arch_try_cmpxchg64(ptr, po, n) \ - __try_cmpxchg64((ptr), (unsigned long long *)(po), \ - (unsigned long long)(n)) -#endif -static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) -{ - u64 prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; -} +#define arch_cmpxchg64 __cmpxchg64 -static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) -{ - u64 prev; - asm volatile("cmpxchg8b %1" - : "=A" (prev), - "+m" (*ptr) - : "b" ((u32)new), - "c" ((u32)(new >> 32)), - "0" (old) - : "memory"); - return prev; -} +#define arch_cmpxchg64_local __cmpxchg64_local -static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) -{ - bool success; - u64 old = *pold; - asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" - CC_SET(z) - : CC_OUT(z) (success), - [ptr] "+m" (*ptr), - "+A" (old) - : "b" ((u32)new), - "c" ((u32)(new >> 32)) - : "memory"); +#define arch_try_cmpxchg64 __try_cmpxchg64 - if (unlikely(!success)) - *pold = old; - return success; -} +#else -#ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. It may be necessary * to simulate the cmpxchg8b on the 80386 and 80486 CPU. 
*/ -#define arch_cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io(LOCK_PREFIX_HERE \ - "call cmpxchg8b_emu", \ - "lock; cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu(_ptr, _old, _new) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + "call cmpxchg8b_emu", \ + "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) +static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu(ptr, old, new); +} +#define arch_cmpxchg64 arch_cmpxchg64 -#define arch_cmpxchg64_local(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (o); \ - __typeof__(*(ptr)) __new = (n); \ - alternative_io("call cmpxchg8b_emu", \ - "cmpxchg8b (%%esi)" , \ - X86_FEATURE_CX8, \ - "=A" (__ret), \ - "S" ((ptr)), "0" (__old), \ - "b" ((unsigned int)__new), \ - "c" ((unsigned int)(__new>>32)) \ - : "memory"); \ - __ret; }) +#define __arch_cmpxchg64_emu_local(_ptr, _old, _new) \ +({ \ + union __u64_halves o = { .full = (_old), }, \ + n = { .full = (_new), }; \ + \ + asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ + "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + : [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + o.full; \ +}) + +static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) +{ + return __arch_cmpxchg64_emu_local(ptr, old, new); +} +#define arch_cmpxchg64_local arch_cmpxchg64_local #endif From aef95dac9ce4f271cc43195ffc175114ed934cbe Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 8 Apr 2024 11:13:58 +0200 Subject: [PATCH 08/20] locking/atomic/x86: Introduce arch_try_cmpxchg64() for !CONFIG_X86_CMPXCHG64 Commit: 6d12c8d308e68 ("percpu: Wire up cmpxchg128") improved emulated cmpxchg8b_emu() library function to return success/failure in a ZF flag. Define arch_try_cmpxchg64() for !CONFIG_X86_CMPXCHG64 targets to override the generic archy_try_cmpxchg() with an optimized target specific implementation that handles ZF flag. The assembly code at the call sites improves from: bf56d: e8 fc ff ff ff call cmpxchg8b_emu bf572: 8b 74 24 28 mov 0x28(%esp),%esi bf576: 89 c3 mov %eax,%ebx bf578: 89 d1 mov %edx,%ecx bf57a: 8b 7c 24 2c mov 0x2c(%esp),%edi bf57e: 89 f0 mov %esi,%eax bf580: 89 fa mov %edi,%edx bf582: 31 d8 xor %ebx,%eax bf584: 31 ca xor %ecx,%edx bf586: 09 d0 or %edx,%eax bf588: 0f 84 e3 01 00 00 je bf771 <...> to: bf572: e8 fc ff ff ff call cmpxchg8b_emu bf577: 0f 84 b6 01 00 00 je bf733 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20240408091547.90111-4-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fe40d0681ea8..9e0d330dd5d0 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -122,6 +122,34 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 } #define arch_cmpxchg64_local arch_cmpxchg64_local +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + "call cmpxchg8b_emu", \ + "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu(ptr, oldp, new); +} +#define arch_try_cmpxchg64 arch_try_cmpxchg64 + #endif #define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) From 276b893049e4cdc2f33c009706a75ec18a114485 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:33 +0200 Subject: [PATCH 09/20] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg() to x86_32 Introduce arch_atomic64_try_cmpxchg() for 32-bit targets to use optimized target specific implementation instead of a generic one. This implementation eliminates dual-word compare after cmpxchg8b instruction and improves generated asm code from: 2273: f0 0f c7 0f lock cmpxchg8b (%edi) 2277: 8b 74 24 2c mov 0x2c(%esp),%esi 227b: 89 d3 mov %edx,%ebx 227d: 89 c2 mov %eax,%edx 227f: 89 5c 24 10 mov %ebx,0x10(%esp) 2283: 8b 7c 24 30 mov 0x30(%esp),%edi 2287: 89 44 24 1c mov %eax,0x1c(%esp) 228b: 31 f2 xor %esi,%edx 228d: 89 d0 mov %edx,%eax 228f: 89 da mov %ebx,%edx 2291: 31 fa xor %edi,%edx 2293: 09 d0 or %edx,%eax 2295: 0f 85 a5 00 00 00 jne 2340 <...> to: 2270: f0 0f c7 0f lock cmpxchg8b (%edi) 2274: 0f 85 a6 00 00 00 jne 2320 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-1-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 3486d91b8595..ec217aaf41eb 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -61,12 +61,18 @@ ATOMIC64_DECL(add_unless); #undef __ATOMIC64_DECL #undef ATOMIC64_EXPORT -static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) +static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { - return arch_cmpxchg64(&v->counter, o, n); + return arch_cmpxchg64(&v->counter, old, new); } #define arch_atomic64_cmpxchg arch_atomic64_cmpxchg +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) +{ + return arch_try_cmpxchg64(&v->counter, old, new); +} +#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg + static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { s64 o; From e73c4e34a0e9e3dfcb4e5ee4ccd3039a7b603218 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:34 +0200 
Subject: [PATCH 10/20] locking/atomic/x86: Introduce arch_atomic64_read_nonatomic() to x86_32 Introduce arch_atomic64_read_nonatomic() for 32-bit targets to load the value from atomic64_t location in a non-atomic way. This function is intended to be used in cases where a subsequent atomic operation will handle the torn value, and can be used to prime the first iteration of unconditional try_cmpxchg() loops. Suggested-by: Mark Rutland Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-2-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index ec217aaf41eb..bbdf174de110 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -14,6 +14,32 @@ typedef struct { #define ATOMIC64_INIT(val) { (val) } +/* + * Read an atomic64_t non-atomically. + * + * This is intended to be used in cases where a subsequent atomic operation + * will handle the torn value, and can be used to prime the first iteration + * of unconditional try_cmpxchg() loops, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + * + * This is NOT safe to use where the value is not always checked by a + * subsequent atomic operation, such as in conditional try_cmpxchg() loops + * that can break before the atomic operation, e.g.: + * + * s64 val = arch_atomic64_read_nonatomic(v); + * do { + * if (condition(val)) + * break; + * } while (!arch_atomic64_try_cmpxchg(v, &val, val OP i); + */ +static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) +{ + /* See comment in arch_atomic_read(). */ + return __READ_ONCE(v->counter); +} + #define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...) #ifndef ATOMIC64_EXPORT #define ATOMIC64_DECL_ONE __ATOMIC64_DECL From 95ece48165c136b96fae0f6144f55cbf8b24aeb9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:35 +0200 Subject: [PATCH 11/20] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to use arch_atomic64_try_cmpxchg(). This implementation avoids one extra trip through the CMPXCHG loop. The value preload before the cmpxchg loop does not need to be atomic. Use arch_atomic64_read_nonatomic(v) to load the value from atomic_t location in a non-atomic way. 
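[ Editorial sketch, not part of the patch: the same prime-then-loop idiom at a generic call site, using the public atomic64 API. On failure, atomic64_try_cmpxchg() refreshes 'old' with the current value, so no separate re-read or compare is needed. The helper name is hypothetical. ]

	static inline void stats_set_flags(atomic64_t *stats, s64 flags)
	{
		s64 old = atomic64_read(stats);

		do { } while (!atomic64_try_cmpxchg(stats, &old, old | flags));
	}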
The generated code improves from: 1917d5: 31 c9 xor %ecx,%ecx 1917d7: 31 db xor %ebx,%ebx 1917d9: 89 4c 24 3c mov %ecx,0x3c(%esp) 1917dd: 8b 74 24 24 mov 0x24(%esp),%esi 1917e1: 89 c8 mov %ecx,%eax 1917e3: 89 5c 24 34 mov %ebx,0x34(%esp) 1917e7: 8b 7c 24 28 mov 0x28(%esp),%edi 1917eb: 21 ce and %ecx,%esi 1917ed: 89 74 24 4c mov %esi,0x4c(%esp) 1917f1: 21 df and %ebx,%edi 1917f3: 89 de mov %ebx,%esi 1917f5: 89 7c 24 50 mov %edi,0x50(%esp) 1917f9: 8b 54 24 4c mov 0x4c(%esp),%edx 1917fd: 8b 7c 24 2c mov 0x2c(%esp),%edi 191801: 8b 4c 24 50 mov 0x50(%esp),%ecx 191805: 89 d3 mov %edx,%ebx 191807: 89 f2 mov %esi,%edx 191809: f0 0f c7 0f lock cmpxchg8b (%edi) 19180d: 89 c1 mov %eax,%ecx 19180f: 8b 74 24 34 mov 0x34(%esp),%esi 191813: 89 d3 mov %edx,%ebx 191815: 89 44 24 4c mov %eax,0x4c(%esp) 191819: 8b 44 24 3c mov 0x3c(%esp),%eax 19181d: 89 df mov %ebx,%edi 19181f: 89 54 24 44 mov %edx,0x44(%esp) 191823: 89 ca mov %ecx,%edx 191825: 31 de xor %ebx,%esi 191827: 31 c8 xor %ecx,%eax 191829: 09 f0 or %esi,%eax 19182b: 75 ac jne 1917d9 <...> to: 1912ba: 8b 06 mov (%esi),%eax 1912bc: 8b 56 04 mov 0x4(%esi),%edx 1912bf: 89 44 24 3c mov %eax,0x3c(%esp) 1912c3: 89 c1 mov %eax,%ecx 1912c5: 23 4c 24 34 and 0x34(%esp),%ecx 1912c9: 89 d3 mov %edx,%ebx 1912cb: 23 5c 24 38 and 0x38(%esp),%ebx 1912cf: 89 54 24 40 mov %edx,0x40(%esp) 1912d3: 89 4c 24 2c mov %ecx,0x2c(%esp) 1912d7: 89 5c 24 30 mov %ebx,0x30(%esp) 1912db: 8b 5c 24 2c mov 0x2c(%esp),%ebx 1912df: 8b 4c 24 30 mov 0x30(%esp),%ecx 1912e3: f0 0f c7 0e lock cmpxchg8b (%esi) 1912e7: 0f 85 f3 02 00 00 jne 1915e0 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-3-ubizjak@gmail.com --- arch/x86/include/asm/atomic64_32.h | 43 +++++++++++++----------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index bbdf174de110..40ff73b5ec55 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -227,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); } static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i)); - return old; + return val; } #define arch_atomic64_fetch_and arch_atomic64_fetch_and static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); } static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i)); - return old; + return val; } #define arch_atomic64_fetch_or arch_atomic64_fetch_or static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = 
arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); } static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i)); - return old; + return val; } #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - s64 old, c = 0; + s64 val = arch_atomic64_read_nonatomic(v); - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) - c = old; + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i)); - return old; + return val; } #define arch_atomic64_fetch_add arch_atomic64_fetch_add From 21689e4bfb9ae8f8b45279c53faecaa5a056ffa5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 10 Apr 2024 08:29:36 +0200 Subject: [PATCH 12/20] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions There is no need to implement arch_atomic_sub() family of inline functions, corresponding macros can be directly implemented using arch_atomic_add() inlines with negated argument. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240410062957.322614-4-ubizjak@gmail.com --- arch/x86/include/asm/atomic.h | 12 ++---------- arch/x86/include/asm/atomic64_64.h | 12 ++---------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 55a55ec04350..55b4d24356ea 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v) } #define arch_atomic_add_return arch_atomic_add_return -static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) -{ - return arch_atomic_add_return(-i, v); -} -#define arch_atomic_sub_return arch_atomic_sub_return +#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { @@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) } #define arch_atomic_fetch_add arch_atomic_fetch_add -static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic_fetch_sub arch_atomic_fetch_sub +#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 3165c0feedf7..ae12acae5b06 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) } #define arch_atomic64_add_return arch_atomic64_add_return -static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) -{ - return arch_atomic64_add_return(-i, v); -} -#define arch_atomic64_sub_return arch_atomic64_sub_return +#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v) static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { @@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) } #define arch_atomic64_fetch_add arch_atomic64_fetch_add -static __always_inline s64 
arch_atomic64_fetch_sub(s64 i, atomic64_t *v) -{ - return xadd(&v->counter, -i); -} -#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub +#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v) static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { From 79a34e3d8411050c3c7550c5163d6f9dc41e8f66 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 21 Mar 2024 20:52:47 +0100 Subject: [PATCH 13/20] locking/qspinlock: Use atomic_try_cmpxchg_relaxed() in xchg_tail() Use atomic_try_cmpxchg_relaxed(*ptr, &old, new) instead of atomic_cmpxchg_relaxed (*ptr, old, new) == old in xchg_tail(). x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after CMPXCHG. No functional change intended. Since this code requires NR_CPUS >= 16k, I have tested it by unconditionally setting _Q_PENDING_BITS to 1 in . Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240321195309.484275-1-ubizjak@gmail.com --- kernel/locking/qspinlock.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ebe6b8ec7cb3..1df5fef8a656 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -220,21 +220,18 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) */ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { - u32 old, new, val = atomic_read(&lock->val); + u32 old, new; - for (;;) { - new = (val & _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; /* * We can use relaxed semantics since the caller ensures that * the MCS node is properly initialized before updating the * tail. */ - old = atomic_cmpxchg_relaxed(&lock->val, val, new); - if (old == val) - break; + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - val = old; - } return old; } #endif /* _Q_PENDING_BITS == 8 */ From 6a97734f2222e0352f1900e3eb3167e9069b91bb Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 25 Mar 2024 15:09:32 +0100 Subject: [PATCH 14/20] locking/pvqspinlock: Use try_cmpxchg_acquire() in trylock_clear_pending() Replace this pattern in trylock_clear_pending(): cmpxchg_acquire(*ptr, old, new) == old ... with the simpler and faster: try_cmpxchg_acquire(*ptr, &old, new) The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after the CMPXCHG. Also change the return type of the function to bool and streamline the control flow in the _Q_PENDING_BITS == 8 variant a bit. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Reviewed-by: Linus Torvalds Link: https://lore.kernel.org/r/20240325140943.815051-1-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 31 ++++++++++++----------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 169950fe1aad..77ba80bd95f9 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -116,11 +116,12 @@ static __always_inline void set_pending(struct qspinlock *lock) * barrier. Therefore, an atomic cmpxchg_acquire() is used to acquire the * lock just to be sure that it will get it. 
*/ -static __always_inline int trylock_clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { + u16 old = _Q_PENDING_VAL; + return !READ_ONCE(lock->locked) && - (cmpxchg_acquire(&lock->locked_pending, _Q_PENDING_VAL, - _Q_LOCKED_VAL) == _Q_PENDING_VAL); + try_cmpxchg_acquire(&lock->locked_pending, &old, _Q_LOCKED_VAL); } #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) @@ -128,27 +129,21 @@ static __always_inline void set_pending(struct qspinlock *lock) atomic_or(_Q_PENDING_VAL, &lock->val); } -static __always_inline int trylock_clear_pending(struct qspinlock *lock) +static __always_inline bool trylock_clear_pending(struct qspinlock *lock) { - int val = atomic_read(&lock->val); - - for (;;) { - int old, new; - - if (val & _Q_LOCKED_MASK) - break; + int old, new; + old = atomic_read(&lock->val); + do { + if (old & _Q_LOCKED_MASK) + return false; /* * Try to clear pending bit & set locked bit */ - old = val; - new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - val = atomic_cmpxchg_acquire(&lock->val, old, new); + new = (old & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + } while (!atomic_try_cmpxchg_acquire (&lock->val, &old, new)); - if (val == old) - return 1; - } - return 0; + return true; } #endif /* _Q_PENDING_BITS == 8 */ From fea0e1820b51fff95c85518eb9cf3386f367908d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 11 Apr 2024 21:22:55 +0200 Subject: [PATCH 15/20] locking/pvqspinlock: Use try_cmpxchg() in qspinlock_paravirt.h Use try_cmpxchg(*ptr, &old, new) instead of cmpxchg(*ptr, old, new) == old in qspinlock_paravirt.h x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240411192317.25432-2-ubizjak@gmail.com --- kernel/locking/qspinlock_paravirt.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 77ba80bd95f9..f5a36e67b593 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -86,9 +86,10 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) */ for (;;) { int val = atomic_read(&lock->val); + u8 old = 0; if (!(val & _Q_LOCKED_PENDING_MASK) && - (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { + try_cmpxchg_acquire(&lock->locked, &old, _Q_LOCKED_VAL)) { lockevent_inc(pv_lock_stealing); return true; } @@ -211,8 +212,9 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + struct qspinlock *old = NULL; hopcnt++; - if (!cmpxchg(&he->lock, NULL, lock)) { + if (try_cmpxchg(&he->lock, &old, lock)) { WRITE_ONCE(he->node, node); lockevent_pv_hop(hopcnt); return &he->lock; @@ -355,7 +357,7 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; - + enum vcpu_state old = vcpu_halted; /* * If the vCPU is indeed halted, advance its state to match that of * pv_wait_node(). If OTOH this fails, the vCPU was running and will @@ -372,8 +374,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) * subsequent writes. 
*/ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&pn->state, vcpu_halted, vcpu_hashed) - != vcpu_halted) + if (!try_cmpxchg_relaxed(&pn->state, &old, vcpu_hashed)) return; /* @@ -541,15 +542,14 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #ifndef __pv_queued_spin_unlock __visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { - u8 locked; + u8 locked = _Q_LOCKED_VAL; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - locked = cmpxchg_release(&lock->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) + if (try_cmpxchg_release(&lock->locked, &locked, 0)) return; __pv_queued_spin_unlock_slowpath(lock, locked); From 91095666125a666c8f20c2323b742c53165c0325 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 12 Apr 2024 10:38:53 +0200 Subject: [PATCH 16/20] locking/pvqspinlock/x86: Remove redundant CMP after CMPXCHG in __raw_callee_save___pv_queued_spin_unlock() x86 CMPXCHG instruction returns success in the ZF flag. Remove redundant CMP instruction after CMPXCHG that performs the same check. Also update the function comment to mention the modern version of the equivalent C code. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240412083908.282802-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock_paravirt.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index ef9697f20129..466af57b8ed6 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -25,9 +25,9 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); * * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { - * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); + * u8 lockval = _Q_LOCKED_VAL; * - * if (likely(lockval == _Q_LOCKED_VAL)) + * if (try_cmpxchg(&lock->locked, &lockval, 0)) * return; * pv_queued_spin_unlock_slowpath(lock, lockval); * } @@ -43,7 +43,6 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); "mov $0x1,%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ - "cmp $0x1,%al\n\t" \ "jne .slowpath\n\t" \ "pop %rdx\n\t" \ FRAME_END \ From d26e46f6bf329cfcc469878709baa41d3bfc7cc3 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 14 Apr 2024 18:12:43 +0200 Subject: [PATCH 17/20] locking/atomic/x86: Introduce arch_try_cmpxchg64_local() Introduce arch_try_cmpxchg64_local() for 64-bit and 32-bit targets to improve code using cmpxchg64_local(). On 64-bit targets, the generated assembly improves from: 3e28: 31 c0 xor %eax,%eax 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13) 3e2f: 48 85 c0 test %rax,%rax 3e32: 0f 85 9f 00 00 00 jne 3ed7 <...> to: 3e28: 31 c0 xor %eax,%eax 3e2a: 4d 0f b1 7d 00 cmpxchg %r15,0x0(%r13) 3e2f: 0f 85 9f 00 00 00 jne 3ed4 <...> where a TEST instruction after CMPXCHG is saved. The improvements for 32-bit targets are even more noticeable, because double-word compare after CMPXCHG8B gets eliminated. 
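[ Editorial sketch, not part of the patch: the kind of caller this helps, assuming the generic try_cmpxchg64_local() wrapper layered on top of arch_try_cmpxchg64_local(). The helper and its 'slot' argument are hypothetical -- a 64-bit location only ever updated by the local CPU. ]

	static void local_counter_add(u64 *slot, u64 delta)
	{
		u64 old = *slot;

		/* On x86 the compiler can branch on ZF directly, no extra compare. */
		do { } while (!try_cmpxchg64_local(slot, &old, old + delta));
	}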
Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Waiman Long Link: https://lore.kernel.org/r/20240414161257.49145-1-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 34 +++++++++++++++++++++++++++++++ arch/x86/include/asm/cmpxchg_64.h | 6 ++++++ 2 files changed, 40 insertions(+) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 9e0d330dd5d0..9dedc13d5a77 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -64,6 +64,11 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne return __arch_try_cmpxchg64(ptr, oldp, new, LOCK_PREFIX); } +static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64(ptr, oldp, new,); +} + #ifdef CONFIG_X86_CMPXCHG64 #define arch_cmpxchg64 __cmpxchg64 @@ -72,6 +77,8 @@ static __always_inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 ne #define arch_try_cmpxchg64 __try_cmpxchg64 +#define arch_try_cmpxchg64_local __try_cmpxchg64_local + #else /* @@ -150,6 +157,33 @@ static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 } #define arch_try_cmpxchg64 arch_try_cmpxchg64 +#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \ +({ \ + union __u64_halves o = { .full = *(_oldp), }, \ + n = { .full = (_new), }; \ + bool ret; \ + \ + asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ + "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + CC_SET(e) \ + : CC_OUT(e) (ret), \ + [ptr] "+m" (*(_ptr)), \ + "+a" (o.low), "+d" (o.high) \ + : "b" (n.low), "c" (n.high), "S" (_ptr) \ + : "memory"); \ + \ + if (unlikely(!ret)) \ + *(_oldp) = o.full; \ + \ + likely(ret); \ +}) + +static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) +{ + return __arch_try_cmpxchg64_emu_local(ptr, oldp, new); +} +#define arch_try_cmpxchg64_local arch_try_cmpxchg64_local + #endif #define system_has_cmpxchg64() boot_cpu_has(X86_FEATURE_CX8) diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index c1d6cd58f809..5e241306db26 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,6 +20,12 @@ arch_try_cmpxchg((ptr), (po), (n)); \ }) +#define arch_try_cmpxchg64_local(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg_local((ptr), (po), (n)); \ +}) + union __u128_halves { u128 full; struct { From 33eb8ab4ec83cf0975d0113966c7e71cd6be60b2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 17 Apr 2024 19:58:12 +0200 Subject: [PATCH 18/20] locking/atomic/x86: Merge __arch{,_try}_cmpxchg64_emu_local() with __arch{,_try}_cmpxchg64_emu() Macros __arch{,_try}_cmpxchg64_emu() are almost identical to their local variants __arch{,_try}_cmpxchg64_emu_local(), differing only by lock prefixes. Merge these two macros by introducing additional macro parameters to pass lock location and lock prefix from their respective static inline functions. No functional change intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: "H. 
Peter Anvin" Link: https://lore.kernel.org/r/20240417175830.161561-1-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 56 ++++++------------------------- 1 file changed, 10 insertions(+), 46 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 9dedc13d5a77..ed2797f132ce 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -86,14 +86,14 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, * to simulate the cmpxchg8b on the 80386 and 80486 CPU. */ -#define __arch_cmpxchg64_emu(_ptr, _old, _new) \ +#define __arch_cmpxchg64_emu(_ptr, _old, _new, _lock_loc, _lock) \ ({ \ union __u64_halves o = { .full = (_old), }, \ n = { .full = (_new), }; \ \ - asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ : [ptr] "+m" (*(_ptr)), \ "+a" (o.low), "+d" (o.high) \ : "b" (n.low), "c" (n.high), "S" (_ptr) \ @@ -104,40 +104,25 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, static __always_inline u64 arch_cmpxchg64(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu(ptr, old, new); + return __arch_cmpxchg64_emu(ptr, old, new, LOCK_PREFIX_HERE, "lock; "); } #define arch_cmpxchg64 arch_cmpxchg64 -#define __arch_cmpxchg64_emu_local(_ptr, _old, _new) \ -({ \ - union __u64_halves o = { .full = (_old), }, \ - n = { .full = (_new), }; \ - \ - asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ - "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ - : [ptr] "+m" (*(_ptr)), \ - "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ - : "memory"); \ - \ - o.full; \ -}) - static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) { - return __arch_cmpxchg64_emu_local(ptr, old, new); + return __arch_cmpxchg64_emu(ptr, old, new, ,); } #define arch_cmpxchg64_local arch_cmpxchg64_local -#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new) \ +#define __arch_try_cmpxchg64_emu(_ptr, _oldp, _new, _lock_loc, _lock) \ ({ \ union __u64_halves o = { .full = *(_oldp), }, \ n = { .full = (_new), }; \ bool ret; \ \ - asm volatile(ALTERNATIVE(LOCK_PREFIX_HERE \ + asm volatile(ALTERNATIVE(_lock_loc \ "call cmpxchg8b_emu", \ - "lock; cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ + _lock "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ CC_SET(e) \ : CC_OUT(e) (ret), \ [ptr] "+m" (*(_ptr)), \ @@ -153,34 +138,13 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64 static __always_inline bool arch_try_cmpxchg64(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu(ptr, oldp, new); + return __arch_try_cmpxchg64_emu(ptr, oldp, new, LOCK_PREFIX_HERE, "lock; "); } #define arch_try_cmpxchg64 arch_try_cmpxchg64 -#define __arch_try_cmpxchg64_emu_local(_ptr, _oldp, _new) \ -({ \ - union __u64_halves o = { .full = *(_oldp), }, \ - n = { .full = (_new), }; \ - bool ret; \ - \ - asm volatile(ALTERNATIVE("call cmpxchg8b_emu", \ - "cmpxchg8b %[ptr]", X86_FEATURE_CX8) \ - CC_SET(e) \ - : CC_OUT(e) (ret), \ - [ptr] "+m" (*(_ptr)), \ - "+a" (o.low), "+d" (o.high) \ - : "b" (n.low), "c" (n.high), "S" (_ptr) \ - : "memory"); \ - \ - if (unlikely(!ret)) \ - *(_oldp) = o.full; \ - \ - likely(ret); \ -}) - static __always_inline bool arch_try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, u64 new) { - return __arch_try_cmpxchg64_emu_local(ptr, oldp, new); + return 
__arch_try_cmpxchg64_emu(ptr, oldp, new, ,); } #define arch_try_cmpxchg64_local arch_try_cmpxchg64_local From 94af3a04e3f386d4f060d903826e85aa006ce252 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Apr 2024 14:00:38 +0200 Subject: [PATCH 19/20] locking/qspinlock/x86: Micro-optimize virt_spin_lock() Optimize virt_spin_lock() to use simpler and faster: atomic_try_cmpxchg(*ptr, &val, new) instead of: atomic_cmpxchg(*ptr, val, new) == val The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after the CMPXCHG. Also optimize the retry loop a bit. atomic_try_cmpxchg() fails iff lock->val != 0, so there is no need to load and compare the lock value again - cpu_relax() can be unconditionally called in this case. This allows us to generate optimized: 1f: ba 01 00 00 00 mov $0x1,%edx 24: 8b 03 mov (%rbx),%eax 26: 85 c0 test %eax,%eax 28: 75 63 jne 8d <...> 2a: f0 0f b1 13 lock cmpxchg %edx,(%rbx) 2e: 75 5d jne 8d <...> ... 8d: f3 90 pause 8f: eb 93 jmp 24 <...> instead of: 1f: ba 01 00 00 00 mov $0x1,%edx 24: 8b 03 mov (%rbx),%eax 26: 85 c0 test %eax,%eax 28: 75 13 jne 3d <...> 2a: f0 0f b1 13 lock cmpxchg %edx,(%rbx) 2e: 85 c0 test %eax,%eax 30: 75 f2 jne 24 <...> ... 3d: f3 90 pause 3f: eb e3 jmp 24 <...> Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Cc: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240422120054.199092-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index cde8357bb226..a053c1293975 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -85,6 +85,8 @@ DECLARE_STATIC_KEY_TRUE(virt_spin_lock_key); #define virt_spin_lock virt_spin_lock static inline bool virt_spin_lock(struct qspinlock *lock) { + int val; + if (!static_branch_likely(&virt_spin_lock_key)) return false; @@ -94,10 +96,13 @@ static inline bool virt_spin_lock(struct qspinlock *lock) * horrible lock 'holder' preemption issues. */ - do { - while (atomic_read(&lock->val) != 0) - cpu_relax(); - } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); + __retry: + val = atomic_read(&lock->val); + + if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) { + cpu_relax(); + goto __retry; + } return true; } From 532453e7aa78f3962fb4d86caf40ff81ebf62160 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Apr 2024 17:17:35 +0200 Subject: [PATCH 20/20] locking/pvqspinlock/x86: Use _Q_LOCKED_VAL in PV_UNLOCK_ASM macro Use _Q_LOCKED_VAL instead of hardcoded $0x1 in PV_UNLOCK_ASM macro. No functional changes intended. Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar Acked-by: Boqun Feng Cc: Waiman Long Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240422151752.53997-1-ubizjak@gmail.com --- arch/x86/include/asm/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 466af57b8ed6..0a985784be9b 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -40,7 +40,7 @@ __PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define PV_UNLOCK_ASM \ FRAME_BEGIN \ "push %rdx\n\t" \ - "mov $0x1,%eax\n\t" \ + "mov $" __stringify(_Q_LOCKED_VAL) ",%eax\n\t" \ "xor %edx,%edx\n\t" \ LOCK_PREFIX "cmpxchg %dl,(%rdi)\n\t" \ "jne .slowpath\n\t" \