LoongArch: Add subword xchg/cmpxchg emulation

LoongArch natively supports only 32-bit/64-bit xchg/cmpxchg. But percpu
operations, qspinlock and some drivers need 8-bit/16-bit xchg/cmpxchg. We
add subword xchg/cmpxchg emulation in this patch because the emulation
has better performance than the generic implementation (on NUMA systems),
and it also fixes some build errors [1].

LoongArch's guarantee for forward progress (avoid many ll/sc happening
at the same time and no one succeeds):

We have the "exclusive access (with timeout) of ll" feature to avoid
simultaneous ll (which also blocks other memory loads/stores on the same
address), and the "random delay of sc" feature to avoid simultaneous
sc. Implementing these features is mandatory for multi-core LoongArch
processors; the only exceptions are single-core and dual-core
processors (which also don't support multi-chip interconnection).

Feature bits are introduced in CPUCFG3, bit 3 and bit 4 [2].

[1] https://lore.kernel.org/loongarch/CAAhV-H6vvkuOzy8OemWdYK3taj5Jn3bFX0ZTwE=twM8ywpBUYA@mail.gmail.com/T/#t
[2] https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#_cpucfg

Reported-by: Sudip Mukherjee (Codethink) <sudipm.mukherjee@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Rui Wang <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
This commit is contained in:
Huacai Chen 2022-08-25 19:34:59 +08:00
parent 092e9ebe52
commit 720dc7ab25
2 changed files with 105 additions and 1 deletions

View file

@ -5,8 +5,9 @@
#ifndef __ASM_CMPXCHG_H
#define __ASM_CMPXCHG_H
#include <asm/barrier.h>
#include <linux/bits.h>
#include <linux/build_bug.h>
#include <asm/barrier.h>
#define __xchg_asm(amswap_db, m, val) \
({ \
@ -21,10 +22,53 @@
__ret; \
})
/*
 * __xchg_small() - exchange a sub-word value using a 32-bit ll.w/sc.w loop.
 * @ptr:  address of the value to exchange
 * @val:  new value; bits above @size bytes are masked off
 * @size: operand width in bytes (the callers visible here pass 1 or 2)
 *
 * The exchange is performed on the naturally aligned 32-bit word that
 * contains @ptr: the selected bits are replaced with @val and the rest of
 * the word is stored back as it was loaded.
 *
 * Returns the previous contents of the selected bits, shifted down to bit 0.
 *
 * NOTE(review): mask is a u32, so a 2-byte operand at byte offset 3 would
 * have part of its mask shifted out of the word. Presumably callers only
 * pass naturally aligned pointers - confirm.
 */
static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
unsigned int size)
{
unsigned int shift;
u32 old32, mask, temp;
volatile u32 *ptr32;
/* Mask value to the correct size. */
mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
val &= mask;
/*
 * Calculate a shift & mask that correspond to the value we wish to
 * exchange within the naturally aligned 4 byte integer that includes
 * it. The byte offset inside the word (low two address bits) is turned
 * into a bit offset, so offset 0 maps to the lowest bits of the word.
 */
shift = (unsigned long)ptr & 0x3;
shift *= BITS_PER_BYTE;
mask <<= shift;
/*
 * Calculate a pointer to the naturally aligned 4 byte integer that
 * includes our byte of interest, and load its value.
 */
ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
/*
 * Operands: %0 = old32, %1 = temp, %2/%3 = *ptr32 (output/input),
 * %4 = mask, %5 = val << shift.
 *
 * ll.w loads the whole word into old32; andn clears the selected bits
 * in a copy held in temp; or inserts the new value; sc.w tries to store
 * temp back. The loop restarts from the ll.w while sc.w leaves zero in
 * temp, i.e. while the conditional store has not succeeded.
 */
asm volatile (
"1: ll.w %0, %3 \n"
" andn %1, %0, %z4 \n"
" or %1, %1, %z5 \n"
" sc.w %1, %2 \n"
" beqz %1, 1b \n"
: "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
: "ZC" (*ptr32), "Jr" (mask), "Jr" (val << shift)
: "memory");
/* Extract the old sub-word value from the word that was loaded. */
return (old32 & mask) >> shift;
}
static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
int size)
{
switch (size) {
case 1:
case 2:
return __xchg_small(ptr, x, size);
case 4:
return __xchg_asm("amswap_db.w", (volatile u32 *)ptr, (u32)x);
@ -67,10 +111,62 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
__ret; \
})
/*
 * __cmpxchg_small() - compare-and-exchange a sub-word value using a 32-bit
 * ll.w/sc.w loop.
 * @ptr:  address of the value to operate on
 * @old:  expected current value; bits above @size bytes are masked off
 * @new:  replacement value; bits above @size bytes are masked off
 * @size: operand width in bytes (the caller visible here passes 1 or 2)
 *
 * Works on the naturally aligned 32-bit word that contains @ptr. If the
 * selected bits of that word equal @old they are replaced with @new and the
 * rest of the word is stored back as loaded; otherwise nothing is stored.
 *
 * Returns the selected bits as loaded, shifted down to bit 0, whether or not
 * the store took place.
 *
 * NOTE(review): mask is a u32, so a 2-byte operand at byte offset 3 would
 * have part of its mask shifted out of the word. Presumably callers only
 * pass naturally aligned pointers - confirm.
 */
static inline unsigned int __cmpxchg_small(volatile void *ptr, unsigned int old,
unsigned int new, unsigned int size)
{
unsigned int shift;
u32 old32, mask, temp;
volatile u32 *ptr32;
/* Mask inputs to the correct size. */
mask = GENMASK((size * BITS_PER_BYTE) - 1, 0);
old &= mask;
new &= mask;
/*
 * Calculate a shift & mask that correspond to the value we wish to
 * compare & exchange within the naturally aligned 4 byte integer
 * that includes it. old and new are moved to the same bit position so
 * they can be compared with, and merged into, the full word directly.
 */
shift = (unsigned long)ptr & 0x3;
shift *= BITS_PER_BYTE;
old <<= shift;
new <<= shift;
mask <<= shift;
/*
 * Calculate a pointer to the naturally aligned 4 byte integer that
 * includes our byte of interest, and load its value.
 */
ptr32 = (volatile u32 *)((unsigned long)ptr & ~0x3);
/*
 * Operands: %0 = old32, %1 = temp, %2/%3 = *ptr32 (output/input),
 * %4 = mask, %5 = old, %6 = new (the last two already shifted).
 *
 * ll.w loads the whole word; and isolates the selected bits, and bne
 * leaves for label 2 when they differ from old. Otherwise andn clears
 * the selected bits, or inserts new, and sc.w tries to store the
 * result; the loop restarts from the ll.w while sc.w leaves zero in
 * temp. After a store the code branches over label 2 to label 3.
 *
 * NOTE(review): __WEAK_LLSC_MB is defined outside this view; it is
 * emitted only on the compare-mismatch path and is presumably a memory
 * barrier - confirm against its definition.
 */
asm volatile (
"1: ll.w %0, %3 \n"
" and %1, %0, %z4 \n"
" bne %1, %z5, 2f \n"
" andn %1, %0, %z4 \n"
" or %1, %1, %z6 \n"
" sc.w %1, %2 \n"
" beqz %1, 1b \n"
" b 3f \n"
"2: \n"
__WEAK_LLSC_MB
"3: \n"
: "=&r" (old32), "=&r" (temp), "=ZC" (*ptr32)
: "ZC" (*ptr32), "Jr" (mask), "Jr" (old), "Jr" (new)
: "memory");
/* Extract the observed sub-word value from the word that was loaded. */
return (old32 & mask) >> shift;
}
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
unsigned long new, unsigned int size)
{
switch (size) {
case 1:
case 2:
return __cmpxchg_small(ptr, old, new, size);
case 4:
return __cmpxchg_asm("ll.w", "sc.w", (volatile u32 *)ptr,
(u32)old, new);

View file

@ -123,6 +123,10 @@ static inline unsigned long __percpu_xchg(void *ptr, unsigned long val,
int size)
{
switch (size) {
case 1:
case 2:
return __xchg_small((volatile void *)ptr, val, size);
case 4:
return __xchg_asm("amswap.w", (volatile u32 *)ptr, (u32)val);
@ -204,9 +208,13 @@ do { \
#define this_cpu_write_4(pcp, val) _percpu_write(pcp, val)
#define this_cpu_write_8(pcp, val) _percpu_write(pcp, val)
/* Per-size this_cpu_xchg variants (1, 2, 4 and 8 bytes) all expand to _percpu_xchg(). */
#define this_cpu_xchg_1(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_2(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_4(pcp, val) _percpu_xchg(pcp, val)
#define this_cpu_xchg_8(pcp, val) _percpu_xchg(pcp, val)
/* Per-size this_cpu_cmpxchg variants (1, 2, 4 and 8 bytes) all expand to _protect_cmpxchg_local(). */
#define this_cpu_cmpxchg_1(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_2(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_4(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)
#define this_cpu_cmpxchg_8(ptr, o, n) _protect_cmpxchg_local(ptr, o, n)