s390/checksum: provide vector register variant of csum_partial()
Provide a faster variant of csum_partial() which uses vector registers instead of the cksm instruction.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
This commit is contained in:
parent
3a74f44de2
commit
cb2a1dd589
|
@ -30,22 +30,7 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)
|
|||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* Computes the checksum of a memory block at buff, length len,
|
||||
* and adds in "sum" (32-bit).
|
||||
*
|
||||
* Returns a 32-bit number suitable for feeding into itself
|
||||
* or csum_tcpudp_magic.
|
||||
*
|
||||
* This function must be called with even lengths, except
|
||||
* for the last fragment, which may be odd.
|
||||
*
|
||||
* It's best to have buff aligned on a 32-bit boundary.
|
||||
*/
|
||||
/*
 * csum_partial() is implemented out of line (see csum-partial.c in this
 * series) so it can use vector registers when the vector facility is
 * available, falling back to the cksm instruction otherwise.
 *
 * NOTE(review): the previous static inline cksm()-based wrapper must be
 * removed when this external declaration is added — a static inline
 * definition and an extern declaration of the same name cannot coexist.
 */
__wsum csum_partial(const void *buff, int len, __wsum sum);
|
||||
|
||||
/*
|
||||
* Fold a partial checksum without adding pseudo headers.
|
||||
|
|
|
@ -521,6 +521,15 @@
|
|||
VMRL \vr1, \vr2, \vr3, 3
|
||||
.endm
|
||||
|
||||
/* VECTOR LOAD WITH LENGTH */
/*
 * Hand-encode the VLL instruction (opcode 0xE7 ... 0x37) for assemblers
 * without vector instruction support.
 * \v:    vector register to load
 * \gr:   GPR holding the highest byte index to load
 * \disp/\base: memory operand D(B) of the source buffer
 */
.macro VLL v, gr, disp, base
	VX_NUM v1, \v
	GR_NUM b2, \base
	GR_NUM r3, \gr
	/* first halfword: opcode high byte, V1 and R3 fields */
	.word 0xE700 | ((v1&15) << 4) | r3
	/* second halfword: base register and 12-bit displacement */
	.word (b2 << 12) | (\disp)
	/* MRXBOPC emits the RXB bits and the 0x37 opcode low byte */
	MRXBOPC 0, 0x37, v1
.endm
|
||||
|
||||
/* Vector integer instructions */
|
||||
|
||||
|
@ -534,6 +543,16 @@
|
|||
MRXBOPC 0, 0x68, v1, v2, v3
|
||||
.endm
|
||||
|
||||
/* VECTOR CHECKSUM */
/*
 * Hand-encode the VCKSM instruction (opcode 0xE7 ... 0x66) for assemblers
 * without vector instruction support.
 * \vr1: result vector register
 * \vr2, \vr3: source vector registers
 * NOTE(review): per the VCKSM definition the checksum lands in a word
 * element of \vr1 — confirm element semantics against the z/Architecture
 * Principles of Operation.
 */
.macro VCKSM vr1, vr2, vr3
	VX_NUM v1, \vr1
	VX_NUM v2, \vr2
	VX_NUM v3, \vr3
	/* first halfword: opcode high byte, V1 and V2 fields */
	.word 0xE700 | ((v1&15) << 4) | (v2&15)
	/* second halfword: V3 field */
	.word ((v3&15) << 12)
	/* MRXBOPC emits the RXB bits and the 0x66 opcode low byte */
	MRXBOPC 0, 0x66, v1, v2, v3
.endm
|
||||
|
||||
/* VECTOR EXCLUSIVE OR */
|
||||
.macro VX vr1, vr2, vr3
|
||||
VX_NUM v1, \vr1
|
||||
|
|
|
@ -108,6 +108,89 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
|
|||
: "memory");
|
||||
}
|
||||
|
||||
/*
 * VECTOR CHECKSUM: accumulate a 32-bit checksum over the elements of
 * vector register v2 together with v3, result into v1.
 * Register numbers must be compile-time constants ("I" constraints),
 * hence __always_inline.
 * NOTE(review): exact per-element semantics are defined by the VCKSM
 * instruction — confirm against the z/Architecture PoP.
 */
static __always_inline void fpu_vcksm(u8 v1, u8 v2, u8 v3)
{
	asm volatile("VCKSM %[v1],%[v2],%[v3]"
		     :
		     : [v1] "I" (v1), [v2] "I" (v2), [v3] "I" (v3)
		     : "memory");
}
|
||||
|
||||
#ifdef CONFIG_CC_IS_CLANG

/*
 * Load the 16 bytes at *vxr into vector register v1 (VECTOR LOAD).
 * The register number must be a compile-time constant ("I" constraint).
 *
 * Clang variant: the address is materialized in GPR 1 with "la" and the
 * instruction uses base register 1 explicitly — NOTE(review): presumably
 * a workaround for clang's handling of the %O/%R memory operand
 * modifiers; confirm against the toolchain requirements.
 */
static __always_inline void fpu_vl(u8 v1, const void *vxr)
{
	/* Tell KASAN/KCSAN about the 16-byte read done by the asm. */
	instrument_read(vxr, sizeof(__vector128));
	asm volatile("\n"
		     " la 1,%[vxr]\n"
		     " VL %[v1],0,,1\n"
		     :
		     : [vxr] "R" (*(__vector128 *)vxr),
		       [v1] "I" (v1)
		     : "memory", "1");
}

#else /* CONFIG_CC_IS_CLANG */

/*
 * Load the 16 bytes at *vxr into vector register v1 (VECTOR LOAD).
 * GCC variant: displacement and base are taken from the "Q" memory
 * operand via the %O/%R operand modifiers, so no scratch GPR is needed.
 */
static __always_inline void fpu_vl(u8 v1, const void *vxr)
{
	/* Tell KASAN/KCSAN about the 16-byte read done by the asm. */
	instrument_read(vxr, sizeof(__vector128));
	asm volatile("VL %[v1],%O[vxr],,%R[vxr]\n"
		     :
		     : [vxr] "Q" (*(__vector128 *)vxr),
		       [v1] "I" (v1)
		     : "memory");
}

#endif /* CONFIG_CC_IS_CLANG */
|
||||
|
||||
/*
 * Read a 32-bit element of vector register v into a GPR
 * (VECTOR LOAD GR FROM VR ELEMENT, word size).
 * @v:     vector register number (compile-time constant)
 * @index: word element index (compile-time constant, "L" constraint)
 * NOTE(review): result is returned as u64 — presumably the element is
 * zero-extended into the 64-bit GPR; confirm against the PoP.
 */
static __always_inline u64 fpu_vlgvf(u8 v, u16 index)
{
	u64 val;

	asm volatile("VLGVF %[val],%[v],%[index]"
		     : [val] "=d" (val)
		     : [v] "I" (v), [index] "L" (index)
		     : "memory");
	return val;
}
|
||||
|
||||
#ifdef CONFIG_CC_IS_CLANG

/*
 * VECTOR LOAD WITH LENGTH: load bytes from *vxr into vector register v1,
 * where @index is the highest byte index to load (i.e. index + 1 bytes).
 *
 * Clang variant: address materialized in GPR 1 via "la" — NOTE(review):
 * presumably works around clang's %O/%R operand handling; confirm.
 */
static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
{
	unsigned int size;

	/* Only index + 1 bytes are read; clamp to the 16-byte register size
	 * so instrumentation (KASAN/KCSAN) sees the true access extent. */
	size = min(index + 1, sizeof(__vector128));
	instrument_read(vxr, size);
	asm volatile("\n"
		     " la 1,%[vxr]\n"
		     " VLL %[v1],%[index],0,1\n"
		     :
		     : [vxr] "R" (*(u8 *)vxr),
		       [index] "d" (index),
		       [v1] "I" (v1)
		     : "memory", "1");
}

#else /* CONFIG_CC_IS_CLANG */

/*
 * VECTOR LOAD WITH LENGTH: load bytes from *vxr into vector register v1,
 * where @index is the highest byte index to load (i.e. index + 1 bytes).
 * GCC variant: displacement/base come from the "Q" memory operand.
 */
static __always_inline void fpu_vll(u8 v1, u32 index, const void *vxr)
{
	unsigned int size;

	/* Only index + 1 bytes are read; clamp to the 16-byte register size
	 * so instrumentation (KASAN/KCSAN) sees the true access extent. */
	size = min(index + 1, sizeof(__vector128));
	instrument_read(vxr, size);
	asm volatile("VLL %[v1],%[index],%O[vxr],%R[vxr]\n"
		     :
		     : [vxr] "Q" (*(u8 *)vxr),
		       [index] "d" (index),
		       [v1] "I" (v1)
		     : "memory");
}

#endif /* CONFIG_CC_IS_CLANG */
|
||||
|
||||
#ifdef CONFIG_CC_IS_CLANG
|
||||
|
||||
#define fpu_vlm(_v1, _v3, _vxrs) \
|
||||
|
@ -148,6 +231,14 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
|
|||
|
||||
#endif /* CONFIG_CC_IS_CLANG */
|
||||
|
||||
/*
 * Insert a 32-bit value into a word element of vector register v
 * (VECTOR LOAD VR ELEMENT FROM GR, word size).
 * @v:     vector register number (compile-time constant)
 * @val:   value to insert
 * @index: word element index (compile-time constant, "L" constraint)
 */
static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)
{
	asm volatile("VLVGF %[v],%[val],%[index]"
		     :
		     : [v] "I" (v), [val] "d" (val), [index] "L" (index)
		     : "memory");
}
|
||||
|
||||
#ifdef CONFIG_CC_IS_CLANG
|
||||
|
||||
#define fpu_vstm(_v1, _v3, _vxrs) \
|
||||
|
@ -186,5 +277,13 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
|
|||
|
||||
#endif /* CONFIG_CC_IS_CLANG */
|
||||
|
||||
/*
 * Clear all 128 bits of vector register v (VECTOR ZERO).
 * The register number must be a compile-time constant ("I" constraint).
 */
static __always_inline void fpu_vzero(u8 v)
{
	asm volatile("VZERO %[v]"
		     :
		     : [v] "I" (v)
		     : "memory");
}
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
#endif /* __ASM_S390_FPU_INSN_H */
|
||||
|
|
|
@ -32,12 +32,16 @@ struct kernel_fpu_##vxr_size { \
|
|||
__vector128 vxrs[vxr_size] __aligned(8); \
|
||||
}
|
||||
|
||||
/* Pre-instantiated save-area types for 8, 16 and 32 vector registers. */
KERNEL_FPU_STRUCT(8);
KERNEL_FPU_STRUCT(16);
KERNEL_FPU_STRUCT(32);

/*
 * Declare an on-stack vector register save area sized for vxr_size
 * registers.  NOTE(review): __uninitialized presumably opts the variable
 * out of automatic stack-variable initialization since the area is fully
 * written before use — confirm against its definition.
 */
#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name) \
	struct kernel_fpu_##vxr_size name __uninitialized

/* Convenience wrapper: save area for 8 vector registers. */
#define DECLARE_KERNEL_FPU_ONSTACK8(name) \
	DECLARE_KERNEL_FPU_ONSTACK(8, name)

/* Convenience wrapper: save area for 16 vector registers. */
#define DECLARE_KERNEL_FPU_ONSTACK16(name) \
	DECLARE_KERNEL_FPU_ONSTACK(16, name)
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#
|
||||
|
||||
lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
# csum-partial.o: out-of-line csum_partial() using vector registers
lib-y += csum-partial.o
obj-y += mem.o xor.o
lib-$(CONFIG_KPROBES) += probes.o
lib-$(CONFIG_UPROBES) += probes.o
|
||||
|
|
|
@ -0,0 +1,63 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/export.h>
|
||||
#include <asm/checksum.h>
|
||||
#include <asm/fpu.h>
|
||||
|
||||
/*
|
||||
* Computes the checksum of a memory block at buff, length len,
|
||||
* and adds in "sum" (32-bit).
|
||||
*
|
||||
* Returns a 32-bit number suitable for feeding into itself
|
||||
* or csum_tcpudp_magic.
|
||||
*
|
||||
* This function must be called with even lengths, except
|
||||
* for the last fragment, which may be odd.
|
||||
*
|
||||
* It's best to have buff aligned on a 64-bit boundary.
|
||||
*/
|
||||
__wsum csum_partial(const void *buff, int len, __wsum sum)
|
||||
{
|
||||
DECLARE_KERNEL_FPU_ONSTACK8(vxstate);
|
||||
|
||||
if (!cpu_has_vx())
|
||||
return cksm(buff, len, sum);
|
||||
kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
|
||||
fpu_vlvgf(16, (__force u32)sum, 1);
|
||||
fpu_vzero(17);
|
||||
fpu_vzero(18);
|
||||
fpu_vzero(19);
|
||||
while (len >= 64) {
|
||||
fpu_vlm(20, 23, buff);
|
||||
fpu_vcksm(16, 20, 16);
|
||||
fpu_vcksm(17, 21, 17);
|
||||
fpu_vcksm(18, 22, 18);
|
||||
fpu_vcksm(19, 23, 19);
|
||||
buff += 64;
|
||||
len -= 64;
|
||||
}
|
||||
while (len >= 32) {
|
||||
fpu_vlm(20, 21, buff);
|
||||
fpu_vcksm(16, 20, 16);
|
||||
fpu_vcksm(17, 21, 17);
|
||||
buff += 32;
|
||||
len -= 32;
|
||||
}
|
||||
while (len >= 16) {
|
||||
fpu_vl(20, buff);
|
||||
fpu_vcksm(16, 20, 16);
|
||||
buff += 16;
|
||||
len -= 16;
|
||||
}
|
||||
if (len) {
|
||||
fpu_vll(20, len - 1, buff);
|
||||
fpu_vcksm(16, 20, 16);
|
||||
}
|
||||
fpu_vcksm(18, 19, 18);
|
||||
fpu_vcksm(16, 17, 16);
|
||||
fpu_vcksm(16, 18, 16);
|
||||
sum = (__force __wsum)fpu_vlgvf(16, 1);
|
||||
kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
|
||||
return sum;
|
||||
}
|
||||
EXPORT_SYMBOL(csum_partial);
|
Loading…
Reference in New Issue