From dcd3e1de9d17dc43dfed87a9fc814b9dec508043 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Sat, 3 Feb 2024 11:45:23 +0100
Subject: [PATCH] s390/checksum: provide csum_partial_copy_nocheck()

With csum_partial(), which reads all bytes into registers, it is easy to
also implement csum_partial_copy_nocheck(), which copies the buffer while
calculating its checksum.

For a 512 byte buffer this reduces the runtime by 19%. Compared to the old
generic variant (memcpy() + cksm instruction) the runtime is reduced by
42%.

Signed-off-by: Heiko Carstens
---
 arch/s390/include/asm/checksum.h     |  3 ++
 arch/s390/include/asm/fpu-insn-asm.h | 10 +++++
 arch/s390/include/asm/fpu-insn.h     | 58 ++++++++++++++++++++++++++++
 arch/s390/lib/csum-partial.c         | 54 +++++++++++++++++++-------
 4 files changed, 112 insertions(+), 13 deletions(-)

diff --git a/arch/s390/include/asm/checksum.h b/arch/s390/include/asm/checksum.h
index 00095cc20afa..b89159591ca0 100644
--- a/arch/s390/include/asm/checksum.h
+++ b/arch/s390/include/asm/checksum.h
@@ -32,6 +32,9 @@ static inline __wsum cksm(const void *buff, int len, __wsum sum)
 
 __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+#define _HAVE_ARCH_CSUM_AND_COPY
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len);
+
 /*
  * Fold a partial checksum without adding pseudo headers.
  */
diff --git a/arch/s390/include/asm/fpu-insn-asm.h b/arch/s390/include/asm/fpu-insn-asm.h
index aaf42c513a21..02ccfe46050a 100644
--- a/arch/s390/include/asm/fpu-insn-asm.h
+++ b/arch/s390/include/asm/fpu-insn-asm.h
@@ -531,6 +531,16 @@
 	MRXBOPC	0, 0x37, v1
 .endm
 
+/* VECTOR STORE WITH LENGTH */
+.macro	VSTL	v, gr, disp, base
+	VX_NUM	v1, \v
+	GR_NUM	b2, \base
+	GR_NUM	r3, \gr
+	.word	0xE700 | ((v1&15) << 4) | r3
+	.word	(b2 << 12) | (\disp)
+	MRXBOPC	0, 0x3f, v1
+.endm
+
 /* Vector integer instructions */
 
 /* VECTOR AND */
diff --git a/arch/s390/include/asm/fpu-insn.h b/arch/s390/include/asm/fpu-insn.h
index 7e9997fa45d3..35c4fbe0bdd6 100644
--- a/arch/s390/include/asm/fpu-insn.h
+++ b/arch/s390/include/asm/fpu-insn.h
@@ -241,6 +241,64 @@ static __always_inline void fpu_vlvgf(u8 v, u32 val, u16 index)
 
 #ifdef CONFIG_CC_IS_CLANG
 
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+	instrument_write(vxr, sizeof(__vector128));
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VST	%[v1],0,,1\n"
+		: [vxr] "=R" (*(__vector128 *)vxr)
+		: [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vst(u8 v1, const void *vxr)
+{
+	instrument_write(vxr, sizeof(__vector128));
+	asm volatile("VST	%[v1],%O[vxr],,%R[vxr]\n"
+		     : [vxr] "=Q" (*(__vector128 *)vxr)
+		     : [v1] "I" (v1)
+		     : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_write(vxr, size);
+	asm volatile("\n"
+		"	la	1,%[vxr]\n"
+		"	VSTL	%[v1],%[index],0,1\n"
+		: [vxr] "=R" (*(u8 *)vxr)
+		: [index] "d" (index), [v1] "I" (v1)
+		: "memory", "1");
+}
+
+#else /* CONFIG_CC_IS_CLANG */
+
+static __always_inline void fpu_vstl(u8 v1, u32 index, const void *vxr)
+{
+	unsigned int size;
+
+	size = min(index + 1, sizeof(__vector128));
+	instrument_write(vxr, size);
+	asm volatile("VSTL	%[v1],%[index],%O[vxr],%R[vxr]\n"
+		     : [vxr] "=Q" (*(u8 *)vxr)
+		     : [index] "d" (index), [v1] "I" (v1)
+		     : "memory");
+}
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
 #define fpu_vstm(_v1, _v3, _vxrs)					\
 ({									\
 	unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);	\
diff --git a/arch/s390/lib/csum-partial.c b/arch/s390/lib/csum-partial.c
index 3ea009cbc3b7..458abd9bac70 100644
--- a/arch/s390/lib/csum-partial.c
+++ b/arch/s390/lib/csum-partial.c
@@ -5,8 +5,8 @@
 #include
 
 /*
- * Computes the checksum of a memory block at buff, length len,
- * and adds in "sum" (32-bit).
+ * Computes the checksum of a memory block at src, length len,
+ * and adds in "sum" (32-bit). If copy is true copies to dst.
  *
  * Returns a 32-bit number suitable for feeding into itself
  * or csum_tcpudp_magic.
@@ -14,43 +14,60 @@
  * This function must be called with even lengths, except
  * for the last fragment, which may be odd.
  *
- * It's best to have buff aligned on a 64-bit boundary.
+ * It's best to have src and dst aligned on a 64-bit boundary.
  */
-__wsum csum_partial(const void *buff, int len, __wsum sum)
+static __always_inline __wsum csum_copy(void *dst, const void *src, int len, __wsum sum, bool copy)
 {
 	DECLARE_KERNEL_FPU_ONSTACK8(vxstate);
 
-	if (!cpu_has_vx())
-		return cksm(buff, len, sum);
+	if (!cpu_has_vx()) {
+		if (copy)
+			memcpy(dst, src, len);
+		return cksm(dst, len, sum);
+	}
 	kernel_fpu_begin(&vxstate, KERNEL_VXR_V16V23);
 	fpu_vlvgf(16, (__force u32)sum, 1);
 	fpu_vzero(17);
 	fpu_vzero(18);
 	fpu_vzero(19);
 	while (len >= 64) {
-		fpu_vlm(20, 23, buff);
+		fpu_vlm(20, 23, src);
+		if (copy) {
+			fpu_vstm(20, 23, dst);
+			dst += 64;
+		}
 		fpu_vcksm(16, 20, 16);
 		fpu_vcksm(17, 21, 17);
 		fpu_vcksm(18, 22, 18);
 		fpu_vcksm(19, 23, 19);
-		buff += 64;
+		src += 64;
 		len -= 64;
 	}
 	while (len >= 32) {
-		fpu_vlm(20, 21, buff);
+		fpu_vlm(20, 21, src);
+		if (copy) {
+			fpu_vstm(20, 21, dst);
+			dst += 32;
+		}
 		fpu_vcksm(16, 20, 16);
 		fpu_vcksm(17, 21, 17);
-		buff += 32;
+		src += 32;
 		len -= 32;
 	}
 	while (len >= 16) {
-		fpu_vl(20, buff);
+		fpu_vl(20, src);
+		if (copy) {
+			fpu_vst(20, dst);
+			dst += 16;
+		}
 		fpu_vcksm(16, 20, 16);
-		buff += 16;
+		src += 16;
 		len -= 16;
 	}
 	if (len) {
-		fpu_vll(20, len - 1, buff);
+		fpu_vll(20, len - 1, src);
+		if (copy)
+			fpu_vstl(20, len - 1, dst);
 		fpu_vcksm(16, 20, 16);
 	}
 	fpu_vcksm(18, 19, 18);
@@ -60,4 +77,15 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
 	kernel_fpu_end(&vxstate, KERNEL_VXR_V16V23);
 	return sum;
 }
+
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	return csum_copy(NULL, buff, len, sum, false);
+}
 EXPORT_SYMBOL(csum_partial);
+
+__wsum csum_partial_copy_nocheck(const void *src, void *dst, int len)
+{
+	return csum_copy(dst, src, len, 0, true);
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
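
A minimal usage sketch, not part of the patch itself and purely illustrative: it checks that the combined copy-and-checksum helper yields the same folded checksum as a plain memcpy() followed by csum_partial(), assuming kernel context and the declarations added to arch/s390/include/asm/checksum.h above. The function name csum_copy_selftest and its buffers are hypothetical.

#include <linux/string.h>
#include <linux/types.h>
#include <asm/checksum.h>

/*
 * Hypothetical self-check (illustrative only): the combined helper must
 * produce the same checksum as copying first and checksumming afterwards.
 */
static bool csum_copy_selftest(const void *src, void *dst, int len)
{
	__wsum combined, separate;

	/* New helper: copies src to dst and checksums it in one pass. */
	combined = csum_partial_copy_nocheck(src, dst, len);

	/* Reference path: plain copy followed by a separate checksum pass. */
	memcpy(dst, src, len);
	separate = csum_partial(dst, len, 0);

	/* Both must fold to the same 16-bit checksum. */
	return csum_fold(combined) == csum_fold(separate);
}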