amd64: switch csum_partial_copy_generic() to new calling conventions

... and fold handling of misaligned case into it.

Implementation note: we stash the "will we need to rol8 the sum in the end"
flag into the MSB of %rcx (the lower 32 bits are used for length); the rest
is pretty straightforward.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
Al Viro 2020-07-19 21:56:07 -04:00
parent fdf8bee96f
commit daf52375c1
3 changed files with 94 additions and 123 deletions

View File

@ -130,10 +130,7 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
/* Do not call this directly. Use the wrappers below */
extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst,
int len, __wsum sum,
int *src_err_ptr, int *dst_err_ptr);
extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);

View File

@ -18,9 +18,6 @@
* rdi source
* rsi destination
* edx len (32bit)
* ecx sum (32bit)
* r8 src_err_ptr (int)
* r9 dst_err_ptr (int)
*
* Output
* eax 64bit sum. undefined in case of exception.
@ -31,44 +28,32 @@
.macro source
10:
_ASM_EXTABLE_UA(10b, .Lbad_source)
_ASM_EXTABLE_UA(10b, .Lfault)
.endm
.macro dest
20:
_ASM_EXTABLE_UA(20b, .Lbad_dest)
_ASM_EXTABLE_UA(20b, .Lfault)
.endm
/*
* No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
* potentially unmapped kernel address.
*/
.macro ignore L=.Lignore
30:
_ASM_EXTABLE(30b, \L)
.endm
SYM_FUNC_START(csum_partial_copy_generic)
cmpl $3*64, %edx
jle .Lignore
.Lignore:
subq $7*8, %rsp
movq %rbx, 2*8(%rsp)
movq %r12, 3*8(%rsp)
movq %r14, 4*8(%rsp)
movq %r13, 5*8(%rsp)
movq %r15, 6*8(%rsp)
movq %r8, (%rsp)
movq %r9, 1*8(%rsp)
movl %ecx, %eax
movl %edx, %ecx
subq $5*8, %rsp
movq %rbx, 0*8(%rsp)
movq %r12, 1*8(%rsp)
movq %r14, 2*8(%rsp)
movq %r13, 3*8(%rsp)
movq %r15, 4*8(%rsp)
movl $-1, %eax
xorl %r9d, %r9d
movq %rcx, %r12
movl %edx, %ecx
cmpl $8, %ecx
jb .Lshort
testb $7, %sil
jne .Lunaligned
.Laligned:
movl %ecx, %r12d
shrq $6, %r12
jz .Lhandle_tail /* < 64 */
@ -99,7 +84,12 @@ SYM_FUNC_START(csum_partial_copy_generic)
source
movq 56(%rdi), %r13
ignore 2f
30:
/*
* No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
* potentially unmapped kernel address.
*/
_ASM_EXTABLE(30b, 2f)
prefetcht0 5*64(%rdi)
2:
adcq %rbx, %rax
@ -131,8 +121,6 @@ SYM_FUNC_START(csum_partial_copy_generic)
dest
movq %r13, 56(%rsi)
3:
leaq 64(%rdi), %rdi
leaq 64(%rsi), %rsi
@ -142,8 +130,8 @@ SYM_FUNC_START(csum_partial_copy_generic)
/* do last up to 56 bytes */
.Lhandle_tail:
/* ecx: count */
movl %ecx, %r10d
/* ecx: count, rcx.63: the end result needs to be rol8 */
movq %rcx, %r10
andl $63, %ecx
shrl $3, %ecx
jz .Lfold
@ -172,6 +160,7 @@ SYM_FUNC_START(csum_partial_copy_generic)
.Lhandle_7:
movl %r10d, %ecx
andl $7, %ecx
.L1: /* .Lshort rejoins the common path here */
shrl $1, %ecx
jz .Lhandle_1
movl $2, %edx
@ -203,26 +192,65 @@ SYM_FUNC_START(csum_partial_copy_generic)
adcl %r9d, %eax /* carry */
.Lende:
movq 2*8(%rsp), %rbx
movq 3*8(%rsp), %r12
movq 4*8(%rsp), %r14
movq 5*8(%rsp), %r13
movq 6*8(%rsp), %r15
addq $7*8, %rsp
testq %r10, %r10
js .Lwas_odd
.Lout:
movq 0*8(%rsp), %rbx
movq 1*8(%rsp), %r12
movq 2*8(%rsp), %r14
movq 3*8(%rsp), %r13
movq 4*8(%rsp), %r15
addq $5*8, %rsp
ret
.Lshort:
movl %ecx, %r10d
jmp .L1
.Lunaligned:
xorl %ebx, %ebx
testb $1, %sil
jne .Lodd
1: testb $2, %sil
je 2f
source
movw (%rdi), %bx
dest
movw %bx, (%rsi)
leaq 2(%rdi), %rdi
subq $2, %rcx
leaq 2(%rsi), %rsi
addq %rbx, %rax
2: testb $4, %sil
je .Laligned
source
movl (%rdi), %ebx
dest
movl %ebx, (%rsi)
leaq 4(%rdi), %rdi
subq $4, %rcx
leaq 4(%rsi), %rsi
addq %rbx, %rax
jmp .Laligned
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
movq (%rsp), %rax
testq %rax, %rax
jz .Lende
movl $-EFAULT, (%rax)
jmp .Lende
.Lodd:
source
movb (%rdi), %bl
dest
movb %bl, (%rsi)
leaq 1(%rdi), %rdi
leaq 1(%rsi), %rsi
/* decrement, set MSB */
leaq -1(%rcx, %rcx), %rcx
rorq $1, %rcx
shll $8, %ebx
addq %rbx, %rax
jmp 1b
.Lbad_dest:
movq 8(%rsp), %rax
testq %rax, %rax
jz .Lende
movl $-EFAULT, (%rax)
jmp .Lende
.Lwas_odd:
roll $8, %eax
jmp .Lout
/* Exception: just return 0 */
.Lfault:
xorl %eax, %eax
jmp .Lout
SYM_FUNC_END(csum_partial_copy_generic)

View File

@ -21,49 +21,16 @@
* src and dst are best aligned to 64bits.
*/
__wsum
csum_and_copy_from_user(const void __user *src, void *dst,
int len)
csum_and_copy_from_user(const void __user *src, void *dst, int len)
{
int err = 0;
__wsum isum = ~0U;
__wsum sum;
might_sleep();
if (!user_access_begin(src, len))
return 0;
/*
* Why 6, not 7? To handle odd addresses aligned we
* would need to do considerable complications to fix the
* checksum which is defined as an 16bit accumulator. The
* fix alignment code is primarily for performance
* compatibility with 32bit and that will handle odd
* addresses slowly too.
*/
if (unlikely((unsigned long)src & 6)) {
while (((unsigned long)src & 6) && len >= 2) {
__u16 val16;
unsafe_get_user(val16, (const __u16 __user *)src, out);
*(__u16 *)dst = val16;
isum = (__force __wsum)add32_with_carry(
(__force unsigned)isum, val16);
src += 2;
dst += 2;
len -= 2;
}
}
isum = csum_partial_copy_generic((__force const void *)src,
dst, len, isum, &err, NULL);
sum = csum_partial_copy_generic((__force const void *)src, dst, len);
user_access_end();
if (unlikely(err))
isum = 0;
return isum;
out:
user_access_end();
return 0;
return sum;
}
EXPORT_SYMBOL(csum_and_copy_from_user);
@ -79,37 +46,16 @@ EXPORT_SYMBOL(csum_and_copy_from_user);
* src and dst are best aligned to 64bits.
*/
__wsum
csum_and_copy_to_user(const void *src, void __user *dst,
int len)
csum_and_copy_to_user(const void *src, void __user *dst, int len)
{
__wsum ret, isum = ~0U;
int err = 0;
__wsum sum;
might_sleep();
if (!user_access_begin(dst, len))
return 0;
if (unlikely((unsigned long)dst & 6)) {
while (((unsigned long)dst & 6) && len >= 2) {
__u16 val16 = *(__u16 *)src;
isum = (__force __wsum)add32_with_carry(
(__force unsigned)isum, val16);
unsafe_put_user(val16, (__u16 __user *)dst, out);
src += 2;
dst += 2;
len -= 2;
}
}
ret = csum_partial_copy_generic(src, (void __force *)dst,
len, isum, NULL, &err);
sum = csum_partial_copy_generic(src, (void __force *)dst, len);
user_access_end();
return err ? 0 : ret;
out:
user_access_end();
return 0;
return sum;
}
EXPORT_SYMBOL(csum_and_copy_to_user);
@ -125,7 +71,7 @@ EXPORT_SYMBOL(csum_and_copy_to_user);
__wsum
csum_partial_copy_nocheck(const void *src, void *dst, int len)
{
return csum_partial_copy_generic(src, dst, len, 0, NULL, NULL);
return csum_partial_copy_generic(src, dst, len);
}
EXPORT_SYMBOL(csum_partial_copy_nocheck);