x86: improve on the non-rep 'clear_user' function

The old version was oddly written to have the repeat count in multiple
registers.  So instead of taking advantage of %rax being zero, it had
some sub-counts in it.  All just for a "single word clearing" loop,
which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same
registers we got it in (and that we should return things in).  That not
only makes this act much more like 'rep stos' (which this function is
replacing), but makes it much easier to actually do the obvious loop
unrolling.

Also rename the function from the now nonsensical 'clear_user_original'
to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a
fast fallback for it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Linus Torvalds 2023-04-16 14:06:58 -07:00
parent 577e6a7fd5
commit 8c9b6a88b7
3 changed files with 74 additions and 48 deletions

View File

@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
*/
__must_check unsigned long
clear_user_original(void __user *addr, unsigned long len);
rep_stos_alternative(void __user *addr, unsigned long len);
static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
{
@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
asm volatile(
"1:\n\t"
ALTERNATIVE("rep stosb",
"call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
"call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
"2:\n"
_ASM_EXTABLE_UA(1b, 2b)
: "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT

View File

@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
* Input:
* rdi destination
* rcx count
* rax is zero
*
* Output:
* rcx: uncleared bytes or 0 if successful.
*/
SYM_FUNC_START(clear_user_original)
/*
* Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
* i.e., no need for a 'q' suffix and thus a REX prefix.
*/
mov %ecx,%eax
shr $3,%rcx
jz .Lrest_bytes
SYM_FUNC_START(rep_stos_alternative)
cmpq $64,%rcx
jae .Lunrolled
# do the qwords first
.p2align 4
.Lqwords:
movq $0,(%rdi)
lea 8(%rdi),%rdi
dec %rcx
jnz .Lqwords
cmp $8,%ecx
jae .Lword
.Lrest_bytes:
and $7, %eax
jz .Lexit
testl %ecx,%ecx
je .Lexit
# now do the rest bytes
.Lbytes:
movb $0,(%rdi)
.Lclear_user_tail:
0: movb %al,(%rdi)
inc %rdi
dec %eax
jnz .Lbytes
dec %rcx
jnz .Lclear_user_tail
.Lexit:
RET
_ASM_EXTABLE_UA( 0b, .Lexit)
.Lword:
1: movq %rax,(%rdi)
addq $8,%rdi
sub $8,%ecx
je .Lexit
cmp $8,%ecx
jae .Lword
jmp .Lclear_user_tail
.p2align 4
.Lunrolled:
10: movq %rax,(%rdi)
11: movq %rax,8(%rdi)
12: movq %rax,16(%rdi)
13: movq %rax,24(%rdi)
14: movq %rax,32(%rdi)
15: movq %rax,40(%rdi)
16: movq %rax,48(%rdi)
17: movq %rax,56(%rdi)
addq $64,%rdi
subq $64,%rcx
cmpq $64,%rcx
jae .Lunrolled
cmpl $8,%ecx
jae .Lword
testl %ecx,%ecx
jne .Lclear_user_tail
RET
/*
* %rax still needs to be cleared in the exception case because this function is called
* from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
* in case it might reuse it somewhere.
* If we take an exception on any of the
* word stores, we know that %rcx isn't zero,
* so we can just go to the tail clearing to
* get the exact count.
*
* The unrolled case might end up clearing
* some bytes twice. Don't care.
*
* We could use the value in %rdi to avoid
* a second fault on the exact count case,
* but do we really care? No.
*
* Finally, we could try to align %rdi at the
* top of the unrolling. But unaligned stores
* just aren't that common or expensive.
*/
xor %eax,%eax
RET
.Lqwords_exception:
# convert remaining qwords back into bytes to return to caller
shl $3, %rcx
and $7, %eax
add %rax,%rcx
jmp .Lexit
.Lbytes_exception:
mov %eax,%ecx
jmp .Lexit
_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)
_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
SYM_FUNC_END(rep_stos_alternative)
EXPORT_SYMBOL(rep_stos_alternative)

View File

@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
"copy_mc_fragile_handle_tail",
"copy_mc_enhanced_fast_string",
"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
"clear_user_original",
"rep_stos_alternative",
"copy_user_generic_unrolled",
"__copy_user_nocache",
NULL