mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-10-06 16:49:22 +00:00
x86: bring back rep movsq for user access on CPUs without ERMS
[ Upstream commit ca96b162bf
] Intel CPUs have shipped with ERMS for over a decade, but this is not true for AMD. In particular one reasonably recent uarch (EPYC 7R13) does not have it (or at least the bit is inactive when running on the Amazon EC2 cloud -- I found rather conflicting information about AMD CPUs vs the extension). Hand-rolled mov loops executing in this case are quite pessimal compared to rep movsq for bigger sizes. While the upper limit depends on uarch, everyone is well south of 1KB AFAICS and sizes bigger than that are common. While technically ancient CPUs may be suffering from rep usage, gcc has been emitting it for years all over kernel code, so I don't think this is a legitimate concern. Sample result from read1_processes from will-it-scale (4KB reads/s): before: 1507021 after: 1721828 (+14%) Note that the cutoff point for rep usage is set to 64 bytes, which is way too conservative but I'm sticking to what was done in 47ee3f1dd9
("x86: re-introduce support for ERMS copies for user space accesses"). That is to say *some* copies will now go slower, which is fixable but beyond the scope of this patch. Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
parent
f4b62612b4
commit
5618ae952d
2 changed files with 14 additions and 45 deletions
|
@ -116,7 +116,7 @@ copy_user_generic(void *to, const void *from, unsigned long len)
|
||||||
"2:\n"
|
"2:\n"
|
||||||
_ASM_EXTABLE_UA(1b, 2b)
|
_ASM_EXTABLE_UA(1b, 2b)
|
||||||
:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
|
:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
|
||||||
: : "memory", "rax", "r8", "r9", "r10", "r11");
|
: : "memory", "rax");
|
||||||
clac();
|
clac();
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,7 +27,7 @@
|
||||||
* NOTE! The calling convention is very intentionally the same as
|
* NOTE! The calling convention is very intentionally the same as
|
||||||
* for 'rep movs', so that we can rewrite the function call with
|
* for 'rep movs', so that we can rewrite the function call with
|
||||||
* just a plain 'rep movs' on machines that have FSRM. But to make
|
* just a plain 'rep movs' on machines that have FSRM. But to make
|
||||||
* it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
|
* it simpler for us, we can clobber rsi/rdi and rax freely.
|
||||||
*/
|
*/
|
||||||
SYM_FUNC_START(rep_movs_alternative)
|
SYM_FUNC_START(rep_movs_alternative)
|
||||||
cmpq $64,%rcx
|
cmpq $64,%rcx
|
||||||
|
@ -68,55 +68,24 @@ SYM_FUNC_START(rep_movs_alternative)
|
||||||
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
|
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
|
||||||
|
|
||||||
.Llarge:
|
.Llarge:
|
||||||
0: ALTERNATIVE "jmp .Lunrolled", "rep movsb", X86_FEATURE_ERMS
|
0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
|
||||||
1: RET
|
1: RET
|
||||||
|
|
||||||
_ASM_EXTABLE_UA( 0b, 1b)
|
_ASM_EXTABLE_UA( 0b, 1b)
|
||||||
|
|
||||||
.p2align 4
|
.Llarge_movsq:
|
||||||
.Lunrolled:
|
movq %rcx,%rax
|
||||||
10: movq (%rsi),%r8
|
shrq $3,%rcx
|
||||||
11: movq 8(%rsi),%r9
|
andl $7,%eax
|
||||||
12: movq 16(%rsi),%r10
|
0: rep movsq
|
||||||
13: movq 24(%rsi),%r11
|
movl %eax,%ecx
|
||||||
14: movq %r8,(%rdi)
|
|
||||||
15: movq %r9,8(%rdi)
|
|
||||||
16: movq %r10,16(%rdi)
|
|
||||||
17: movq %r11,24(%rdi)
|
|
||||||
20: movq 32(%rsi),%r8
|
|
||||||
21: movq 40(%rsi),%r9
|
|
||||||
22: movq 48(%rsi),%r10
|
|
||||||
23: movq 56(%rsi),%r11
|
|
||||||
24: movq %r8,32(%rdi)
|
|
||||||
25: movq %r9,40(%rdi)
|
|
||||||
26: movq %r10,48(%rdi)
|
|
||||||
27: movq %r11,56(%rdi)
|
|
||||||
addq $64,%rsi
|
|
||||||
addq $64,%rdi
|
|
||||||
subq $64,%rcx
|
|
||||||
cmpq $64,%rcx
|
|
||||||
jae .Lunrolled
|
|
||||||
cmpl $8,%ecx
|
|
||||||
jae .Lword
|
|
||||||
testl %ecx,%ecx
|
testl %ecx,%ecx
|
||||||
jne .Lcopy_user_tail
|
jne .Lcopy_user_tail
|
||||||
RET
|
RET
|
||||||
|
|
||||||
_ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
|
1: leaq (%rax,%rcx,8),%rcx
|
||||||
_ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
|
jmp .Lcopy_user_tail
|
||||||
_ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
|
_ASM_EXTABLE_UA( 0b, 1b)
|
||||||
_ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
|
|
||||||
_ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
|
|
||||||
SYM_FUNC_END(rep_movs_alternative)
|
SYM_FUNC_END(rep_movs_alternative)
|
||||||
EXPORT_SYMBOL(rep_movs_alternative)
|
EXPORT_SYMBOL(rep_movs_alternative)
|
||||||
|
|
Loading…
Reference in a new issue