2019-05-19 12:08:55 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2006-09-26 08:52:32 +00:00
|
|
|
#include <linux/linkage.h>
|
x86/clear_user: Make it faster
Based on a patch by Mark Hemment <markhemm@googlemail.com> and
incorporating very sane suggestions from Linus.
The point here is to have the default case with FSRM - which is supposed
to be the majority of x86 hw out there - if not now then soon - be
directly inlined into the instruction stream so that no function call
overhead is taking place.
Drop the early clobbers from the @size and @addr operands as those are
not needed anymore since we have single instruction alternatives.
The benchmarks I ran would show very small improvements and a PF
benchmark would even show weird things like slowdowns with higher core
counts.
So for a ~6m running the git test suite, the function gets called under
700K times, all from padzero():
<...>-2536 [006] ..... 261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900
<...>-2536 [006] ..... 261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160
<...>-2537 [008] ..... 261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850
<...>-2537 [008] ..... 261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900
...
which is around 1%-ish of the total time and which is consistent with
the benchmark numbers.
So Mel gave me the idea to simply measure how fast the function becomes.
I.e.:
start = rdtsc_ordered();
ret = __clear_user(to, n);
end = rdtsc_ordered();
Computing the mean average of all the samples collected during the test
suite run then shows some improvement:
clear_user_original:
Amean: 9219.71 (Sum: 6340154910, samples: 687674)
fsrm:
Amean: 8030.63 (Sum: 5522277720, samples: 687652)
That's on Zen3.
The situation looks a lot more confusing on Intel:
Icelake:
clear_user_original:
Amean: 19679.4 (Sum: 13652560764, samples: 693750)
Amean: 19743.7 (Sum: 13693470604, samples: 693562)
(I ran it twice just to be sure.)
ERMS:
Amean: 20374.3 (Sum: 13910601024, samples: 682752)
Amean: 20453.7 (Sum: 14186223606, samples: 693576)
FSRM:
Amean: 20458.2 (Sum: 13918381386, samples: 680331)
The original microbenchmark which people were complaining about:
for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=65536; done 2>&1 | grep copied
32207011840 bytes (32 GB, 30 GiB) copied, 1 s, 32.2 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.93069 s, 35.6 GB/s
37597741056 bytes (38 GB, 35 GiB) copied, 1 s, 37.6 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.78017 s, 38.6 GB/s
62020124672 bytes (62 GB, 58 GiB) copied, 2 s, 31.0 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 2.13716 s, 32.2 GB/s
60010004480 bytes (60 GB, 56 GiB) copied, 1 s, 60.0 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.14129 s, 60.2 GB/s
53212086272 bytes (53 GB, 50 GiB) copied, 1 s, 53.2 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.28398 s, 53.5 GB/s
55698259968 bytes (56 GB, 52 GiB) copied, 1 s, 55.7 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.22507 s, 56.1 GB/s
55306092544 bytes (55 GB, 52 GiB) copied, 1 s, 55.3 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.23647 s, 55.6 GB/s
54387539968 bytes (54 GB, 51 GiB) copied, 1 s, 54.4 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.25693 s, 54.7 GB/s
50566529024 bytes (51 GB, 47 GiB) copied, 1 s, 50.6 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.35096 s, 50.9 GB/s
58308165632 bytes (58 GB, 54 GiB) copied, 1 s, 58.3 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.17394 s, 58.5 GB/s
Now the same thing with smaller buffers:
for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=8192; done 2>&1 | grep copied
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28485 s, 30.2 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276112 s, 31.1 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.29136 s, 29.5 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.283803 s, 30.3 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.306503 s, 28.0 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.349169 s, 24.6 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276912 s, 31.0 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.265356 s, 32.4 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28464 s, 30.2 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.242998 s, 35.3 GB/s
is also not conclusive because it all depends on the buffer sizes,
their alignments and when the microcode detects that cachelines can be
aggregated properly and copied in bigger sizes.
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@mail.gmail.com
2022-05-24 09:01:18 +00:00
|
|
|
#include <asm/asm.h>
|
2016-01-11 16:04:34 +00:00
|
|
|
#include <asm/export.h>
|
2006-09-26 08:52:32 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
x86/lib/clear_page_64.S: Convert to ALTERNATIVE_2 macro
Move clear_page() up so that we can get 2-byte forward JMPs when
patching:
apply_alternatives: feat: 3*32+16, old: (ffffffff8130adb0, len: 5), repl: (ffffffff81d0b859, len: 5)
ffffffff8130adb0: alt_insn: 90 90 90 90 90
recompute_jump: new_displ: 0x0000003e
ffffffff81d0b859: rpl_insn: eb 3e 66 66 90
even though the compiler generated 5-byte JMPs which we padded with 5
NOPs.
Also, make the REP_GOOD version be the default as the majority of
machines set REP_GOOD. This way we get to save ourselves the JMP:
old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_REP_GOOD, size: 5, padlen: 0
clear_page:
ffffffff813038b0 <clear_page>:
ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0
repl insn: 0xffffffff81cf0e92, size: 0
old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_ERMS, size: 5, padlen: 0
clear_page:
ffffffff813038b0 <clear_page>:
ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0
repl insn: 0xffffffff81cf0e92, size: 5
ffffffff81cf0e92: e9 69 2a 61 ff jmpq ffffffff81303900
ffffffff813038b0 <clear_page>:
ffffffff813038b0: e9 69 2a 61 ff jmpq ffffffff8091631e
Signed-off-by: Borislav Petkov <bp@suse.de>
2015-01-18 11:57:41 +00:00
|
|
|
* Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
|
|
|
|
* recommended to use this when possible and we do use them by default.
|
|
|
|
* If enhanced REP MOVSB/STOSB is not available, try to use fast string.
|
|
|
|
* Otherwise, use original.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * clear_page_rep - zero one page (4096 bytes) with a single REP STOSQ.
 *
 * Input:
 *   %rdi - page
 *
 * The string store consumes %rcx and moves %rdi, and the xor rewrites the
 * flags, so %rax, %rcx, %rdi and the flags are all modified on return.
 */
SYM_FUNC_START(clear_page_rep)
	xorl	%eax,%eax		/* fill value; a 32-bit write also clears the upper half of %rax */
	movl	$4096/8,%ecx		/* one page = 4096/8 quadword stores */
	rep stosq
	RET
SYM_FUNC_END(clear_page_rep)
|
x86/asm: Optimize clear_page()
Currently, we CALL clear_page() which then JMPs to the proper function
chosen by the alternatives.
What we should do instead is CALL the proper function directly. (This
was something Ingo suggested a while ago). So let's do that.
Measuring our favourite kernel build workload shows that there are no
significant changes in performance.
AMD
===
-- /tmp/before 2017-02-09 18:01:46.451961188 +0100
++ /tmp/after 2017-02-09 18:01:54.883961175 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% )
+ 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% )
- 518,744 context-switches # 0.504 K/sec ( +- 1.04% )
+ 518,254 context-switches # 0.507 K/sec ( +- 1.01% )
- 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% )
+ 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% )
- 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% )
+ 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% )
- 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%)
+ 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%)
- 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%)
+ 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%)
- 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%)
+ 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%)
- 1,738,364,431,659 instructions # 0.85 insn per cycle
+ 1,739,895,771,901 instructions # 0.85 insn per cycle
- # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%)
+ # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%)
- 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%)
+ 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%)
- 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%)
+ 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%)
- 171.480741921 seconds time elapsed ( +- 1.41% )
+ 170.509229451 seconds time elapsed ( +- 1.20% )
Intel
=====
-- /tmp/before 2017-02-09 20:36:19.851947473 +0100
++ /tmp/after 2017-02-09 20:36:30.151947458 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% )
+ 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% )
- 899,342 context-switches # 0.407 K/sec ( +- 0.68% )
+ 898,381 context-switches # 0.406 K/sec ( +- 0.79% )
- 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% )
+ 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% )
- 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% )
+ 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% )
- 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%)
+ 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%)
- 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%)
+ 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%)
- 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%)
+ 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%)
- 4,148,398,171,079 instructions # 0.62 insn per cycle
+ 4,148,997,156,059 instructions # 0.62 insn per cycle
- # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%)
+ # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%)
- 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%)
+ 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%)
- 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%)
+ 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%)
- 275.904405540 seconds time elapsed ( +- 0.69% )
+ 276.660352016 seconds time elapsed ( +- 0.73% )
allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize
our calling of the clear_page variants.
text data bss dec hex filename
9051979 23067670 27009024 59128673 3863b61 vmlinux
9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page
Reported-by: kernel test robot <fengguang.wu@intel.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-02-09 00:34:49 +00:00
|
|
|
/* Export clear_page_rep through the GPL-only symbol export macro. */
EXPORT_SYMBOL_GPL(clear_page_rep)
|
2006-09-26 08:52:32 +00:00
|
|
|
|
2019-10-11 11:51:04 +00:00
|
|
|
/*
 * clear_page_orig - zero one page (4096 bytes) with plain quadword stores.
 *
 * Input:
 *   %rdi - page
 *
 * Every pass through the loop writes 64 bytes as eight movq stores and then
 * advances %rdi by 64; 4096/64 passes cover the whole page.
 * %rax, %rcx, %rdi and the flags are modified.
 */

/* Store %rax into quadword number x of the current 64-byte chunk. */
#define PUT(x) movq %rax,x*8(%rdi)

SYM_FUNC_START(clear_page_orig)
	movl	$4096/64,%ecx		/* number of 64-byte chunks in the page */
	xorl	%eax,%eax		/* the value written by every store */
	.p2align 4
.Lclear_chunk:
	/*
	 * The counter is decremented up front. None of the movq/leaq
	 * instructions below touch the flags, so ZF produced here is what
	 * the jnz at the bottom of the loop tests.
	 */
	decl	%ecx
	movq	%rax,(%rdi)
	PUT(1)
	PUT(2)
	PUT(3)
	PUT(4)
	PUT(5)
	PUT(6)
	PUT(7)
	leaq	64(%rdi),%rdi		/* next chunk; lea keeps the flags intact */
	jnz	.Lclear_chunk
	/* NOTE(review): the purpose of this nop is not evident from this file; kept as is. */
	nop
	RET
SYM_FUNC_END(clear_page_orig)
|
x86/asm: Optimize clear_page()
Currently, we CALL clear_page() which then JMPs to the proper function
chosen by the alternatives.
What we should do instead is CALL the proper function directly. (This
was something Ingo suggested a while ago). So let's do that.
Measuring our favourite kernel build workload shows that there are no
significant changes in performance.
AMD
===
-- /tmp/before 2017-02-09 18:01:46.451961188 +0100
++ /tmp/after 2017-02-09 18:01:54.883961175 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% )
+ 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% )
- 518,744 context-switches # 0.504 K/sec ( +- 1.04% )
+ 518,254 context-switches # 0.507 K/sec ( +- 1.01% )
- 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% )
+ 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% )
- 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% )
+ 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% )
- 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%)
+ 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%)
- 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%)
+ 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%)
- 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%)
+ 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%)
- 1,738,364,431,659 instructions # 0.85 insn per cycle
+ 1,739,895,771,901 instructions # 0.85 insn per cycle
- # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%)
+ # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%)
- 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%)
+ 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%)
- 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%)
+ 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%)
- 171.480741921 seconds time elapsed ( +- 1.41% )
+ 170.509229451 seconds time elapsed ( +- 1.20% )
Intel
=====
-- /tmp/before 2017-02-09 20:36:19.851947473 +0100
++ /tmp/after 2017-02-09 20:36:30.151947458 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% )
+ 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% )
- 899,342 context-switches # 0.407 K/sec ( +- 0.68% )
+ 898,381 context-switches # 0.406 K/sec ( +- 0.79% )
- 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% )
+ 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% )
- 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% )
+ 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% )
- 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%)
+ 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%)
- 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%)
+ 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%)
- 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%)
+ 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%)
- 4,148,398,171,079 instructions # 0.62 insn per cycle
+ 4,148,997,156,059 instructions # 0.62 insn per cycle
- # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%)
+ # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%)
- 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%)
+ 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%)
- 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%)
+ 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%)
- 275.904405540 seconds time elapsed ( +- 0.69% )
+ 276.660352016 seconds time elapsed ( +- 0.73% )
allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize
our calling of the clear_page variants.
text data bss dec hex filename
9051979 23067670 27009024 59128673 3863b61 vmlinux
9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page
Reported-by: kernel test robot <fengguang.wu@intel.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-02-09 00:34:49 +00:00
|
|
|
/* Export clear_page_orig through the GPL-only symbol export macro. */
EXPORT_SYMBOL_GPL(clear_page_orig)
|
2006-02-03 20:51:02 +00:00
|
|
|
|
2019-10-11 11:51:04 +00:00
|
|
|
/*
 * clear_page_erms - zero one page (4096 bytes) with a single REP STOSB.
 *
 * Input:
 *   %rdi - page
 *
 * Byte-granular counterpart of the REP STOSQ variant: the count is the
 * full 4096 bytes rather than 4096/8 quadwords.
 * %rax, %rcx, %rdi and the flags are modified.
 */
SYM_FUNC_START(clear_page_erms)
	xorl	%eax,%eax		/* %al = 0 is the byte being stored */
	movl	$4096,%ecx		/* byte count for the whole page */
	rep stosb
	RET
SYM_FUNC_END(clear_page_erms)
|
x86/asm: Optimize clear_page()
Currently, we CALL clear_page() which then JMPs to the proper function
chosen by the alternatives.
What we should do instead is CALL the proper function directly. (This
was something Ingo suggested a while ago). So let's do that.
Measuring our favourite kernel build workload shows that there are no
significant changes in performance.
AMD
===
-- /tmp/before 2017-02-09 18:01:46.451961188 +0100
++ /tmp/after 2017-02-09 18:01:54.883961175 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% )
+ 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% )
- 518,744 context-switches # 0.504 K/sec ( +- 1.04% )
+ 518,254 context-switches # 0.507 K/sec ( +- 1.01% )
- 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% )
+ 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% )
- 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% )
+ 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% )
- 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%)
+ 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%)
- 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%)
+ 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%)
- 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%)
+ 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%)
- 1,738,364,431,659 instructions # 0.85 insn per cycle
+ 1,739,895,771,901 instructions # 0.85 insn per cycle
- # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%)
+ # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%)
- 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%)
+ 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%)
- 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%)
+ 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%)
- 171.480741921 seconds time elapsed ( +- 1.41% )
+ 170.509229451 seconds time elapsed ( +- 1.20% )
Intel
=====
-- /tmp/before 2017-02-09 20:36:19.851947473 +0100
++ /tmp/after 2017-02-09 20:36:30.151947458 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):
- 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% )
+ 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% )
- 899,342 context-switches # 0.407 K/sec ( +- 0.68% )
+ 898,381 context-switches # 0.406 K/sec ( +- 0.79% )
- 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% )
+ 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% )
- 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% )
+ 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% )
- 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%)
+ 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%)
- 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%)
+ 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%)
- 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%)
+ 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%)
- 4,148,398,171,079 instructions # 0.62 insn per cycle
+ 4,148,997,156,059 instructions # 0.62 insn per cycle
- # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%)
+ # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%)
- 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%)
+ 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%)
- 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%)
+ 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%)
- 275.904405540 seconds time elapsed ( +- 0.69% )
+ 276.660352016 seconds time elapsed ( +- 0.73% )
allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize
our calling of the clear_page variants.
text data bss dec hex filename
9051979 23067670 27009024 59128673 3863b61 vmlinux
9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page
Reported-by: kernel test robot <fengguang.wu@intel.com>
Tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-02-09 00:34:49 +00:00
|
|
|
/* Export clear_page_erms through the GPL-only symbol export macro. */
EXPORT_SYMBOL_GPL(clear_page_erms)
|
x86/clear_user: Make it faster
Based on a patch by Mark Hemment <markhemm@googlemail.com> and
incorporating very sane suggestions from Linus.
The point here is to have the default case with FSRM - which is supposed
to be the majority of x86 hw out there - if not now then soon - be
directly inlined into the instruction stream so that no function call
overhead is taking place.
Drop the early clobbers from the @size and @addr operands as those are
not needed anymore since we have single instruction alternatives.
The benchmarks I ran would show very small improvements and a PF
benchmark would even show weird things like slowdowns with higher core
counts.
So for a ~6m running the git test suite, the function gets called under
700K times, all from padzero():
<...>-2536 [006] ..... 261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900
<...>-2536 [006] ..... 261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160
<...>-2537 [008] ..... 261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850
<...>-2537 [008] ..... 261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900
...
which is around 1%-ish of the total time and which is consistent with
the benchmark numbers.
So Mel gave me the idea to simply measure how fast the function becomes.
I.e.:
start = rdtsc_ordered();
ret = __clear_user(to, n);
end = rdtsc_ordered();
Computing the mean average of all the samples collected during the test
suite run then shows some improvement:
clear_user_original:
Amean: 9219.71 (Sum: 6340154910, samples: 687674)
fsrm:
Amean: 8030.63 (Sum: 5522277720, samples: 687652)
That's on Zen3.
The situation looks a lot more confusing on Intel:
Icelake:
clear_user_original:
Amean: 19679.4 (Sum: 13652560764, samples: 693750)
Amean: 19743.7 (Sum: 13693470604, samples: 693562)
(I ran it twice just to be sure.)
ERMS:
Amean: 20374.3 (Sum: 13910601024, samples: 682752)
Amean: 20453.7 (Sum: 14186223606, samples: 693576)
FSRM:
Amean: 20458.2 (Sum: 13918381386, samples: 680331)
The original microbenchmark which people were complaining about:
for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=65536; done 2>&1 | grep copied
32207011840 bytes (32 GB, 30 GiB) copied, 1 s, 32.2 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.93069 s, 35.6 GB/s
37597741056 bytes (38 GB, 35 GiB) copied, 1 s, 37.6 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.78017 s, 38.6 GB/s
62020124672 bytes (62 GB, 58 GiB) copied, 2 s, 31.0 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 2.13716 s, 32.2 GB/s
60010004480 bytes (60 GB, 56 GiB) copied, 1 s, 60.0 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.14129 s, 60.2 GB/s
53212086272 bytes (53 GB, 50 GiB) copied, 1 s, 53.2 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.28398 s, 53.5 GB/s
55698259968 bytes (56 GB, 52 GiB) copied, 1 s, 55.7 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.22507 s, 56.1 GB/s
55306092544 bytes (55 GB, 52 GiB) copied, 1 s, 55.3 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.23647 s, 55.6 GB/s
54387539968 bytes (54 GB, 51 GiB) copied, 1 s, 54.4 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.25693 s, 54.7 GB/s
50566529024 bytes (51 GB, 47 GiB) copied, 1 s, 50.6 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.35096 s, 50.9 GB/s
58308165632 bytes (58 GB, 54 GiB) copied, 1 s, 58.3 GB/s
68719476736 bytes (69 GB, 64 GiB) copied, 1.17394 s, 58.5 GB/s
Now the same thing with smaller buffers:
for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=8192; done 2>&1 | grep copied
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28485 s, 30.2 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276112 s, 31.1 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.29136 s, 29.5 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.283803 s, 30.3 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.306503 s, 28.0 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.349169 s, 24.6 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276912 s, 31.0 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.265356 s, 32.4 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28464 s, 30.2 GB/s
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.242998 s, 35.3 GB/s
is also not conclusive because it all depends on the buffer sizes,
their alignments and when the microcode detects that cachelines can be
aggregated properly and copied in bigger sizes.
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@mail.gmail.com
2022-05-24 09:01:18 +00:00
|
|
|
|
|
|
|
/*
 * clear_user_original - default routine for zeroing user-space memory.
 *
 * Input:
 *   rdi  destination
 *   rcx  count
 *
 * Output:
 *   rcx  uncleared bytes, or 0 if successful
 *   rax  0 on every return path
 *
 * The buffer is written as count/8 quadwords followed by the remaining
 * count%8 single bytes. Both store instructions have exception-table
 * entries whose fixup stubs turn the loop state back into a byte count.
 */
SYM_FUNC_START(clear_user_original)
	/*
	 * The tail only needs the low three bits of the count, so the low 32
	 * bits are all that has to be saved. A 32-bit move needs no 'q'
	 * suffix and therefore no REX prefix.
	 */
	movl	%ecx,%eax
	shrq	$3,%rcx			/* rcx = number of whole quadwords */
	jz	.Lrest_bytes

	/* quadwords first */
	.p2align 4
.Lqwords:
	movq	$0,(%rdi)
	leaq	8(%rdi),%rdi
	decq	%rcx
	jnz	.Lqwords

.Lrest_bytes:
	andl	$7,%eax			/* eax = bytes left after the quadwords */
	jz	.Lexit

	/* then the remaining bytes, one at a time */
.Lbytes:
	movb	$0,(%rdi)
	incq	%rdi
	decl	%eax
	jnz	.Lbytes

.Lexit:
	/*
	 * %rax has to be zero on the way out, including after an exception:
	 * this function is called from inline asm and the compiler expects
	 * %rax to be zero when the inline asm finishes, in case it reuses
	 * the register afterwards.
	 */
	xorl	%eax,%eax
	RET

.Lbytes_exception:
	/*
	 * Fault in the byte loop: %eax counts the bytes not yet written
	 * (the faulting one included, since it is decremented after the
	 * store). Hand that back in %rcx.
	 */
	movl	%eax,%ecx
	jmp	.Lexit

.Lqwords_exception:
	/*
	 * Fault in the quadword loop: %rcx counts the quadwords not yet
	 * written. Convert them back into bytes and add the tail bytes that
	 * were never attempted.
	 */
	andl	$7,%eax
	shlq	$3,%rcx
	addq	%rax,%rcx
	jmp	.Lexit

	_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
	_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
SYM_FUNC_END(clear_user_original)
EXPORT_SYMBOL(clear_user_original)
|
|
|
|
|
|
|
|
/*
 * clear_user_rep_good - zero user-space memory with REP STOSQ / REP STOSB.
 *
 * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
 * present.
 *
 * Input:
 *   rdi  destination
 *   rcx  count
 *   NOTE(review): REP STOS stores %rax/%al and nothing in this routine
 *   loads that register before the stores, so the caller presumably has
 *   to enter with %rax == 0 - confirm against the call site.
 *
 * Output:
 *   rcx  uncleared bytes, or 0 if successful
 *   rax  0 on every return path taken inside this routine
 */
SYM_FUNC_START(clear_user_rep_good)
	/* less than a cacheline (64 bytes): tail-jump to the original thing */
	cmp $64, %rcx
	jb clear_user_original

.Lprep:
	/* copy lower 32-bits for rest bytes */
	mov %ecx, %edx
	shr $3, %rcx
	jz .Lrep_good_rest_bytes

.Lrep_good_qwords:
	rep stosq

.Lrep_good_rest_bytes:
	and $7, %edx
	jz .Lrep_good_exit

	mov %edx, %ecx
	/*
	 * The label used by the exception table must sit on the REP STOSB
	 * itself: extable entries are matched against the address of the
	 * faulting instruction, and the register move above cannot fault.
	 * With the label on the move, a fault while writing the trailing
	 * bytes would find no fixup.
	 */
.Lrep_good_bytes:
	rep stosb

.Lrep_good_exit:
	/*
	 * %rax must be zero on return, after an exception as well; see the
	 * comment at .Lexit in clear_user_original.
	 */
	xor %eax, %eax
	RET

.Lrep_good_qwords_exception:
	/*
	 * REP STOSQ faulted with %rcx quadwords still to go: convert the
	 * remaining qwords back into bytes and add the trailing bytes that
	 * were never attempted.
	 */
	shl $3, %rcx
	and $7, %edx
	add %rdx, %rcx
	jmp .Lrep_good_exit

	_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
	/*
	 * A faulting REP STOSB leaves the number of unwritten bytes in %rcx,
	 * which is already the return value, so go straight to the exit.
	 */
	_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
SYM_FUNC_END(clear_user_rep_good)
EXPORT_SYMBOL(clear_user_rep_good)
|
|
|
|
|
|
|
|
/*
 * clear_user_erms - zero user-space memory with a single REP STOSB.
 *
 * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
 *
 * Input:
 *   rdi  destination
 *   rcx  count
 *   NOTE(review): REP STOSB stores %al and nothing in this routine loads
 *   it before the store, so the caller presumably has to enter with
 *   %rax == 0 - confirm against the call site.
 *
 * Output:
 *   rcx  uncleared bytes, or 0 if successful
 *   rax  0 on every return path taken inside this routine
 */
SYM_FUNC_START(clear_user_erms)
	/* anything shorter than a cacheline (64 bytes) goes to the original thing */
	cmpq	$64,%rcx
	jb	clear_user_original

.Lerms_bytes:
	rep stosb

.Lerms_exit:
	/*
	 * Reached on normal completion (%rcx == 0) and, through the
	 * exception table, after a fault, where %rcx still holds the number
	 * of bytes that were not written.
	 */
	xorl	%eax,%eax
	RET

	_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
SYM_FUNC_END(clear_user_erms)
EXPORT_SYMBOL(clear_user_erms)
|