x86-64: Handle byte-wise tail copying in memcpy() without a loop

While the effect is hard to measure, reducing the number of possibly/likely
mis-predicted branches can generally be expected to be a slight
improvement.

Contrary to what might appear at first glance, this also doesn't grow
the function size (the alignment gap to the next function just
gets smaller).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Author: Jan Beulich, 2012-01-26 15:55:32 +0000 (committed by Ingo Molnar)
commit 9d8e22777e
parent 2ab560911a

@@ -164,18 +164,19 @@ ENTRY(memcpy)
 	retq
 	.p2align 4
 .Lless_3bytes:
-	cmpl $0, %edx
-	je .Lend
+	subl $1, %edx
+	jb .Lend
 	/*
 	 * Move data from 1 bytes to 3 bytes.
 	 */
-.Lloop_1:
-	movb (%rsi), %r8b
-	movb %r8b, (%rdi)
-	incq %rdi
-	incq %rsi
-	decl %edx
-	jnz .Lloop_1
+	movzbl (%rsi), %ecx
+	jz .Lstore_1byte
+	movzbq 1(%rsi), %r8
+	movzbq (%rsi, %rdx), %r9
+	movb %r8b, 1(%rdi)
+	movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+	movb %cl, (%rdi)
 
 .Lend:
 	retq
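
For reference, the new tail handling can be sketched in C roughly as follows.
This is only an illustration of the technique, not kernel code; the function
name copy_tail_1_to_3 is made up, and the comments map each step onto the
corresponding instruction of the new assembly. The idea is that for a length
of 1 to 3 bytes the first byte and, when present, the second and last bytes
are loaded up front and stored back without a loop; for a length of 2 the
"second" and "last" byte are the same, so the stores simply overlap.

#include <stddef.h>

/* Hypothetical C sketch of the loop-free 1..3 byte tail copy. */
static void copy_tail_1_to_3(unsigned char *dst, const unsigned char *src,
			     size_t n)
{
	unsigned char first, second, last;

	if (n == 0)			/* subl $1, %edx; jb .Lend */
		return;

	first = src[0];			/* movzbl (%rsi), %ecx */
	if (n > 1) {			/* jz .Lstore_1byte taken when n == 1 */
		second = src[1];	/* movzbq 1(%rsi), %r8 */
		last = src[n - 1];	/* movzbq (%rsi,%rdx), %r9 -- same byte as 'second' when n == 2 */
		dst[1] = second;	/* movb %r8b, 1(%rdi) */
		dst[n - 1] = last;	/* movb %r9b, (%rdi,%rdx) */
	}
	dst[0] = first;			/* movb %cl, (%rdi) */
}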