arm64: lib: accelerate crc32_be

It makes no sense to leave crc32_be using the generic code while we
only accelerate the little-endian ops.

Even though the big-endian form doesn't fit as smoothly into the arm64,
we can speed it up and avoid hitting the D cache.

Tested on Cortex-A53. Without acceleration:

    crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
    crc32: self tests passed, processed 225944 bytes in 192240 nsec
    crc32c: CRC_LE_BITS = 64
    crc32c: self tests passed, processed 112972 bytes in 21360 nsec

With acceleration:

    crc32: CRC_LE_BITS = 64, CRC_BE BITS = 64
    crc32: self tests passed, processed 225944 bytes in 53480 nsec
    crc32c: CRC_LE_BITS = 64
    crc32c: self tests passed, processed 112972 bytes in 21480 nsec

Signed-off-by: Kevin Bracey <kevin@bracey.fi>
Tested-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Kevin Bracey 2022-01-18 12:23:51 +02:00 committed by Herbert Xu
parent 1b3dce8b8a
commit 5f2f5eaa3e
1 changed files with 73 additions and 14 deletions

View File

@ -11,7 +11,44 @@
.arch armv8-a+crc
.macro __crc32, c
.macro byteorder, reg, be
.if \be
CPU_LE( rev \reg, \reg )
.else
CPU_BE( rev \reg, \reg )
.endif
.endm
.macro byteorder16, reg, be
.if \be
CPU_LE( rev16 \reg, \reg )
.else
CPU_BE( rev16 \reg, \reg )
.endif
.endm
.macro bitorder, reg, be
.if \be
rbit \reg, \reg
.endif
.endm
.macro bitorder16, reg, be
.if \be
rbit \reg, \reg
lsr \reg, \reg, #16
.endif
.endm
.macro bitorder8, reg, be
.if \be
rbit \reg, \reg
lsr \reg, \reg, #24
.endif
.endm
.macro __crc32, c, be=0
bitorder w0, \be
cmp x2, #16
b.lt 8f // less than 16 bytes
@ -24,10 +61,14 @@
add x8, x8, x1
add x1, x1, x7
ldp x5, x6, [x8]
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
CPU_BE( rev x5, x5 )
CPU_BE( rev x6, x6 )
byteorder x3, \be
byteorder x4, \be
byteorder x5, \be
byteorder x6, \be
bitorder x3, \be
bitorder x4, \be
bitorder x5, \be
bitorder x6, \be
tst x7, #8
crc32\c\()x w8, w0, x3
@ -55,33 +96,43 @@ CPU_BE( rev x6, x6 )
32: ldp x3, x4, [x1], #32
sub x2, x2, #32
ldp x5, x6, [x1, #-16]
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
CPU_BE( rev x5, x5 )
CPU_BE( rev x6, x6 )
byteorder x3, \be
byteorder x4, \be
byteorder x5, \be
byteorder x6, \be
bitorder x3, \be
bitorder x4, \be
bitorder x5, \be
bitorder x6, \be
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
crc32\c\()x w0, w0, x5
crc32\c\()x w0, w0, x6
cbnz x2, 32b
0: ret
0: bitorder w0, \be
ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
CPU_BE( rev x3, x3 )
byteorder x3, \be
bitorder x3, \be
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
CPU_BE( rev w3, w3 )
byteorder w3, \be
bitorder w3, \be
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
CPU_BE( rev16 w3, w3 )
byteorder16 w3, \be
bitorder16 w3, \be
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
bitorder8 w3, \be
crc32\c\()b w0, w0, w3
0: ret
0: bitorder w0, \be
ret
.endm
.align 5
@ -99,3 +150,11 @@ alternative_if_not ARM64_HAS_CRC32
alternative_else_nop_endif
__crc32 c
SYM_FUNC_END(__crc32c_le)
.align 5
SYM_FUNC_START(crc32_be)
alternative_if_not ARM64_HAS_CRC32
b crc32_be_base
alternative_else_nop_endif
__crc32 be=1
SYM_FUNC_END(crc32_be)