crypto: aesni - make non-AVX AES-GCM work with any aadlen
This is the first step to make the aesni AES-GCM implementation
generic. The current code was written for rfc4106, so it handles
only some specific sizes of associated data.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 0487ccac20
parent f4857f4c2e
1 changed file with 132 additions and 37 deletions
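
In GCM, the associated data is folded into the GHASH state one
16-byte block at a time, with the final partial block zero-padded.
A minimal C sketch of the flow this patch implements, assuming a
hypothetical ghash_mul() helper in place of the PCLMULQDQ-based
GHASH_MUL macro (byte-reflection and the carry-less multiply are
omitted for clarity):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    struct be128 { uint64_t hi, lo; };

    /* hash = hash * H in GF(2^128); hypothetical stand-in for the
     * GHASH_MUL assembler macro */
    extern void ghash_mul(struct be128 *hash, const struct be128 *h);

    /* fold 16 bytes into the running hash state */
    static void xor_block(struct be128 *hash, const uint8_t *src)
    {
            uint8_t *dst = (uint8_t *)hash;
            int i;

            for (i = 0; i < 16; i++)
                    dst[i] ^= src[i];
    }

    static void ghash_aad(struct be128 *hash, const struct be128 *h,
                          const uint8_t *aad, size_t aadlen)
    {
            uint8_t blk[16];

            /* full blocks: any count, not just rfc4106's sizes */
            while (aadlen >= 16) {
                    xor_block(hash, aad);
                    ghash_mul(hash, h);
                    aad += 16;
                    aadlen -= 16;
            }

            /* partial tail: zero-pad to a full block */
            if (aadlen) {
                    memset(blk, 0, sizeof(blk));
                    memcpy(blk, aad, aadlen);
                    xor_block(hash, blk);
                    ghash_mul(hash, h);
            }
    }

The previous loop assumed rfc4106's fixed 8- or 12-byte AAD; the
rewrite below walks whole 16-byte blocks first and then handles any
1-15 byte remainder.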
arch/x86/crypto/aesni-intel_asm.S
@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
             .octa 0x00000000000000000000000000000000
 
+.section .rodata
+.align 16
+.type aad_shift_arr, @object
+.size aad_shift_arr, 272
+aad_shift_arr:
+        .octa     0xffffffffffffffffffffffffffffffff
+        .octa     0xffffffffffffffffffffffffffffff0C
+        .octa     0xffffffffffffffffffffffffffff0D0C
+        .octa     0xffffffffffffffffffffffffff0E0D0C
+        .octa     0xffffffffffffffffffffffff0F0E0D0C
+        .octa     0xffffffffffffffffffffff0C0B0A0908
+        .octa     0xffffffffffffffffffff0D0C0B0A0908
+        .octa     0xffffffffffffffffff0E0D0C0B0A0908
+        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
+        .octa     0xffffffffffffff0C0B0A090807060504
+        .octa     0xffffffffffff0D0C0B0A090807060504
+        .octa     0xffffffffff0E0D0C0B0A090807060504
+        .octa     0xffffffff0F0E0D0C0B0A090807060504
+        .octa     0xffffff0C0B0A09080706050403020100
+        .octa     0xffff0D0C0B0A09080706050403020100
+        .octa     0xff0E0D0C0B0A09080706050403020100
+        .octa     0x0F0E0D0C0B0A09080706050403020100
+
 .text
 
 
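pslldq and psrldq only accept immediate shift counts, so the run-time
left-alignment of the partial AAD block is done with PSHUFB and the
mask table above: the remaining byte count is scaled by 16 (the
salq $4, %r11 in the next hunk) to index one of the 17 masks. A C
model of the PSHUFB semantics the masks rely on, for reference:

    #include <stdint.h>

    /* out[i] takes the source byte selected by the low nibble of
     * mask[i]; a mask byte with its top bit set produces zero. A mask
     * from aad_shift_arr thus compacts the valid AAD bytes to the low
     * end of the register and zeroes the over-read padding. */
    static void pshufb_bytes(uint8_t out[16], const uint8_t in[16],
                             const uint8_t mask[16])
    {
            int i;

            for (i = 0; i < 16; i++)
                    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
    }
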
@@ -252,32 +275,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         mov        arg8, %r12           # %r12 = aadLen
         mov        %r12, %r11
         pxor       %xmm\i, %xmm\i
+        pxor       \XMM2, \XMM2
 
-_get_AAD_loop\num_initial_blocks\operation:
-        movd       (%r10), \TMP1
+        cmp        $16, %r11
+        jl         _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+        movdqu     (%r10), %xmm\i
+        PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       %xmm\i, \XMM2
+        GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        add        $16, %r10
+        sub        $16, %r12
+        sub        $16, %r11
+        cmp        $16, %r11
+        jge        _get_AAD_blocks\num_initial_blocks\operation
+
+        movdqu     \XMM2, %xmm\i
+        cmp        $0, %r11
+        je         _get_AAD_done\num_initial_blocks\operation
+
+        pxor       %xmm\i,%xmm\i
+
+        /* read the last <16B of AAD. since we have at least 4B of
+        data right after the AAD (the ICV, and maybe some CT), we can
+        read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+        cmp        $4, %r11
+        jle        _get_AAD_rest4\num_initial_blocks\operation
+        movq       (%r10), \TMP1
+        add        $8, %r10
+        sub        $8, %r11
+        pslldq     $8, \TMP1
+        psrldq     $8, %xmm\i
+        pxor       \TMP1, %xmm\i
+        jmp        _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+        cmp        $0, %r11
+        jle        _get_AAD_rest0\num_initial_blocks\operation
+        mov        (%r10), %eax
+        movq       %rax, \TMP1
+        add        $4, %r10
+        sub        $4, %r10
         pslldq     $12, \TMP1
         psrldq     $4, %xmm\i
         pxor       \TMP1, %xmm\i
-        add        $4, %r10
-        sub        $4, %r12
-        jne        _get_AAD_loop\num_initial_blocks\operation
-
-        cmp        $16, %r11
-        je         _get_AAD_loop2_done\num_initial_blocks\operation
-
-        mov        $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
-        psrldq     $4, %xmm\i
-        sub        $4, %r12
-        cmp        %r11, %r12
-        jne        _get_AAD_loop2\num_initial_blocks\operation
-
-_get_AAD_loop2_done\num_initial_blocks\operation:
+_get_AAD_rest0\num_initial_blocks\operation:
+        /* finalize: shift out the extra bytes we read, and align
+        left. since pslldq can only shift by an immediate, we use
+        vpshufb and an array of shuffle masks */
+        movq       %r12, %r11
+        salq       $4, %r11
+        movdqu     aad_shift_arr(%r11), \TMP1
+        PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       \XMM2, %xmm\i
+        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
         xor        %r11, %r11 # initialise the data pointer offset as zero
-
         # start AES for num_initial_blocks blocks
 
         mov        %arg5, %rax                      # %rax = *Y0
         movdqu     (%rax), \XMM0                    # XMM0 = Y0
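
The tail handling above relies on a layout guarantee rather than
byte-exact loads: at least 4 bytes (the ICV, and possibly some
ciphertext or plaintext) always follow the AAD in memory, so 8- and
4-byte reads may safely run up to 3 bytes past it. A C sketch of
that bounds argument only, with a hypothetical scratch buffer (the
assembly instead accumulates the chunks in an XMM register with
shifts):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Gather the last rem (1..15) AAD bytes into block[]. Reads may
     * run up to 3 bytes past the AAD; safe because >= 4 bytes follow
     * it, and the shuffle mask discards the excess afterwards. */
    static void read_aad_tail(uint8_t block[16], const uint8_t *p,
                              size_t rem)
    {
            size_t off = 0;

            while (rem - off > 4) {         /* 8-byte chunks */
                    memcpy(block + off, p + off, 8);
                    off += 8;
            }
            if (rem > off)                  /* final 1..4 bytes */
                    memcpy(block + off, p + off, 4);
    }
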
@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
         # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5
@@ -477,28 +534,66 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
         mov        arg8, %r12           # %r12 = aadLen
         mov        %r12, %r11
         pxor       %xmm\i, %xmm\i
-_get_AAD_loop\num_initial_blocks\operation:
-        movd       (%r10), \TMP1
+        pxor       \XMM2, \XMM2
+
+        cmp        $16, %r11
+        jl         _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_blocks\num_initial_blocks\operation:
+        movdqu     (%r10), %xmm\i
+        PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       %xmm\i, \XMM2
+        GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+        add        $16, %r10
+        sub        $16, %r12
+        sub        $16, %r11
+        cmp        $16, %r11
+        jge        _get_AAD_blocks\num_initial_blocks\operation
+
+        movdqu     \XMM2, %xmm\i
+        cmp        $0, %r11
+        je         _get_AAD_done\num_initial_blocks\operation
+
+        pxor       %xmm\i,%xmm\i
+
+        /* read the last <16B of AAD. since we have at least 4B of
+        data right after the AAD (the ICV, and maybe some PT), we can
+        read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\num_initial_blocks\operation:
+        cmp        $4, %r11
+        jle        _get_AAD_rest4\num_initial_blocks\operation
+        movq       (%r10), \TMP1
+        add        $8, %r10
+        sub        $8, %r11
+        pslldq     $8, \TMP1
+        psrldq     $8, %xmm\i
+        pxor       \TMP1, %xmm\i
+        jmp        _get_AAD_rest8\num_initial_blocks\operation
+_get_AAD_rest4\num_initial_blocks\operation:
+        cmp        $0, %r11
+        jle        _get_AAD_rest0\num_initial_blocks\operation
+        mov        (%r10), %eax
+        movq       %rax, \TMP1
+        add        $4, %r10
+        sub        $4, %r10
         pslldq     $12, \TMP1
         psrldq     $4, %xmm\i
         pxor       \TMP1, %xmm\i
-        add        $4, %r10
-        sub        $4, %r12
-        jne        _get_AAD_loop\num_initial_blocks\operation
-        cmp        $16, %r11
-        je         _get_AAD_loop2_done\num_initial_blocks\operation
-        mov        $16, %r12
-_get_AAD_loop2\num_initial_blocks\operation:
-        psrldq     $4, %xmm\i
-        sub        $4, %r12
-        cmp        %r11, %r12
-        jne        _get_AAD_loop2\num_initial_blocks\operation
-_get_AAD_loop2_done\num_initial_blocks\operation:
+_get_AAD_rest0\num_initial_blocks\operation:
+        /* finalize: shift out the extra bytes we read, and align
+        left. since pslldq can only shift by an immediate, we use
+        vpshufb and an array of shuffle masks */
+        movq       %r12, %r11
+        salq       $4, %r11
+        movdqu     aad_shift_arr(%r11), \TMP1
+        PSHUFB_XMM \TMP1, %xmm\i
+_get_AAD_rest_final\num_initial_blocks\operation:
         PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+        pxor       \XMM2, %xmm\i
+        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 
+_get_AAD_done\num_initial_blocks\operation:
         xor        %r11, %r11 # initialise the data pointer offset as zero
-
         # start AES for num_initial_blocks blocks
 
         mov        %arg5, %rax                      # %rax = *Y0
         movdqu     (%rax), \XMM0                    # XMM0 = Y0
@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
         # prepare plaintext/ciphertext for GHASH computation
 .endr
 .endif
-        GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+
         # apply GHASH on num_initial_blocks blocks
 
 .if \i == 5