crypto: arm/aes-neonbs-ctr - deal with non-multiples of AES block size

Instead of falling back to C code to deal with the final bit of input
that is not a round multiple of the block size, handle this in the asm
code, permitting us to use overlapping loads and stores for performance,
and implement the 16-byte wide XOR using a single NEON instruction.

Since NEON loads and stores have a natural width of 16 bytes, we need to
handle inputs of less than 16 bytes in a special way, but this rarely
occurs in practice so it does not impact performance. All other input
sizes can be consumed directly by the NEON asm code, although it should
be noted that the core AES transform can still only process 128 bytes (8
AES blocks) at a time.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2022-01-27 12:35:43 +01:00 committed by Herbert Xu
parent 8daa399ede
commit c8bf850e99
2 changed files with 82 additions and 68 deletions

View File

@ -758,29 +758,24 @@ ENTRY(aesbs_cbc_decrypt)
ENDPROC(aesbs_cbc_decrypt) ENDPROC(aesbs_cbc_decrypt)
.macro next_ctr, q .macro next_ctr, q
vmov.32 \q\()h[1], r10 vmov \q\()h, r9, r10
adds r10, r10, #1 adds r10, r10, #1
vmov.32 \q\()h[0], r9
adcs r9, r9, #0 adcs r9, r9, #0
vmov.32 \q\()l[1], r8 vmov \q\()l, r7, r8
adcs r8, r8, #0 adcs r8, r8, #0
vmov.32 \q\()l[0], r7
adc r7, r7, #0 adc r7, r7, #0
vrev32.8 \q, \q vrev32.8 \q, \q
.endm .endm
/* /*
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
* int rounds, int blocks, u8 ctr[], u8 final[]) * int rounds, int bytes, u8 ctr[])
*/ */
ENTRY(aesbs_ctr_encrypt) ENTRY(aesbs_ctr_encrypt)
mov ip, sp mov ip, sp
push {r4-r10, lr} push {r4-r10, lr}
ldm ip, {r5-r7} // load args 4-6 ldm ip, {r5, r6} // load args 4-5
teq r7, #0
addne r5, r5, #1 // one extra block if final != 0
vld1.8 {q0}, [r6] // load counter vld1.8 {q0}, [r6] // load counter
vrev32.8 q1, q0 vrev32.8 q1, q0
vmov r9, r10, d3 vmov r9, r10, d3
@ -792,20 +787,19 @@ ENTRY(aesbs_ctr_encrypt)
adc r7, r7, #0 adc r7, r7, #0
99: vmov q1, q0 99: vmov q1, q0
vmov q2, q0
vmov q3, q0
vmov q4, q0
vmov q5, q0
vmov q6, q0
vmov q7, q0
adr ip, 0f
sub lr, r5, #1 sub lr, r5, #1
and lr, lr, #7 vmov q2, q0
cmp r5, #8 adr ip, 0f
sub ip, ip, lr, lsl #5 vmov q3, q0
sub ip, ip, lr, lsl #2 and lr, lr, #112
movlt pc, ip // computed goto if blocks < 8 vmov q4, q0
cmp r5, #112
vmov q5, q0
sub ip, ip, lr, lsl #1
vmov q6, q0
add ip, ip, lr, lsr #2
vmov q7, q0
movle pc, ip // computed goto if bytes <= 112
next_ctr q1 next_ctr q1
next_ctr q2 next_ctr q2
@ -820,12 +814,14 @@ ENTRY(aesbs_ctr_encrypt)
bl aesbs_encrypt8 bl aesbs_encrypt8
adr ip, 1f adr ip, 1f
and lr, r5, #7 sub lr, r5, #1
cmp r5, #8 cmp r5, #128
movgt r4, #0 bic lr, lr, #15
ldrle r4, [sp, #40] // load final in the last round ands r4, r5, #15 // preserves C flag
sub ip, ip, lr, lsl #2 teqcs r5, r5 // set Z flag if not last iteration
movlt pc, ip // computed goto if blocks < 8 sub ip, ip, lr, lsr #2
rsb r4, r4, #16
movcc pc, ip // computed goto if bytes < 128
vld1.8 {q8}, [r1]! vld1.8 {q8}, [r1]!
vld1.8 {q9}, [r1]! vld1.8 {q9}, [r1]!
@ -834,46 +830,70 @@ ENTRY(aesbs_ctr_encrypt)
vld1.8 {q12}, [r1]! vld1.8 {q12}, [r1]!
vld1.8 {q13}, [r1]! vld1.8 {q13}, [r1]!
vld1.8 {q14}, [r1]! vld1.8 {q14}, [r1]!
teq r4, #0 // skip last block if 'final' 1: subne r1, r1, r4
1: bne 2f
vld1.8 {q15}, [r1]! vld1.8 {q15}, [r1]!
2: adr ip, 3f add ip, ip, #2f - 1b
cmp r5, #8
sub ip, ip, lr, lsl #3
movlt pc, ip // computed goto if blocks < 8
veor q0, q0, q8 veor q0, q0, q8
vst1.8 {q0}, [r0]!
veor q1, q1, q9 veor q1, q1, q9
vst1.8 {q1}, [r0]!
veor q4, q4, q10 veor q4, q4, q10
vst1.8 {q4}, [r0]!
veor q6, q6, q11 veor q6, q6, q11
vst1.8 {q6}, [r0]!
veor q3, q3, q12 veor q3, q3, q12
vst1.8 {q3}, [r0]!
veor q7, q7, q13 veor q7, q7, q13
vst1.8 {q7}, [r0]!
veor q2, q2, q14 veor q2, q2, q14
bne 3f
veor q5, q5, q15
movcc pc, ip // computed goto if bytes < 128
vst1.8 {q0}, [r0]!
vst1.8 {q1}, [r0]!
vst1.8 {q4}, [r0]!
vst1.8 {q6}, [r0]!
vst1.8 {q3}, [r0]!
vst1.8 {q7}, [r0]!
vst1.8 {q2}, [r0]! vst1.8 {q2}, [r0]!
teq r4, #0 // skip last block if 'final' 2: subne r0, r0, r4
W(bne) 5f
3: veor q5, q5, q15
vst1.8 {q5}, [r0]! vst1.8 {q5}, [r0]!
4: next_ctr q0 next_ctr q0
subs r5, r5, #8 subs r5, r5, #128
bgt 99b bgt 99b
vst1.8 {q0}, [r6] vst1.8 {q0}, [r6]
pop {r4-r10, pc} pop {r4-r10, pc}
5: vst1.8 {q5}, [r4] 3: adr lr, .Lpermute_table + 16
b 4b cmp r5, #16 // Z flag remains cleared
sub lr, lr, r4
vld1.8 {q8-q9}, [lr]
vtbl.8 d16, {q5}, d16
vtbl.8 d17, {q5}, d17
veor q5, q8, q15
bcc 4f // have to reload prev if R5 < 16
vtbx.8 d10, {q2}, d18
vtbx.8 d11, {q2}, d19
mov pc, ip // branch back to VST sequence
4: sub r0, r0, r4
vshr.s8 q9, q9, #7 // create mask for VBIF
vld1.8 {q8}, [r0] // reload
vbif q5, q8, q9
vst1.8 {q5}, [r0]
pop {r4-r10, pc}
ENDPROC(aesbs_ctr_encrypt) ENDPROC(aesbs_ctr_encrypt)
.align 6
.Lpermute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.macro next_tweak, out, in, const, tmp .macro next_tweak, out, in, const, tmp
vshr.s64 \tmp, \in, #63 vshr.s64 \tmp, \in, #63
vand \tmp, \tmp, \const vand \tmp, \tmp, \const
@ -888,6 +908,7 @@ ENDPROC(aesbs_ctr_encrypt)
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[], int reorder_last_tweak) * int blocks, u8 iv[], int reorder_last_tweak)
*/ */
.align 6
__xts_prepare8: __xts_prepare8:
vld1.8 {q14}, [r7] // load iv vld1.8 {q14}, [r7] // load iv
vmov.i32 d30, #0x87 // compose tweak mask vector vmov.i32 d30, #0x87 // compose tweak mask vector

View File

@ -37,7 +37,7 @@ asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]); int rounds, int blocks, u8 iv[]);
asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 ctr[], u8 final[]); int rounds, int blocks, u8 ctr[]);
asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[], int); int rounds, int blocks, u8 iv[], int);
@ -243,32 +243,25 @@ static int ctr_encrypt(struct skcipher_request *req)
err = skcipher_walk_virt(&walk, req, false); err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) { while (walk.nbytes > 0) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE; const u8 *src = walk.src.virt.addr;
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL; u8 *dst = walk.dst.virt.addr;
int bytes = walk.nbytes;
if (walk.nbytes < walk.total) { if (unlikely(bytes < AES_BLOCK_SIZE))
blocks = round_down(blocks, src = dst = memcpy(buf + sizeof(buf) - bytes,
walk.stride / AES_BLOCK_SIZE); src, bytes);
final = NULL; else if (walk.nbytes < walk.total)
} bytes &= ~(8 * AES_BLOCK_SIZE - 1);
kernel_neon_begin(); kernel_neon_begin();
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, bytes, walk.iv);
ctx->rk, ctx->rounds, blocks, walk.iv, final);
kernel_neon_end(); kernel_neon_end();
if (final) { if (unlikely(bytes < AES_BLOCK_SIZE))
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; memcpy(walk.dst.virt.addr,
u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; buf + sizeof(buf) - bytes, bytes);
crypto_xor_cpy(dst, src, final, err = skcipher_walk_done(&walk, walk.nbytes - bytes);
walk.total % AES_BLOCK_SIZE);
err = skcipher_walk_done(&walk, 0);
break;
}
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
} }
return err; return err;