crypto: vmx - Improved AES/XTS performance of 6-way unrolling for ppc

Improve AES/XTS performance of the 6-way unrolled code path for PowerPC
by up to 17%, as measured with tcrypt.  This is done by using a single
instruction, vpermxor, to replace the vxor and vsldoi pair in the tweak
computation.
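
For reference, the tweak update that both the old and the new sequence
compute is a multiplication by x in GF(2^128) with the XTS reduction
polynomial x^128 + x^7 + x^2 + x + 1.  A scalar C sketch of that
operation (illustrative only; xts_mul_x is just a name for exposition,
not code from this patch):

/* Multiply the 128-bit XTS tweak by x in GF(2^128).
 * t[0] is the least significant byte, per IEEE P1619.
 */
static void xts_mul_x(unsigned char t[16])
{
	unsigned char carry = 0;

	for (int i = 0; i < 16; i++) {
		unsigned char msb = t[i] >> 7;	/* bit shifted out of this byte */

		t[i] = (unsigned char)(t[i] << 1) | carry;
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;	/* fold the carry back in: x^7 + x^2 + x + 1 */
}

The vector code does the per-byte shift with vaddubm and extracts the
per-byte carries with vsrab/vand; previously it propagated them with a
vsldoi rotate plus a vxor, and vpermxor now in effect fuses that
rotate-and-xor into one instruction via the new permute constant.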

The same changes were applied to the OpenSSL code and a pull request was
submitted.

This patch has been tested with the kernel crypto module tcrypt.ko and
has passed the self-tests.  The patch was also tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.

Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/vmx/aesp8-ppc.pl | 141 ++++++++++++++++++++++++++++------------
 1 file changed, 92 insertions(+), 49 deletions(-)

diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
 .long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
 .long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
 .long 0,0,0,0 ?asis
+.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
 Lconsts:
 mflr r0
 bcl 20,31,\$+4
 mflr $ptr #vvvvv "distance between . and rcon
-addi $ptr,$ptr,-0x48
+addi $ptr,$ptr,-0x58
 mtlr r0
 blr
 .long 0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
 li $x70,0x70
 mtspr 256,r0
 
+xxlor 2, 32+$eighty7, 32+$eighty7
+vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+xxlor 1, 32+$eighty7, 32+$eighty7
+
+# Load XOR Lconsts.
+mr $x70, r6
+bl Lconsts
+lxvw4x 0, $x40, r6 # load XOR contents
+mr r6, $x70
+li $x70,0x70
+
 subi $rounds,$rounds,3 # -4 in total
 lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
 ?vperm v31,v31,$twk5,$keyperm
 lvx v25,$x10,$key_ # pre-load round[2]
 
+# Switch to use the following codes with 0x010101..87 to generate tweak.
+#     eighty7 = 0x010101..87
+#     vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
+#     vand tmp, tmp, eighty7 # last byte with carry
+#     vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
+#     xxlor vsx, 0, 0
+#     vpermxor tweak, tweak, tmp, vsx
+
 vperm $in0,$inout,$inptail,$inpperm
 subi $inp,$inp,31 # undo "caller"
 vxor $twk0,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vand $tmp,$tmp,$eighty7
 vxor $out0,$in0,$twk0
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in1, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in1
 
 lvx_u $in1,$x10,$inp
 vxor $twk1,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in1,$in1,$in1,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out1,$in1,$twk1
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in2, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in2
 
 lvx_u $in2,$x20,$inp
 andi. $taillen,$len,15
 vxor $twk2,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in2,$in2,$in2,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out2,$in2,$twk2
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in3, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in3
 
 lvx_u $in3,$x30,$inp
 sub $len,$len,$taillen
 vxor $twk3,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in3,$in3,$in3,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out3,$in3,$twk3
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in4, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in4
 
 lvx_u $in4,$x40,$inp
 subi $len,$len,0x60
 vxor $twk4,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in4,$in4,$in4,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out4,$in4,$twk4
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in5, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in5
 
 lvx_u $in5,$x50,$inp
 addi $inp,$inp,0x60
 vxor $twk5,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in5,$in5,$in5,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out5,$in5,$twk5
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in0, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in0
 
 vxor v31,v31,$rndkey0
 mtctr $rounds
@@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
 lvx v25,$x10,$key_ # round[4]
 bdnz Loop_xts_enc6x
 
+xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
 subic $len,$len,96 # $len-=96
 vxor $in0,$twk0,v31 # xor with last round key
 vcipher $out0,$out0,v24
@@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
 vaddubm $tweak,$tweak,$tweak
 vcipher $out2,$out2,v24
 vcipher $out3,$out3,v24
-vsldoi $tmp,$tmp,$tmp,15
 vcipher $out4,$out4,v24
 vcipher $out5,$out5,v24
@@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
 vand $tmp,$tmp,$eighty7
 vcipher $out0,$out0,v25
 vcipher $out1,$out1,v25
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in1, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in1
 vcipher $out2,$out2,v25
 vcipher $out3,$out3,v25
 vxor $in1,$twk1,v31
@@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
 and r0,r0,$len
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vcipher $out0,$out0,v26
 vcipher $out1,$out1,v26
 vand $tmp,$tmp,$eighty7
 vcipher $out2,$out2,v26
 vcipher $out3,$out3,v26
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in2, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in2
 vcipher $out4,$out4,v26
 vcipher $out5,$out5,v26
@@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
 vaddubm $tweak,$tweak,$tweak
 vcipher $out0,$out0,v27
 vcipher $out1,$out1,v27
-vsldoi $tmp,$tmp,$tmp,15
 vcipher $out2,$out2,v27
 vcipher $out3,$out3,v27
 vand $tmp,$tmp,$eighty7
@@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
 vcipher $out5,$out5,v27
 addi $key_,$sp,$FRAME+15 # rewind $key_
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in3, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in3
 vcipher $out0,$out0,v28
 vcipher $out1,$out1,v28
 vxor $in3,$twk3,v31
@@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
 vcipher $out2,$out2,v28
 vcipher $out3,$out3,v28
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vcipher $out4,$out4,v28
 vcipher $out5,$out5,v28
 lvx v24,$x00,$key_ # re-pre-load round[1]
@@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
 vcipher $out0,$out0,v29
 vcipher $out1,$out1,v29
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in4, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in4
 vcipher $out2,$out2,v29
 vcipher $out3,$out3,v29
 vxor $in4,$twk4,v31
@@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
 vcipher $out5,$out5,v29
 lvx v25,$x10,$key_ # re-pre-load round[2]
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vcipher $out0,$out0,v30
 vcipher $out1,$out1,v30
 vand $tmp,$tmp,$eighty7
 vcipher $out2,$out2,v30
 vcipher $out3,$out3,v30
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in5, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in5
 vcipher $out4,$out4,v30
 vcipher $out5,$out5,v30
 vxor $in5,$twk5,v31
@@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
 vcipherlast $out0,$out0,$in0
 lvx_u $in0,$x00,$inp # load next input block
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vcipherlast $out1,$out1,$in1
 lvx_u $in1,$x10,$inp
 vcipherlast $out2,$out2,$in2
@@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
 vcipherlast $out4,$out4,$in4
 le?vperm $in2,$in2,$in2,$leperm
 lvx_u $in4,$x40,$inp
-vxor $tweak,$tweak,$tmp
+xxlor 10, 32+$in0, 32+$in0
+xxlor 32+$in0, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in0
+xxlor 32+$in0, 10, 10
 vcipherlast $tmp,$out5,$in5 # last block might be needed
 # in stealing mode
 le?vperm $in3,$in3,$in3,$leperm
@@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
 mtctr $rounds
 beq Loop_xts_enc6x # did $len-=96 borrow?
 
+xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
 addic. $len,$len,0x60
 beq Lxts_enc6x_zero
 cmpwi $len,0x20
@@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
 li $x70,0x70
 mtspr 256,r0
 
+xxlor 2, 32+$eighty7, 32+$eighty7
+vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+xxlor 1, 32+$eighty7, 32+$eighty7
+
+# Load XOR Lconsts.
+mr $x70, r6
+bl Lconsts
+lxvw4x 0, $x40, r6 # load XOR contents
+mr r6, $x70
+li $x70,0x70
+
 subi $rounds,$rounds,3 # -4 in total
 lvx $rndkey0,$x00,$key1 # load key schedule
@@ -3194,64 +3231,64 @@ Load_xts_dec_key:
 vxor $twk0,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vand $tmp,$tmp,$eighty7
 vxor $out0,$in0,$twk0
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in1, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in1
 
 lvx_u $in1,$x10,$inp
 vxor $twk1,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in1,$in1,$in1,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out1,$in1,$twk1
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in2, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in2
 
 lvx_u $in2,$x20,$inp
 andi. $taillen,$len,15
 vxor $twk2,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in2,$in2,$in2,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out2,$in2,$twk2
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in3, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in3
 
 lvx_u $in3,$x30,$inp
 sub $len,$len,$taillen
 vxor $twk3,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in3,$in3,$in3,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out3,$in3,$twk3
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in4, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in4
 
 lvx_u $in4,$x40,$inp
 subi $len,$len,0x60
 vxor $twk4,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in4,$in4,$in4,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out4,$in4,$twk4
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in5, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in5
 
 lvx_u $in5,$x50,$inp
 addi $inp,$inp,0x60
 vxor $twk5,$tweak,$rndkey0
 vsrab $tmp,$tweak,$seven # next tweak value
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 le?vperm $in5,$in5,$in5,$leperm
 vand $tmp,$tmp,$eighty7
 vxor $out5,$in5,$twk5
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in0, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in0
 
 vxor v31,v31,$rndkey0
 mtctr $rounds
@@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
 lvx v25,$x10,$key_ # round[4]
 bdnz Loop_xts_dec6x
 
+xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
 subic $len,$len,96 # $len-=96
 vxor $in0,$twk0,v31 # xor with last round key
 vncipher $out0,$out0,v24
@@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
 vaddubm $tweak,$tweak,$tweak
 vncipher $out2,$out2,v24
 vncipher $out3,$out3,v24
-vsldoi $tmp,$tmp,$tmp,15
 vncipher $out4,$out4,v24
 vncipher $out5,$out5,v24
@@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
 vand $tmp,$tmp,$eighty7
 vncipher $out0,$out0,v25
 vncipher $out1,$out1,v25
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in1, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in1
 vncipher $out2,$out2,v25
 vncipher $out3,$out3,v25
 vxor $in1,$twk1,v31
@@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
 and r0,r0,$len
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vncipher $out0,$out0,v26
 vncipher $out1,$out1,v26
 vand $tmp,$tmp,$eighty7
 vncipher $out2,$out2,v26
 vncipher $out3,$out3,v26
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in2, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in2
 vncipher $out4,$out4,v26
 vncipher $out5,$out5,v26
@@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
 vaddubm $tweak,$tweak,$tweak
 vncipher $out0,$out0,v27
 vncipher $out1,$out1,v27
-vsldoi $tmp,$tmp,$tmp,15
 vncipher $out2,$out2,v27
 vncipher $out3,$out3,v27
 vand $tmp,$tmp,$eighty7
@@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
 vncipher $out5,$out5,v27
 addi $key_,$sp,$FRAME+15 # rewind $key_
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in3, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in3
 vncipher $out0,$out0,v28
 vncipher $out1,$out1,v28
 vxor $in3,$twk3,v31
@@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
 vncipher $out2,$out2,v28
 vncipher $out3,$out3,v28
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vncipher $out4,$out4,v28
 vncipher $out5,$out5,v28
 lvx v24,$x00,$key_ # re-pre-load round[1]
@@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
 vncipher $out0,$out0,v29
 vncipher $out1,$out1,v29
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in4, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in4
 vncipher $out2,$out2,v29
 vncipher $out3,$out3,v29
 vxor $in4,$twk4,v31
@@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
 vncipher $out5,$out5,v29
 lvx v25,$x10,$key_ # re-pre-load round[2]
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vncipher $out0,$out0,v30
 vncipher $out1,$out1,v30
 vand $tmp,$tmp,$eighty7
 vncipher $out2,$out2,v30
 vncipher $out3,$out3,v30
-vxor $tweak,$tweak,$tmp
+xxlor 32+$in5, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in5
 vncipher $out4,$out4,v30
 vncipher $out5,$out5,v30
 vxor $in5,$twk5,v31
@@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
 vncipherlast $out0,$out0,$in0
 lvx_u $in0,$x00,$inp # load next input block
 vaddubm $tweak,$tweak,$tweak
-vsldoi $tmp,$tmp,$tmp,15
 vncipherlast $out1,$out1,$in1
 lvx_u $in1,$x10,$inp
 vncipherlast $out2,$out2,$in2
@@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
 vncipherlast $out4,$out4,$in4
 le?vperm $in2,$in2,$in2,$leperm
 lvx_u $in4,$x40,$inp
-vxor $tweak,$tweak,$tmp
+xxlor 10, 32+$in0, 32+$in0
+xxlor 32+$in0, 0, 0
+vpermxor $tweak, $tweak, $tmp, $in0
+xxlor 32+$in0, 10, 10
 vncipherlast $out5,$out5,$in5
 le?vperm $in3,$in3,$in3,$leperm
 lvx_u $in5,$x50,$inp
@@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
 mtctr $rounds
 beq Loop_xts_dec6x # did $len-=96 borrow?
 
+xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
 addic. $len,$len,0x60
 beq Lxts_dec6x_zero
 cmpwi $len,0x20