crypto: arm64/sm4 - refactor and simplify CE implementation

This patch does not add new features, but only refactors and simplifies the
implementation of the Crypto Extension acceleration of the SM4 algorithm:

Extract the macro optimized by SM4 Crypto Extension for reuse in the
subsequent optimization of CCM/GCM modes.

Encryption in CBC and CFB modes processes four blocks at a time instead of
one, allowing the ld1 instruction to load 64 bytes of data at a time, which
will reduces unnecessary memory accesses.

CBC/CFB/CTR makes full use of free registers to reduce redundant memory
accesses, and rearranges some instructions to improve out-of-order execution
capabilities.

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Tianjia Zhang 2022-10-27 14:54:58 +08:00 committed by Herbert Xu
parent 3c3836378d
commit ce41fefd24
3 changed files with 515 additions and 396 deletions

View File

@ -0,0 +1,209 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 helper macros for Crypto Extensions
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
#define SM4_PREPARE(ptr) \
ld1 {v24.16b-v27.16b}, [ptr], #64; \
ld1 {v28.16b-v31.16b}, [ptr];
#define SM4_CRYPT_BLK_BE(b0) \
sm4e b0.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
rev32 b0.16b, b0.16b;
#define SM4_CRYPT_BLK(b0) \
rev32 b0.16b, b0.16b; \
SM4_CRYPT_BLK_BE(b0);
#define SM4_CRYPT_BLK2_BE(b0, b1) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
#define SM4_CRYPT_BLK2(b0, b1) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
SM4_CRYPT_BLK2_BE(b0, b1);
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b;
#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);
#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b4.4s, v24.4s; \
sm4e b5.4s, v24.4s; \
sm4e b6.4s, v24.4s; \
sm4e b7.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b4.4s, v25.4s; \
sm4e b5.4s, v25.4s; \
sm4e b6.4s, v25.4s; \
sm4e b7.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b4.4s, v26.4s; \
sm4e b5.4s, v26.4s; \
sm4e b6.4s, v26.4s; \
sm4e b7.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b4.4s, v27.4s; \
sm4e b5.4s, v27.4s; \
sm4e b6.4s, v27.4s; \
sm4e b7.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b4.4s, v28.4s; \
sm4e b5.4s, v28.4s; \
sm4e b6.4s, v28.4s; \
sm4e b7.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b4.4s, v29.4s; \
sm4e b5.4s, v29.4s; \
sm4e b6.4s, v29.4s; \
sm4e b7.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b4.4s, v30.4s; \
sm4e b5.4s, v30.4s; \
sm4e b6.4s, v30.4s; \
sm4e b7.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
sm4e b4.4s, v31.4s; \
sm4e b5.4s, v31.4s; \
sm4e b6.4s, v31.4s; \
sm4e b7.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
rev64 b4.4s, b4.4s; \
rev64 b5.4s, b5.4s; \
rev64 b6.4s, b6.4s; \
rev64 b7.4s, b7.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
ext b4.16b, b4.16b, b4.16b, #8; \
ext b5.16b, b5.16b, b5.16b, #8; \
ext b6.16b, b6.16b, b6.16b, #8; \
ext b7.16b, b7.16b, b7.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b;
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b; \
SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);

View File

@ -10,10 +10,12 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"
.arch armv8-a+crypto
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
20, 24, 25, 26, 27, 28, 29, 30, 31
.set .Lv\b\().4s, \b
.endr
@ -34,174 +36,6 @@
#define RIV v20
/* Helper macros. */
#define PREPARE \
ld1 {v24.16b-v27.16b}, [x0], #64; \
ld1 {v28.16b-v31.16b}, [x0];
#define SM4_CRYPT_BLK(b0) \
rev32 b0.16b, b0.16b; \
sm4e b0.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
rev32 b0.16b, b0.16b;
#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b;
#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b; \
sm4e b0.4s, v24.4s; \
sm4e b1.4s, v24.4s; \
sm4e b2.4s, v24.4s; \
sm4e b3.4s, v24.4s; \
sm4e b4.4s, v24.4s; \
sm4e b5.4s, v24.4s; \
sm4e b6.4s, v24.4s; \
sm4e b7.4s, v24.4s; \
sm4e b0.4s, v25.4s; \
sm4e b1.4s, v25.4s; \
sm4e b2.4s, v25.4s; \
sm4e b3.4s, v25.4s; \
sm4e b4.4s, v25.4s; \
sm4e b5.4s, v25.4s; \
sm4e b6.4s, v25.4s; \
sm4e b7.4s, v25.4s; \
sm4e b0.4s, v26.4s; \
sm4e b1.4s, v26.4s; \
sm4e b2.4s, v26.4s; \
sm4e b3.4s, v26.4s; \
sm4e b4.4s, v26.4s; \
sm4e b5.4s, v26.4s; \
sm4e b6.4s, v26.4s; \
sm4e b7.4s, v26.4s; \
sm4e b0.4s, v27.4s; \
sm4e b1.4s, v27.4s; \
sm4e b2.4s, v27.4s; \
sm4e b3.4s, v27.4s; \
sm4e b4.4s, v27.4s; \
sm4e b5.4s, v27.4s; \
sm4e b6.4s, v27.4s; \
sm4e b7.4s, v27.4s; \
sm4e b0.4s, v28.4s; \
sm4e b1.4s, v28.4s; \
sm4e b2.4s, v28.4s; \
sm4e b3.4s, v28.4s; \
sm4e b4.4s, v28.4s; \
sm4e b5.4s, v28.4s; \
sm4e b6.4s, v28.4s; \
sm4e b7.4s, v28.4s; \
sm4e b0.4s, v29.4s; \
sm4e b1.4s, v29.4s; \
sm4e b2.4s, v29.4s; \
sm4e b3.4s, v29.4s; \
sm4e b4.4s, v29.4s; \
sm4e b5.4s, v29.4s; \
sm4e b6.4s, v29.4s; \
sm4e b7.4s, v29.4s; \
sm4e b0.4s, v30.4s; \
sm4e b1.4s, v30.4s; \
sm4e b2.4s, v30.4s; \
sm4e b3.4s, v30.4s; \
sm4e b4.4s, v30.4s; \
sm4e b5.4s, v30.4s; \
sm4e b6.4s, v30.4s; \
sm4e b7.4s, v30.4s; \
sm4e b0.4s, v31.4s; \
sm4e b1.4s, v31.4s; \
sm4e b2.4s, v31.4s; \
sm4e b3.4s, v31.4s; \
sm4e b4.4s, v31.4s; \
sm4e b5.4s, v31.4s; \
sm4e b6.4s, v31.4s; \
sm4e b7.4s, v31.4s; \
rev64 b0.4s, b0.4s; \
rev64 b1.4s, b1.4s; \
rev64 b2.4s, b2.4s; \
rev64 b3.4s, b3.4s; \
rev64 b4.4s, b4.4s; \
rev64 b5.4s, b5.4s; \
rev64 b6.4s, b6.4s; \
rev64 b7.4s, b7.4s; \
ext b0.16b, b0.16b, b0.16b, #8; \
ext b1.16b, b1.16b, b1.16b, #8; \
ext b2.16b, b2.16b, b2.16b, #8; \
ext b3.16b, b3.16b, b3.16b, #8; \
ext b4.16b, b4.16b, b4.16b, #8; \
ext b5.16b, b5.16b, b5.16b, #8; \
ext b6.16b, b6.16b, b6.16b, #8; \
ext b7.16b, b7.16b, b7.16b, #8; \
rev32 b0.16b, b0.16b; \
rev32 b1.16b, b1.16b; \
rev32 b2.16b, b2.16b; \
rev32 b3.16b, b3.16b; \
rev32 b4.16b, b4.16b; \
rev32 b5.16b, b5.16b; \
rev32 b6.16b, b6.16b; \
rev32 b7.16b, b7.16b;
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
@ -268,7 +102,7 @@ SYM_FUNC_START(sm4_ce_crypt_block)
* x1: dst
* x2: src
*/
PREPARE;
SM4_PREPARE(x0)
ld1 {v0.16b}, [x2];
SM4_CRYPT_BLK(v0);
@ -285,7 +119,7 @@ SYM_FUNC_START(sm4_ce_crypt)
* x2: src
* w3: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
.Lcrypt_loop_blk:
sub w3, w3, #8;
@ -337,26 +171,50 @@ SYM_FUNC_START(sm4_ce_cbc_enc)
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3];
ld1 {RIV.16b}, [x3]
.Lcbc_enc_loop:
sub w4, w4, #1;
.Lcbc_enc_loop_4x:
cmp w4, #4
blt .Lcbc_enc_loop_1x
ld1 {RTMP0.16b}, [x2], #16;
eor RIV.16b, RIV.16b, RTMP0.16b;
sub w4, w4, #4
SM4_CRYPT_BLK(RIV);
ld1 {v0.16b-v3.16b}, [x2], #64
st1 {RIV.16b}, [x1], #16;
eor v0.16b, v0.16b, RIV.16b
SM4_CRYPT_BLK(v0)
eor v1.16b, v1.16b, v0.16b
SM4_CRYPT_BLK(v1)
eor v2.16b, v2.16b, v1.16b
SM4_CRYPT_BLK(v2)
eor v3.16b, v3.16b, v2.16b
SM4_CRYPT_BLK(v3)
cbnz w4, .Lcbc_enc_loop;
st1 {v0.16b-v3.16b}, [x1], #64
mov RIV.16b, v3.16b
cbz w4, .Lcbc_enc_end
b .Lcbc_enc_loop_4x
.Lcbc_enc_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
eor RIV.16b, RIV.16b, v0.16b
SM4_CRYPT_BLK(RIV)
st1 {RIV.16b}, [x1], #16
cbnz w4, .Lcbc_enc_loop_1x
.Lcbc_enc_end:
/* store new IV */
st1 {RIV.16b}, [x3];
st1 {RIV.16b}, [x3]
ret;
ret
SYM_FUNC_END(sm4_ce_cbc_enc)
.align 3
@ -368,79 +226,93 @@ SYM_FUNC_START(sm4_ce_cbc_dec)
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3];
ld1 {RIV.16b}, [x3]
.Lcbc_loop_blk:
sub w4, w4, #8;
tbnz w4, #31, .Lcbc_tail8;
.Lcbc_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lcbc_dec_4x
ld1 {v0.16b-v3.16b}, [x2], #64;
ld1 {v4.16b-v7.16b}, [x2];
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
rev32 v12.16b, v4.16b
rev32 v13.16b, v5.16b
rev32 v14.16b, v6.16b
rev32 v15.16b, v7.16b
sub x2, x2, #64;
eor v0.16b, v0.16b, RIV.16b;
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v1.16b, v1.16b, RTMP0.16b;
eor v2.16b, v2.16b, RTMP1.16b;
eor v3.16b, v3.16b, RTMP2.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
eor v4.16b, v4.16b, RTMP3.16b;
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v5.16b, v5.16b, RTMP0.16b;
eor v6.16b, v6.16b, RTMP1.16b;
eor v7.16b, v7.16b, RTMP2.16b;
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
eor v12.16b, v12.16b, v3.16b
eor v13.16b, v13.16b, v4.16b
eor v14.16b, v14.16b, v5.16b
eor v15.16b, v15.16b, v6.16b
mov RIV.16b, RTMP3.16b;
st1 {v4.16b-v7.16b}, [x1], #64;
st1 {v8.16b-v11.16b}, [x1], #64
st1 {v12.16b-v15.16b}, [x1], #64
cbz w4, .Lcbc_end;
b .Lcbc_loop_blk;
mov RIV.16b, v7.16b
.Lcbc_tail8:
add w4, w4, #8;
cmp w4, #4;
blt .Lcbc_tail4;
cbz w4, .Lcbc_dec_end
b .Lcbc_dec_loop_8x
sub w4, w4, #4;
.Lcbc_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lcbc_dec_loop_1x
ld1 {v0.16b-v3.16b}, [x2];
sub w4, w4, #4
SM4_CRYPT_BLK4(v0, v1, v2, v3);
ld1 {v0.16b-v3.16b}, [x2], #64
eor v0.16b, v0.16b, RIV.16b;
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v1.16b, v1.16b, RTMP0.16b;
eor v2.16b, v2.16b, RTMP1.16b;
eor v3.16b, v3.16b, RTMP2.16b;
rev32 v8.16b, v0.16b
rev32 v9.16b, v1.16b
rev32 v10.16b, v2.16b
rev32 v11.16b, v3.16b
mov RIV.16b, RTMP3.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
cbz w4, .Lcbc_end;
eor v8.16b, v8.16b, RIV.16b
eor v9.16b, v9.16b, v0.16b
eor v10.16b, v10.16b, v1.16b
eor v11.16b, v11.16b, v2.16b
.Lcbc_tail4:
sub w4, w4, #1;
st1 {v8.16b-v11.16b}, [x1], #64
ld1 {v0.16b}, [x2];
mov RIV.16b, v3.16b
SM4_CRYPT_BLK(v0);
cbz w4, .Lcbc_dec_end
eor v0.16b, v0.16b, RIV.16b;
ld1 {RIV.16b}, [x2], #16;
st1 {v0.16b}, [x1], #16;
.Lcbc_dec_loop_1x:
sub w4, w4, #1
cbnz w4, .Lcbc_tail4;
ld1 {v0.16b}, [x2], #16
.Lcbc_end:
rev32 v8.16b, v0.16b
SM4_CRYPT_BLK_BE(v8)
eor v8.16b, v8.16b, RIV.16b
st1 {v8.16b}, [x1], #16
mov RIV.16b, v0.16b
cbnz w4, .Lcbc_dec_loop_1x
.Lcbc_dec_end:
/* store new IV */
st1 {RIV.16b}, [x3];
st1 {RIV.16b}, [x3]
ret;
ret
SYM_FUNC_END(sm4_ce_cbc_dec)
.align 3
@ -452,25 +324,57 @@ SYM_FUNC_START(sm4_ce_cfb_enc)
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
ld1 {RIV.16b}, [x3];
ld1 {RIV.16b}, [x3]
.Lcfb_enc_loop:
sub w4, w4, #1;
.Lcfb_enc_loop_4x:
cmp w4, #4
blt .Lcfb_enc_loop_1x
SM4_CRYPT_BLK(RIV);
sub w4, w4, #4
ld1 {RTMP0.16b}, [x2], #16;
eor RIV.16b, RIV.16b, RTMP0.16b;
st1 {RIV.16b}, [x1], #16;
ld1 {v0.16b-v3.16b}, [x2], #64
cbnz w4, .Lcfb_enc_loop;
rev32 v8.16b, RIV.16b
SM4_CRYPT_BLK_BE(v8)
eor v0.16b, v0.16b, v8.16b
rev32 v8.16b, v0.16b
SM4_CRYPT_BLK_BE(v8)
eor v1.16b, v1.16b, v8.16b
rev32 v8.16b, v1.16b
SM4_CRYPT_BLK_BE(v8)
eor v2.16b, v2.16b, v8.16b
rev32 v8.16b, v2.16b
SM4_CRYPT_BLK_BE(v8)
eor v3.16b, v3.16b, v8.16b
st1 {v0.16b-v3.16b}, [x1], #64
mov RIV.16b, v3.16b
cbz w4, .Lcfb_enc_end
b .Lcfb_enc_loop_4x
.Lcfb_enc_loop_1x:
sub w4, w4, #1
ld1 {v0.16b}, [x2], #16
SM4_CRYPT_BLK(RIV)
eor RIV.16b, RIV.16b, v0.16b
st1 {RIV.16b}, [x1], #16
cbnz w4, .Lcfb_enc_loop_1x
.Lcfb_enc_end:
/* store new IV */
st1 {RIV.16b}, [x3];
st1 {RIV.16b}, [x3]
ret;
ret
SYM_FUNC_END(sm4_ce_cfb_enc)
.align 3
@ -482,79 +386,91 @@ SYM_FUNC_START(sm4_ce_cfb_dec)
* x3: iv (big endian, 128 bit)
* w4: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
ld1 {v0.16b}, [x3];
ld1 {RIV.16b}, [x3]
.Lcfb_loop_blk:
sub w4, w4, #8;
tbnz w4, #31, .Lcfb_tail8;
.Lcfb_dec_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lcfb_dec_4x
ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
ld1 {v4.16b-v7.16b}, [x2];
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {v4.16b-v7.16b}, [x2], #64
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
rev32 v8.16b, RIV.16b
rev32 v9.16b, v0.16b
rev32 v10.16b, v1.16b
rev32 v11.16b, v2.16b
rev32 v12.16b, v3.16b
rev32 v13.16b, v4.16b
rev32 v14.16b, v5.16b
rev32 v15.16b, v6.16b
sub x2, x2, #48;
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v0.16b, v0.16b, RTMP0.16b;
eor v1.16b, v1.16b, RTMP1.16b;
eor v2.16b, v2.16b, RTMP2.16b;
eor v3.16b, v3.16b, RTMP3.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v4.16b, v4.16b, RTMP0.16b;
eor v5.16b, v5.16b, RTMP1.16b;
eor v6.16b, v6.16b, RTMP2.16b;
eor v7.16b, v7.16b, RTMP3.16b;
st1 {v4.16b-v7.16b}, [x1], #64;
mov RIV.16b, v7.16b
mov v0.16b, RTMP3.16b;
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
cbz w4, .Lcfb_end;
b .Lcfb_loop_blk;
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
.Lcfb_tail8:
add w4, w4, #8;
cmp w4, #4;
blt .Lcfb_tail4;
cbz w4, .Lcfb_dec_end
b .Lcfb_dec_loop_8x
sub w4, w4, #4;
.Lcfb_dec_4x:
add w4, w4, #8
cmp w4, #4
blt .Lcfb_dec_loop_1x
ld1 {v1.16b, v2.16b, v3.16b}, [x2];
sub w4, w4, #4
SM4_CRYPT_BLK4(v0, v1, v2, v3);
ld1 {v0.16b-v3.16b}, [x2], #64
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v0.16b, v0.16b, RTMP0.16b;
eor v1.16b, v1.16b, RTMP1.16b;
eor v2.16b, v2.16b, RTMP2.16b;
eor v3.16b, v3.16b, RTMP3.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
rev32 v8.16b, RIV.16b
rev32 v9.16b, v0.16b
rev32 v10.16b, v1.16b
rev32 v11.16b, v2.16b
mov v0.16b, RTMP3.16b;
SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)
cbz w4, .Lcfb_end;
mov RIV.16b, v3.16b
.Lcfb_tail4:
sub w4, w4, #1;
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
SM4_CRYPT_BLK(v0);
st1 {v0.16b-v3.16b}, [x1], #64
ld1 {RTMP0.16b}, [x2], #16;
eor v0.16b, v0.16b, RTMP0.16b;
st1 {v0.16b}, [x1], #16;
cbz w4, .Lcfb_dec_end
mov v0.16b, RTMP0.16b;
.Lcfb_dec_loop_1x:
sub w4, w4, #1
cbnz w4, .Lcfb_tail4;
ld1 {v0.16b}, [x2], #16
.Lcfb_end:
SM4_CRYPT_BLK(RIV)
eor RIV.16b, RIV.16b, v0.16b
st1 {RIV.16b}, [x1], #16
mov RIV.16b, v0.16b
cbnz w4, .Lcfb_dec_loop_1x
.Lcfb_dec_end:
/* store new IV */
st1 {v0.16b}, [x3];
st1 {RIV.16b}, [x3]
ret;
ret
SYM_FUNC_END(sm4_ce_cfb_dec)
.align 3
@ -566,95 +482,99 @@ SYM_FUNC_START(sm4_ce_ctr_enc)
* x3: ctr (big endian, 128 bit)
* w4: nblocks
*/
PREPARE;
SM4_PREPARE(x0)
ldp x7, x8, [x3];
rev x7, x7;
rev x8, x8;
ldp x7, x8, [x3]
rev x7, x7
rev x8, x8
.Lctr_loop_blk:
sub w4, w4, #8;
tbnz w4, #31, .Lctr_tail8;
.Lctr_loop_8x:
sub w4, w4, #8
tbnz w4, #31, .Lctr_4x
#define inc_le128(vctr) \
mov vctr.d[1], x8; \
mov vctr.d[0], x7; \
adds x8, x8, #1; \
adc x7, x7, xzr; \
rev64 vctr.16b, vctr.16b;
#define inc_le128(vctr) \
mov vctr.d[1], x8; \
mov vctr.d[0], x7; \
adds x8, x8, #1; \
rev64 vctr.16b, vctr.16b; \
adc x7, x7, xzr;
/* construct CTRs */
inc_le128(v0); /* +0 */
inc_le128(v1); /* +1 */
inc_le128(v2); /* +2 */
inc_le128(v3); /* +3 */
inc_le128(v4); /* +4 */
inc_le128(v5); /* +5 */
inc_le128(v6); /* +6 */
inc_le128(v7); /* +7 */
inc_le128(v0) /* +0 */
inc_le128(v1) /* +1 */
inc_le128(v2) /* +2 */
inc_le128(v3) /* +3 */
inc_le128(v4) /* +4 */
inc_le128(v5) /* +5 */
inc_le128(v6) /* +6 */
inc_le128(v7) /* +7 */
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
ld1 {v8.16b-v11.16b}, [x2], #64
ld1 {v12.16b-v15.16b}, [x2], #64
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v0.16b, v0.16b, RTMP0.16b;
eor v1.16b, v1.16b, RTMP1.16b;
eor v2.16b, v2.16b, RTMP2.16b;
eor v3.16b, v3.16b, RTMP3.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v4.16b, v4.16b, RTMP0.16b;
eor v5.16b, v5.16b, RTMP1.16b;
eor v6.16b, v6.16b, RTMP2.16b;
eor v7.16b, v7.16b, RTMP3.16b;
st1 {v4.16b-v7.16b}, [x1], #64;
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
eor v4.16b, v4.16b, v12.16b
eor v5.16b, v5.16b, v13.16b
eor v6.16b, v6.16b, v14.16b
eor v7.16b, v7.16b, v15.16b
cbz w4, .Lctr_end;
b .Lctr_loop_blk;
st1 {v0.16b-v3.16b}, [x1], #64
st1 {v4.16b-v7.16b}, [x1], #64
.Lctr_tail8:
add w4, w4, #8;
cmp w4, #4;
blt .Lctr_tail4;
cbz w4, .Lctr_end
b .Lctr_loop_8x
sub w4, w4, #4;
.Lctr_4x:
add w4, w4, #8
cmp w4, #4
blt .Lctr_loop_1x
sub w4, w4, #4
/* construct CTRs */
inc_le128(v0); /* +0 */
inc_le128(v1); /* +1 */
inc_le128(v2); /* +2 */
inc_le128(v3); /* +3 */
inc_le128(v0) /* +0 */
inc_le128(v1) /* +1 */
inc_le128(v2) /* +2 */
inc_le128(v3) /* +3 */
SM4_CRYPT_BLK4(v0, v1, v2, v3);
ld1 {v8.16b-v11.16b}, [x2], #64
ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
eor v0.16b, v0.16b, RTMP0.16b;
eor v1.16b, v1.16b, RTMP1.16b;
eor v2.16b, v2.16b, RTMP2.16b;
eor v3.16b, v3.16b, RTMP3.16b;
st1 {v0.16b-v3.16b}, [x1], #64;
SM4_CRYPT_BLK4(v0, v1, v2, v3)
cbz w4, .Lctr_end;
eor v0.16b, v0.16b, v8.16b
eor v1.16b, v1.16b, v9.16b
eor v2.16b, v2.16b, v10.16b
eor v3.16b, v3.16b, v11.16b
.Lctr_tail4:
sub w4, w4, #1;
st1 {v0.16b-v3.16b}, [x1], #64
cbz w4, .Lctr_end
.Lctr_loop_1x:
sub w4, w4, #1
/* construct CTRs */
inc_le128(v0);
inc_le128(v0)
SM4_CRYPT_BLK(v0);
ld1 {v8.16b}, [x2], #16
ld1 {RTMP0.16b}, [x2], #16;
eor v0.16b, v0.16b, RTMP0.16b;
st1 {v0.16b}, [x1], #16;
SM4_CRYPT_BLK(v0)
cbnz w4, .Lctr_tail4;
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x1], #16
cbnz w4, .Lctr_loop_1x
.Lctr_end:
/* store new CTR */
rev x7, x7;
rev x8, x8;
stp x7, x8, [x3];
rev x7, x7
rev x8, x8
stp x7, x8, [x3]
ret;
ret
SYM_FUNC_END(sm4_ce_ctr_enc)

View File

@ -26,9 +26,9 @@ asmlinkage void sm4_ce_crypt_block(const u32 *rkey, u8 *dst, const u8 *src);
asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
unsigned int nblks);
asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks);
u8 *iv, unsigned int nblocks);
asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks);
u8 *iv, unsigned int nblocks);
asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
u8 *iv, unsigned int nblks);
asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
@ -94,66 +94,56 @@ static int sm4_ecb_decrypt(struct skcipher_request *req)
return sm4_ecb_do_crypt(req, ctx->rkey_dec);
}
static int sm4_cbc_encrypt(struct skcipher_request *req)
static int sm4_cbc_crypt(struct skcipher_request *req,
struct sm4_ctx *ctx, bool encrypt)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
while ((nbytes = walk.nbytes) > 0) {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
unsigned int nblks;
unsigned int nblocks;
kernel_neon_begin();
nblocks = nbytes / SM4_BLOCK_SIZE;
if (nblocks) {
kernel_neon_begin();
nblks = BYTES2BLKS(nbytes);
if (nblks) {
sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
nbytes -= nblks * SM4_BLOCK_SIZE;
if (encrypt)
sm4_ce_cbc_enc(ctx->rkey_enc, dst, src,
walk.iv, nblocks);
else
sm4_ce_cbc_dec(ctx->rkey_dec, dst, src,
walk.iv, nblocks);
kernel_neon_end();
}
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes);
err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
}
return err;
}
static int sm4_cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
return sm4_cbc_crypt(req, ctx, true);
}
static int sm4_cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
unsigned int nbytes;
int err;
err = skcipher_walk_virt(&walk, req, false);
while ((nbytes = walk.nbytes) > 0) {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
unsigned int nblks;
kernel_neon_begin();
nblks = BYTES2BLKS(nbytes);
if (nblks) {
sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
nbytes -= nblks * SM4_BLOCK_SIZE;
}
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes);
}
return err;
return sm4_cbc_crypt(req, ctx, false);
}
static int sm4_cfb_encrypt(struct skcipher_request *req)