crypto: arm64/sm4 - add CE implementation for cmac/xcbc/cbcmac

This patch adds a CE-optimized assembly implementation of the cmac/xcbc/cbcmac
modes for SM4.

Benchmark on T-Head Yitian-710 at 2.75 GHz. The data comes from mode 300 of
tcrypt and compares the performance before and after this patch (the driver
used before this patch is XXXmac(sm4-ce)). The abscissas are blocks of
different lengths. The data is tabulated and the unit is Mb/s:

Before:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac(sm4-ce)   |  293.33  403.69  503.76  527.78  531.10  535.46  535.81
xcbc(sm4-ce)   |  292.83  402.50  504.02  529.08  529.87  536.55  538.24
cbcmac(sm4-ce) |  318.42  415.79  497.12  515.05  523.15  521.19  523.01

After:

update-size    |      16      64     256    1024    2048    4096    8192
---------------+--------------------------------------------------------
cmac-sm4-ce    |  371.99  675.28  903.56  971.65  980.57  990.40  991.04
xcbc-sm4-ce    |  372.11  674.55  903.47  971.61  980.96  990.42  991.10
cbcmac-sm4-ce  |  371.63  675.33  903.23  972.07  981.42  990.93  991.45

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Tianjia Zhang 2022-10-27 14:55:03 +08:00 committed by Herbert Xu
parent 01f633113b
commit 6b5360a5e0
2 changed files with 336 additions and 1 deletions

View File

@ -35,6 +35,7 @@
#define RTMP3 v19
#define RIV v20
/*
 * RMAC names the same vector register (v20) as RIV.
 * NOTE(review): presumably no routine needs both live at once - confirm.
 */
#define RMAC v20
#define RMASK v21
@ -1007,6 +1008,75 @@ SYM_FUNC_START(sm4_ce_xts_dec)
ret
SYM_FUNC_END(sm4_ce_xts_dec)
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
/*
 * Fold 16-byte blocks from src into the running MAC value stored at digest.
 * Each block is XORed into RMAC; RMAC is then passed through SM4_CRYPT_BLK
 * (defined elsewhere; presumably encrypts RMAC in place with the round keys
 * set up by SM4_PREPARE - confirm against the macro definitions).
 *
 * input:
 * x0: round key array, CTX
 * x1: digest (16 bytes, loaded on entry and stored back on return)
 * x2: src (read 16 bytes per block, post-incremented)
 * w3: nblocks
 * w4: enc_before - non-zero: run SM4_CRYPT_BLK on the loaded digest before
 *     any src block is XORed in
 * w5: enc_after - non-zero: every src block is followed by SM4_CRYPT_BLK;
 *     zero: the last src block is only XORed in, not encrypted
 *
 * Scratch used directly by this routine: w6, v0-v3 and the condition flags
 * (plus whatever SM4_PREPARE / SM4_CRYPT_BLK use).
 */
SM4_PREPARE(x0)
/* RMAC = current digest */
ld1 {RMAC.16b}, [x1]
/* enc_before == 0: skip the leading encryption */
cbz w4, .Lmac_update
SM4_CRYPT_BLK(RMAC)
.Lmac_update:
/* nblocks == 0: nothing to absorb, just store RMAC back */
cbz w3, .Lmac_ret
/* w3 = enc_after ? nblocks : nblocks - 1 (number of XOR+encrypt rounds) */
sub w6, w3, #1
cmp w5, wzr
csel w3, w3, w6, ne
/* no XOR+encrypt rounds left: go handle the final XOR-only block */
cbz w3, .Lmac_end
.Lmac_loop_4x:
/* fewer than 4 rounds remaining: fall back to one block per iteration */
cmp w3, #4
blt .Lmac_loop_1x
sub w3, w3, #4
/* load 4 blocks with one instruction; x2 advances by 64 bytes */
ld1 {v0.16b-v3.16b}, [x2], #64
/* the four rounds are chained: each one consumes the previous RMAC */
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v1.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v2.16b
SM4_CRYPT_BLK(RMAC)
eor RMAC.16b, RMAC.16b, v3.16b
SM4_CRYPT_BLK(RMAC)
cbz w3, .Lmac_end
b .Lmac_loop_4x
.Lmac_loop_1x:
/* only entered with w3 != 0; one XOR+encrypt round per iteration */
sub w3, w3, #1
ld1 {v0.16b}, [x2], #16
eor RMAC.16b, RMAC.16b, v0.16b
SM4_CRYPT_BLK(RMAC)
cbnz w3, .Lmac_loop_1x
.Lmac_end:
/* enc_after != 0: all nblocks were already XORed and encrypted */
cbnz w5, .Lmac_ret
/* enc_after == 0: XOR the last block into RMAC without encrypting it */
ld1 {v0.16b}, [x2], #16
eor RMAC.16b, RMAC.16b, v0.16b
.Lmac_ret:
/* write the updated MAC value back to digest */
st1 {RMAC.16b}, [x1]
ret
SYM_FUNC_END(sm4_ce_mac_update)
.section ".rodata", "a"
.align 4

View File

@ -14,8 +14,10 @@
#include <linux/cpufeature.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/b128ops.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/internal/hash.h>
#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <crypto/sm4.h>
@ -47,6 +49,9 @@ asmlinkage void sm4_ce_xts_enc(const u32 *rkey1, u8 *dst, const u8 *src,
asmlinkage void sm4_ce_xts_dec(const u32 *rkey1, u8 *dst, const u8 *src,
u8 *tweak, unsigned int nbytes,
const u32 *rkey2_enc);
/*
 * Assembly routine: XOR @nblocks 16-byte blocks from @src into the 16-byte
 * MAC value at @digest, updating @digest in place.
 * @enc_before: run the block cipher on @digest before absorbing any block.
 * @enc_after: if true, every absorbed block is followed by a block-cipher
 *             call; if false, the last block is only XORed into @digest.
 * Callers in this file invoke it between kernel_neon_begin() and
 * kernel_neon_end().
 */
asmlinkage void sm4_ce_mac_update(const u32 *rkey_enc, u8 *digest,
const u8 *src, unsigned int nblocks,
bool enc_before, bool enc_after);
EXPORT_SYMBOL(sm4_ce_expand_key);
EXPORT_SYMBOL(sm4_ce_crypt_block);
@ -58,6 +63,16 @@ struct sm4_xts_ctx {
struct sm4_ctx key2;
};
/*
 * Per-transform (per-key) context shared by the cmac/xcbc/cbcmac shashes.
 */
struct sm4_mac_tfm_ctx {
/* expanded SM4 round keys (rkey_enc / rkey_dec), filled in by setkey */
struct sm4_ctx key;
/*
 * Trailing storage for two SM4_BLOCK_SIZE constants written by the cmac
 * and xcbc setkey routines and consumed by sm4_cmac_final(). The space is
 * reserved through cra_ctxsize; cbcmac reserves none and never touches it.
 */
u8 __aligned(8) consts[];
};
/*
 * Per-request state of a MAC computation.
 */
struct sm4_mac_desc_ctx {
/*
 * Number of message bytes already XORed into digest[] but not yet
 * encrypted (0..SM4_BLOCK_SIZE).
 */
unsigned int len;
/* running MAC value; zeroed by sm4_mac_init() */
u8 digest[SM4_BLOCK_SIZE];
};
static int sm4_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int key_len)
{
@ -594,13 +609,260 @@ static struct skcipher_alg sm4_algs[] = {
}
};
/*
 * Install the key for cbcmac(sm4).
 *
 * Returns -EINVAL unless @key_len equals SM4_KEY_SIZE; otherwise expands
 * @key into the encryption and decryption round keys of the transform
 * context and returns 0.
 */
static int sm4_cbcmac_setkey(struct crypto_shash *tfm, const u8 *key,
			     unsigned int key_len)
{
	struct sm4_mac_tfm_ctx *tctx;

	if (key_len != SM4_KEY_SIZE)
		return -EINVAL;

	tctx = crypto_shash_ctx(tfm);

	/* The CE key schedule runs inside a kernel-mode NEON section. */
	kernel_neon_begin();
	sm4_ce_expand_key(key, tctx->key.rkey_enc, tctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);
	kernel_neon_end();

	return 0;
}
/*
 * Install the key for cmac(sm4) and derive the two CMAC subkeys.
 *
 * Returns -EINVAL unless @key_len equals SM4_KEY_SIZE. Otherwise the key is
 * expanded, an all-zero block is encrypted with it, and that ciphertext is
 * doubled in GF(2^128) once for the first subkey and once more for the
 * second. Both subkeys are stored big-endian in the context's consts[] area.
 */
static int sm4_cmac_setkey(struct crypto_shash *tfm, const u8 *key,
			   unsigned int key_len)
{
	struct sm4_mac_tfm_ctx *tctx;
	be128 *subkeys;
	u64 hi, lo, carry;
	unsigned int i;

	if (key_len != SM4_KEY_SIZE)
		return -EINVAL;

	tctx = crypto_shash_ctx(tfm);
	subkeys = (be128 *)tctx->consts;

	/* Start from the zero block; it is encrypted in place below. */
	memset(subkeys, 0, SM4_BLOCK_SIZE);

	kernel_neon_begin();

	sm4_ce_expand_key(key, tctx->key.rkey_enc, tctx->key.rkey_dec,
			  crypto_sm4_fk, crypto_sm4_ck);

	/* subkeys[0] = E_K(0^128) */
	sm4_ce_crypt_block(tctx->key.rkey_enc, (u8 *)subkeys,
			   (const u8 *)subkeys);

	kernel_neon_end();

	/*
	 * Multiply the zero-ciphertext by u and then by u^2 in GF(2^128):
	 * each step is a 128-bit left shift by one, folding 0x87 into the
	 * low byte when the bit shifted out of the top was set.
	 */
	hi = be64_to_cpu(subkeys[0].a);
	lo = be64_to_cpu(subkeys[0].b);

	for (i = 0; i < 2; i++) {
		carry = hi >> 63;
		hi = (hi << 1) | (lo >> 63);
		lo = (lo << 1) ^ (carry ? 0x87 : 0);

		subkeys[i].a = cpu_to_be64(hi);
		subkeys[i].b = cpu_to_be64(lo);
	}

	return 0;
}
static int sm4_xcbc_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int key_len)
{
struct sm4_mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
u8 __aligned(8) key2[SM4_BLOCK_SIZE];
static u8 const ks[3][SM4_BLOCK_SIZE] = {
{ [0 ... SM4_BLOCK_SIZE - 1] = 0x1},
{ [0 ... SM4_BLOCK_SIZE - 1] = 0x2},
{ [0 ... SM4_BLOCK_SIZE - 1] = 0x3},
};
if (key_len != SM4_KEY_SIZE)
return -EINVAL;
kernel_neon_begin();
sm4_ce_expand_key(key, ctx->key.rkey_enc, ctx->key.rkey_dec,
crypto_sm4_fk, crypto_sm4_ck);
sm4_ce_crypt_block(ctx->key.rkey_enc, key2, ks[0]);
sm4_ce_crypt(ctx->key.rkey_enc, ctx->consts, ks[1], 2);
sm4_ce_expand_key(key2, ctx->key.rkey_enc, ctx->key.rkey_dec,
crypto_sm4_fk, crypto_sm4_ck);
kernel_neon_end();
return 0;
}
/*
 * Reset the per-request MAC state: zero running value, no pending bytes.
 * Always returns 0.
 */
static int sm4_mac_init(struct shash_desc *desc)
{
	struct sm4_mac_desc_ctx *dctx = shash_desc_ctx(desc);

	dctx->len = 0;
	memset(dctx->digest, 0, sizeof(dctx->digest));

	return 0;
}
/*
 * Absorb @len bytes at @p into the MAC state of @desc. Always returns 0.
 *
 * State kept in the descriptor context: ctx->digest is the running MAC value
 * and ctx->len counts the message bytes that have been XORed into it but not
 * yet encrypted. ctx->len == SM4_BLOCK_SIZE means a complete block is
 * pending; its encryption is deferred until more data arrives or until the
 * final routine runs.
 */
static int sm4_mac_update(struct shash_desc *desc, const u8 *p,
unsigned int len)
{
struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc);
unsigned int l, nblocks;
if (len == 0)
return 0;
/*
 * Partial block pending, or not enough data to reach a block boundary:
 * XOR as many bytes as fit into the rest of the current block. When a full
 * block is already pending (ctx->len == SM4_BLOCK_SIZE), l is 0 and this
 * is a no-op.
 */
if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) {
l = min(len, SM4_BLOCK_SIZE - ctx->len);
crypto_xor(ctx->digest + ctx->len, p, l);
ctx->len += l;
len -= l;
p += l;
}
/*
 * Data remains and ctx->len is 0 or SM4_BLOCK_SIZE. (If data remains after
 * the step above, the pending block was filled completely.)
 */
if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) {
kernel_neon_begin();
if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) {
/*
 * Less than one more block follows the pending full block: encrypt
 * the pending block now; the tail is XORed in after the NEON section.
 */
sm4_ce_crypt_block(tctx->key.rkey_enc,
ctx->digest, ctx->digest);
ctx->len = 0;
} else {
/* At least one whole block is available here (len >= SM4_BLOCK_SIZE). */
nblocks = len / SM4_BLOCK_SIZE;
len %= SM4_BLOCK_SIZE;
/*
 * enc_before: encrypt the pending full block first, if there is one.
 * enc_after: encrypt the last whole block only when a tail follows;
 * otherwise leave it XORed in and pending.
 */
sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p,
nblocks, (ctx->len == SM4_BLOCK_SIZE),
(len != 0));
p += nblocks * SM4_BLOCK_SIZE;
/* no tail: the last whole block stays pending */
if (len == 0)
ctx->len = SM4_BLOCK_SIZE;
}
kernel_neon_end();
/* XOR the sub-block tail into the freshly encrypted MAC value */
if (len) {
crypto_xor(ctx->digest, p, len);
ctx->len = len;
}
}
return 0;
}
/*
 * Finish a cmac(sm4) / xcbc(sm4) computation and write the
 * SM4_BLOCK_SIZE-byte tag to @out. Always returns 0.
 *
 * When the pending block is complete, the first constant in consts[] is
 * mixed in. Otherwise a 0x80 padding byte is XORed in right after the
 * pending bytes and the second constant is used. The result is then
 * encrypted once.
 */
static int sm4_cmac_final(struct shash_desc *desc, u8 *out)
{
	struct sm4_mac_desc_ctx *dctx = shash_desc_ctx(desc);
	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	const bool full_block = (dctx->len == SM4_BLOCK_SIZE);
	const u8 *subkey = tctx->consts;

	if (!full_block) {
		/* incomplete (possibly empty) last block: pad, use 2nd constant */
		dctx->digest[dctx->len] ^= 0x80;
		subkey += SM4_BLOCK_SIZE;
	}

	/* One block, no leading encryption: digest = E(digest ^ subkey). */
	kernel_neon_begin();
	sm4_ce_mac_update(tctx->key.rkey_enc, dctx->digest, subkey, 1,
			  false, true);
	kernel_neon_end();

	memcpy(out, dctx->digest, SM4_BLOCK_SIZE);

	return 0;
}
/*
 * Finish a cbcmac(sm4) computation and write the SM4_BLOCK_SIZE-byte tag to
 * @out. Always returns 0.
 *
 * Bytes that were XORed into the running value but not yet encrypted
 * (a partial or a full pending block) get their encryption here. With
 * nothing pending the running value is emitted as is.
 */
static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out)
{
	struct sm4_mac_desc_ctx *dctx = shash_desc_ctx(desc);
	struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);

	if (dctx->len != 0) {
		kernel_neon_begin();
		sm4_ce_crypt_block(tctx->key.rkey_enc, dctx->digest,
				   dctx->digest);
		kernel_neon_end();
	}

	memcpy(out, dctx->digest, SM4_BLOCK_SIZE);

	return 0;
}
/*
 * MAC algorithms provided by this driver. All three share sm4_mac_init() and
 * sm4_mac_update(); they differ in setkey, in the final step and in how much
 * trailing constant storage the transform context reserves.
 */
static struct shash_alg sm4_mac_algs[] = {
{
/* cmac(sm4): context carries two block-sized subkeys after the key */
.base = {
.cra_name = "cmac(sm4)",
.cra_driver_name = "cmac-sm4-ce",
.cra_priority = 400,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx)
+ SM4_BLOCK_SIZE * 2,
.cra_module = THIS_MODULE,
},
.digestsize = SM4_BLOCK_SIZE,
.init = sm4_mac_init,
.update = sm4_mac_update,
.final = sm4_cmac_final,
.setkey = sm4_cmac_setkey,
.descsize = sizeof(struct sm4_mac_desc_ctx),
}, {
/* xcbc(sm4): same final step as cmac, different key derivation */
.base = {
.cra_name = "xcbc(sm4)",
.cra_driver_name = "xcbc-sm4-ce",
.cra_priority = 400,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx)
+ SM4_BLOCK_SIZE * 2,
.cra_module = THIS_MODULE,
},
.digestsize = SM4_BLOCK_SIZE,
.init = sm4_mac_init,
.update = sm4_mac_update,
.final = sm4_cmac_final,
.setkey = sm4_xcbc_setkey,
.descsize = sizeof(struct sm4_mac_desc_ctx),
}, {
/* cbcmac(sm4): no extra constants; advertised block size is 1 */
.base = {
.cra_name = "cbcmac(sm4)",
.cra_driver_name = "cbcmac-sm4-ce",
.cra_priority = 400,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx),
.cra_module = THIS_MODULE,
},
.digestsize = SM4_BLOCK_SIZE,
.init = sm4_mac_init,
.update = sm4_mac_update,
.final = sm4_cbcmac_final,
.setkey = sm4_cbcmac_setkey,
.descsize = sizeof(struct sm4_mac_desc_ctx),
}
};
/*
 * Module init: register the SM4 skciphers, then the SM4 MAC shashes.
 *
 * Returns 0 on success or the error from the failing registration call. If
 * the shash registration fails, the skciphers registered just before are
 * unregistered again so that nothing is left behind.
 *
 * The previous one-line body (an unconditional
 * "return crypto_register_skciphers(...)") must not precede this sequence:
 * it would return before the MAC algorithms are ever registered.
 */
static int __init sm4_init(void)
{
	int err;

	err = crypto_register_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
	if (err)
		return err;

	err = crypto_register_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
	if (err)
		goto out_err;

	return 0;

out_err:
	/* roll back the first registration */
	crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
	return err;
}
/*
 * Module exit: unregister the MAC shashes first, then the skciphers.
 */
static void __exit sm4_exit(void)
{
crypto_unregister_shashes(sm4_mac_algs, ARRAY_SIZE(sm4_mac_algs));
crypto_unregister_skciphers(sm4_algs, ARRAY_SIZE(sm4_algs));
}
@ -616,5 +878,8 @@ MODULE_ALIAS_CRYPTO("cfb(sm4)");
MODULE_ALIAS_CRYPTO("ctr(sm4)");
MODULE_ALIAS_CRYPTO("cts(cbc(sm4))");
MODULE_ALIAS_CRYPTO("xts(sm4)");
MODULE_ALIAS_CRYPTO("cmac(sm4)");
MODULE_ALIAS_CRYPTO("xcbc(sm4)");
MODULE_ALIAS_CRYPTO("cbcmac(sm4)");
MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
MODULE_LICENSE("GPL v2");