5b2efa2bb8
Like the implementation of AESNI/AVX, this patch adds an accelerated implementation of AESNI/AVX2. In terms of code implementation, by reusing AESNI/AVX mode-related codes, the amount of code is greatly reduced. From the benchmark data, it can be seen that when the block size is 1024, compared to AVX acceleration, the performance achieved by AVX2 has increased by about 70%, it is also 7.7 times of the pure software implementation of sm4-generic. The main algorithm implementation comes from SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at: https://github.com/mjosaarinen/sm4ni This optimization supports the four modes of SM4, ECB, CBC, CFB, and CTR. Since CBC and CFB do not support multiple block parallel encryption, the optimization effect is not obvious. Benchmark on Intel i5-6200U 2.30GHz, performance data of three implementation methods, pure software sm4-generic, aesni/avx acceleration, and aesni/avx2 acceleration, the data comes from the 218 mode and 518 mode of tcrypt. The abscissas are blocks of different lengths. The data is tabulated and the unit is Mb/s: block-size | 16 64 128 256 1024 1420 4096 sm4-generic ECB enc | 60.94 70.41 72.27 73.02 73.87 73.58 73.59 ECB dec | 61.87 70.53 72.15 73.09 73.89 73.92 73.86 CBC enc | 56.71 66.31 68.05 69.84 70.02 70.12 70.24 CBC dec | 54.54 65.91 68.22 69.51 70.63 70.79 70.82 CFB enc | 57.21 67.24 69.10 70.25 70.73 70.52 71.42 CFB dec | 57.22 64.74 66.31 67.24 67.40 67.64 67.58 CTR enc | 59.47 68.64 69.91 71.02 71.86 71.61 71.95 CTR dec | 59.94 68.77 69.95 71.00 71.84 71.55 71.95 sm4-aesni-avx ECB enc | 44.95 177.35 292.06 316.98 339.48 322.27 330.59 ECB dec | 45.28 178.66 292.31 317.52 339.59 322.52 331.16 CBC enc | 57.75 67.68 69.72 70.60 71.48 71.63 71.74 CBC dec | 44.32 176.83 284.32 307.24 328.61 312.61 325.82 CFB enc | 57.81 67.64 69.63 70.55 71.40 71.35 71.70 CFB dec | 43.14 167.78 282.03 307.20 328.35 318.24 325.95 CTR enc | 42.35 163.32 279.11 302.93 320.86 310.56 317.93 CTR dec | 42.39 162.81 278.49 302.37 321.11 310.33 318.37 sm4-aesni-avx2 ECB enc | 45.19 177.41 292.42 316.12 339.90 322.53 330.54 ECB dec | 44.83 178.90 291.45 317.31 339.85 322.55 331.07 CBC enc | 57.66 67.62 69.73 70.55 71.58 71.66 71.77 CBC dec | 44.34 176.86 286.10 501.68 559.58 483.87 527.46 CFB enc | 57.43 67.60 69.61 70.52 71.43 71.28 71.65 CFB dec | 43.12 167.75 268.09 499.33 558.35 490.36 524.73 CTR enc | 42.42 163.39 256.17 493.95 552.45 481.58 517.19 CTR dec | 42.49 163.11 256.36 493.34 552.62 481.49 516.83 Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
170 lines
4.6 KiB
C
170 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
/*
|
|
* SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
|
|
* as specified in
|
|
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
|
|
*
|
|
* Copyright (c) 2021, Alibaba Group.
|
|
* Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
|
|
*/
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/crypto.h>
|
|
#include <linux/kernel.h>
|
|
#include <asm/simd.h>
|
|
#include <crypto/internal/simd.h>
|
|
#include <crypto/internal/skcipher.h>
|
|
#include <crypto/sm4.h>
|
|
#include "sm4-avx.h"
|
|
|
|
#define SM4_CRYPT16_BLOCK_SIZE (SM4_BLOCK_SIZE * 16)
|
|
|
|
asmlinkage void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
|
|
const u8 *src, u8 *iv);
|
|
asmlinkage void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
|
|
const u8 *src, u8 *iv);
|
|
asmlinkage void sm4_aesni_avx2_cfb_dec_blk16(const u32 *rk, u8 *dst,
|
|
const u8 *src, u8 *iv);
|
|
|
|
static int sm4_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
|
|
unsigned int key_len)
|
|
{
|
|
struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
|
|
return sm4_expandkey(ctx, key, key_len);
|
|
}
|
|
|
|
static int cbc_decrypt(struct skcipher_request *req)
|
|
{
|
|
return sm4_avx_cbc_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
|
|
sm4_aesni_avx2_cbc_dec_blk16);
|
|
}
|
|
|
|
|
|
static int cfb_decrypt(struct skcipher_request *req)
|
|
{
|
|
return sm4_avx_cfb_decrypt(req, SM4_CRYPT16_BLOCK_SIZE,
|
|
sm4_aesni_avx2_cfb_dec_blk16);
|
|
}
|
|
|
|
static int ctr_crypt(struct skcipher_request *req)
|
|
{
|
|
return sm4_avx_ctr_crypt(req, SM4_CRYPT16_BLOCK_SIZE,
|
|
sm4_aesni_avx2_ctr_enc_blk16);
|
|
}
|
|
|
|
static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
|
|
{
|
|
.base = {
|
|
.cra_name = "__ecb(sm4)",
|
|
.cra_driver_name = "__ecb-sm4-aesni-avx2",
|
|
.cra_priority = 500,
|
|
.cra_flags = CRYPTO_ALG_INTERNAL,
|
|
.cra_blocksize = SM4_BLOCK_SIZE,
|
|
.cra_ctxsize = sizeof(struct sm4_ctx),
|
|
.cra_module = THIS_MODULE,
|
|
},
|
|
.min_keysize = SM4_KEY_SIZE,
|
|
.max_keysize = SM4_KEY_SIZE,
|
|
.walksize = 16 * SM4_BLOCK_SIZE,
|
|
.setkey = sm4_skcipher_setkey,
|
|
.encrypt = sm4_avx_ecb_encrypt,
|
|
.decrypt = sm4_avx_ecb_decrypt,
|
|
}, {
|
|
.base = {
|
|
.cra_name = "__cbc(sm4)",
|
|
.cra_driver_name = "__cbc-sm4-aesni-avx2",
|
|
.cra_priority = 500,
|
|
.cra_flags = CRYPTO_ALG_INTERNAL,
|
|
.cra_blocksize = SM4_BLOCK_SIZE,
|
|
.cra_ctxsize = sizeof(struct sm4_ctx),
|
|
.cra_module = THIS_MODULE,
|
|
},
|
|
.min_keysize = SM4_KEY_SIZE,
|
|
.max_keysize = SM4_KEY_SIZE,
|
|
.ivsize = SM4_BLOCK_SIZE,
|
|
.walksize = 16 * SM4_BLOCK_SIZE,
|
|
.setkey = sm4_skcipher_setkey,
|
|
.encrypt = sm4_cbc_encrypt,
|
|
.decrypt = cbc_decrypt,
|
|
}, {
|
|
.base = {
|
|
.cra_name = "__cfb(sm4)",
|
|
.cra_driver_name = "__cfb-sm4-aesni-avx2",
|
|
.cra_priority = 500,
|
|
.cra_flags = CRYPTO_ALG_INTERNAL,
|
|
.cra_blocksize = 1,
|
|
.cra_ctxsize = sizeof(struct sm4_ctx),
|
|
.cra_module = THIS_MODULE,
|
|
},
|
|
.min_keysize = SM4_KEY_SIZE,
|
|
.max_keysize = SM4_KEY_SIZE,
|
|
.ivsize = SM4_BLOCK_SIZE,
|
|
.chunksize = SM4_BLOCK_SIZE,
|
|
.walksize = 16 * SM4_BLOCK_SIZE,
|
|
.setkey = sm4_skcipher_setkey,
|
|
.encrypt = sm4_cfb_encrypt,
|
|
.decrypt = cfb_decrypt,
|
|
}, {
|
|
.base = {
|
|
.cra_name = "__ctr(sm4)",
|
|
.cra_driver_name = "__ctr-sm4-aesni-avx2",
|
|
.cra_priority = 500,
|
|
.cra_flags = CRYPTO_ALG_INTERNAL,
|
|
.cra_blocksize = 1,
|
|
.cra_ctxsize = sizeof(struct sm4_ctx),
|
|
.cra_module = THIS_MODULE,
|
|
},
|
|
.min_keysize = SM4_KEY_SIZE,
|
|
.max_keysize = SM4_KEY_SIZE,
|
|
.ivsize = SM4_BLOCK_SIZE,
|
|
.chunksize = SM4_BLOCK_SIZE,
|
|
.walksize = 16 * SM4_BLOCK_SIZE,
|
|
.setkey = sm4_skcipher_setkey,
|
|
.encrypt = ctr_crypt,
|
|
.decrypt = ctr_crypt,
|
|
}
|
|
};
|
|
|
|
static struct simd_skcipher_alg *
|
|
simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
|
|
|
|
static int __init sm4_init(void)
|
|
{
|
|
const char *feature_name;
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_AVX) ||
|
|
!boot_cpu_has(X86_FEATURE_AVX2) ||
|
|
!boot_cpu_has(X86_FEATURE_AES) ||
|
|
!boot_cpu_has(X86_FEATURE_OSXSAVE)) {
|
|
pr_info("AVX2 or AES-NI instructions are not detected.\n");
|
|
return -ENODEV;
|
|
}
|
|
|
|
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM,
|
|
&feature_name)) {
|
|
pr_info("CPU feature '%s' is not supported.\n", feature_name);
|
|
return -ENODEV;
|
|
}
|
|
|
|
return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
|
|
ARRAY_SIZE(sm4_aesni_avx2_skciphers),
|
|
simd_sm4_aesni_avx2_skciphers);
|
|
}
|
|
|
|
static void __exit sm4_exit(void)
|
|
{
|
|
simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
|
|
ARRAY_SIZE(sm4_aesni_avx2_skciphers),
|
|
simd_sm4_aesni_avx2_skciphers);
|
|
}
|
|
|
|
module_init(sm4_init);
|
|
module_exit(sm4_exit);
|
|
|
|
MODULE_LICENSE("GPL v2");
|
|
MODULE_AUTHOR("Tianjia Zhang <tianjia.zhang@linux.alibaba.com>");
|
|
MODULE_DESCRIPTION("SM4 Cipher Algorithm, AES-NI/AVX2 optimized");
|
|
MODULE_ALIAS_CRYPTO("sm4");
|
|
MODULE_ALIAS_CRYPTO("sm4-aesni-avx2");
|