2019-11-08 13:22:25 +01:00
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL / Cryptogams accelerated Poly1305 transform for ARM
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
# include <asm/hwcap.h>
# include <asm/neon.h>
# include <asm/simd.h>
# include <asm/unaligned.h>
# include <crypto/algapi.h>
# include <crypto/internal/hash.h>
# include <crypto/internal/poly1305.h>
# include <crypto/internal/simd.h>
# include <linux/cpufeature.h>
# include <linux/crypto.h>
# include <linux/jump_label.h>
# include <linux/module.h>
/*
 * Scalar Poly1305 primitives. They are only declared here; every caller in
 * this file passes &dctx->h as @state.
 * NOTE(review): presumably implemented by the accompanying OpenSSL/Cryptogams
 * assembly -- confirm against the build.
 */
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
2019-11-08 13:22:25 +01:00
/*
 * Weak, empty stand-in for the NEON block routine: it does nothing.
 * NOTE(review): presumably overridden at link time by a non-weak NEON
 * implementation when one is built -- confirm.
 */
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len,
				 u32 hibit)
{
}
/*
 * Static key, false by default. arm_poly1305_mod_init() enables it when
 * CONFIG_KERNEL_MODE_NEON is set and elf_hwcap reports HWCAP_NEON.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void poly1305_init_arch ( struct poly1305_desc_ctx * dctx , const u8 * key )
{
poly1305_init_arm ( & dctx - > h , key ) ;
dctx - > s [ 0 ] = get_unaligned_le32 ( key + 16 ) ;
dctx - > s [ 1 ] = get_unaligned_le32 ( key + 20 ) ;
dctx - > s [ 2 ] = get_unaligned_le32 ( key + 24 ) ;
dctx - > s [ 3 ] = get_unaligned_le32 ( key + 28 ) ;
dctx - > buflen = 0 ;
}
EXPORT_SYMBOL ( poly1305_init_arch ) ;
static int arm_poly1305_init ( struct shash_desc * desc )
{
struct poly1305_desc_ctx * dctx = shash_desc_ctx ( desc ) ;
dctx - > buflen = 0 ;
dctx - > rset = 0 ;
dctx - > sset = false ;
return 0 ;
}
/*
 * arm_poly1305_blocks - consume key blocks if needed, then hash whole blocks
 * @dctx:    descriptor context
 * @src:     input bytes
 * @len:     input length; callers in this file always pass at least
 *           POLY1305_BLOCK_SIZE
 * @hibit:   forwarded unchanged to the block routine
 * @do_neon: caller allows the NEON routine; it is only used when the
 *           have_neon static key is enabled as well
 *
 * While dctx->sset is false the leading input is treated as key material:
 * the first block initialises dctx->h through poly1305_init_arm() (tracked by
 * dctx->rset), the following block is loaded into dctx->s. The remaining
 * length is rounded down to a multiple of POLY1305_BLOCK_SIZE and hashed.
 */
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				u32 len, u32 hibit, bool do_neon)
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			/* First key block. */
			poly1305_init_arm(&dctx->h, src);
			dctx->rset = 1;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		if (len >= POLY1305_BLOCK_SIZE) {
			int i;

			/* Second key block: four little-endian words. */
			for (i = 0; i < 4; i++)
				dctx->s[i] = get_unaligned_le32(src + 4 * i);
			dctx->sset = true;
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
		}

		/* No full data block is left after the key material. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	/* Only whole blocks are processed; a trailing partial block is ignored. */
	len = round_down(len, POLY1305_BLOCK_SIZE);

	if (static_branch_likely(&have_neon) && likely(do_neon)) {
		poly1305_blocks_neon(&dctx->h, src, len, hibit);
		return;
	}

	poly1305_blocks_arm(&dctx->h, src, len, hibit);
}
/*
 * arm_poly1305_do_update - common update path for both shash variants
 * @dctx:    descriptor context
 * @src:     input bytes
 * @len:     number of input bytes
 * @do_neon: forwarded to arm_poly1305_blocks() for the bulk of the data
 *
 * Works in three steps: top up a previously buffered partial block and
 * process it once full (always with do_neon == false), process all whole
 * blocks straight from @src, then stash any remaining tail in dctx->buf.
 */
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
				   const u8 *src, u32 len, bool do_neon)
{
	if (unlikely(dctx->buflen)) {
		u32 room = POLY1305_BLOCK_SIZE - dctx->buflen;
		u32 take = min(len, room);

		memcpy(&dctx->buf[dctx->buflen], src, take);
		dctx->buflen += take;
		src += take;
		len -= take;

		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
			arm_poly1305_blocks(dctx, dctx->buf,
					    POLY1305_BLOCK_SIZE, 1, false);
			dctx->buflen = 0;
		}
	}

	if (likely(len >= POLY1305_BLOCK_SIZE)) {
		u32 whole = round_down(len, POLY1305_BLOCK_SIZE);

		arm_poly1305_blocks(dctx, src, len, 1, do_neon);
		src += whole;
		len -= whole;
	}

	if (unlikely(len)) {
		/* Keep the sub-block tail for the next update or final. */
		memcpy(dctx->buf, src, len);
		dctx->buflen = len;
	}
}
/*
 * shash .update hook for the scalar driver: feeds @srclen bytes from @src
 * into the state with NEON disallowed. Always returns 0.
 */
static int arm_poly1305_update(struct shash_desc *desc,
			       const u8 *src, unsigned int srclen)
{
	arm_poly1305_do_update(shash_desc_ctx(desc), src, srclen, false);

	return 0;
}
/*
 * shash .update hook for the NEON driver. NEON is requested only when
 * crypto_simd_usable() holds and the input is longer than 128 bytes; the
 * kernel_neon_begin()/kernel_neon_end() pair is taken only if the have_neon
 * static key is enabled too. Always returns 0.
 */
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
						   const u8 *src,
						   unsigned int srclen)
{
	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);
	bool do_neon = crypto_simd_usable() && srclen > 128;

	if (static_branch_likely(&have_neon) && do_neon) {
		kernel_neon_begin();
		arm_poly1305_do_update(ctx, src, srclen, true);
		kernel_neon_end();
	} else {
		arm_poly1305_do_update(ctx, src, srclen, do_neon);
	}

	return 0;
}
void poly1305_update_arch ( struct poly1305_desc_ctx * dctx , const u8 * src ,
unsigned int nbytes )
{
bool do_neon = IS_ENABLED ( CONFIG_KERNEL_MODE_NEON ) & &
crypto_simd_usable ( ) ;
if ( unlikely ( dctx - > buflen ) ) {
u32 bytes = min ( nbytes , POLY1305_BLOCK_SIZE - dctx - > buflen ) ;
memcpy ( dctx - > buf + dctx - > buflen , src , bytes ) ;
src + = bytes ;
nbytes - = bytes ;
dctx - > buflen + = bytes ;
if ( dctx - > buflen = = POLY1305_BLOCK_SIZE ) {
poly1305_blocks_arm ( & dctx - > h , dctx - > buf ,
POLY1305_BLOCK_SIZE , 1 ) ;
dctx - > buflen = 0 ;
}
}
if ( likely ( nbytes > = POLY1305_BLOCK_SIZE ) ) {
unsigned int len = round_down ( nbytes , POLY1305_BLOCK_SIZE ) ;
if ( static_branch_likely ( & have_neon ) & & do_neon ) {
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 17:18:53 -06:00
do {
unsigned int todo = min_t ( unsigned int , len , SZ_4K ) ;
kernel_neon_begin ( ) ;
poly1305_blocks_neon ( & dctx - > h , src , todo , 1 ) ;
kernel_neon_end ( ) ;
len - = todo ;
src + = todo ;
} while ( len ) ;
2019-11-08 13:22:25 +01:00
} else {
poly1305_blocks_arm ( & dctx - > h , src , len , 1 ) ;
crypto: arch/lib - limit simd usage to 4k chunks
The initial Zinc patchset, after some mailing list discussion, contained
code to ensure that kernel_fpu_enable would not be kept on for more than
a 4k chunk, since it disables preemption. The choice of 4k isn't totally
scientific, but it's not a bad guess either, and it's what's used in
both the x86 poly1305, blake2s, and nhpoly1305 code already (in the form
of PAGE_SIZE, which this commit corrects to be explicitly 4k for the
former two).
Ard did some back of the envelope calculations and found that
at 5 cycles/byte (overestimate) on a 1ghz processor (pretty slow), 4k
means we have a maximum preemption disabling of 20us, which Sebastian
confirmed was probably a good limit.
Unfortunately the chunking appears to have been left out of the final
patchset that added the glue code. So, this commit adds it back in.
Fixes: 84e03fa39fbe ("crypto: x86/chacha - expose SIMD ChaCha routine as library function")
Fixes: b3aad5bad26a ("crypto: arm64/chacha - expose arm64 ChaCha routine as library function")
Fixes: a44a3430d71b ("crypto: arm/chacha - expose ARM ChaCha routine as library function")
Fixes: d7d7b8535662 ("crypto: x86/poly1305 - wire up faster implementations for kernel")
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: a6b803b3ddc7 ("crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation")
Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: Eric Biggers <ebiggers@google.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2020-04-22 17:18:53 -06:00
src + = len ;
2019-11-08 13:22:25 +01:00
}
nbytes % = POLY1305_BLOCK_SIZE ;
}
if ( unlikely ( nbytes ) ) {
dctx - > buflen = nbytes ;
memcpy ( dctx - > buf , src , nbytes ) ;
}
}
EXPORT_SYMBOL ( poly1305_update_arch ) ;
/*
 * poly1305_final_arch - finish the computation and write the digest
 * @dctx: descriptor context; cleared to all-zero before returning
 * @dst:  output buffer passed to poly1305_emit_arm() as the digest
 *
 * A buffered partial block is terminated with a single 1 byte, zero-filled
 * up to POLY1305_BLOCK_SIZE and hashed with hibit == 0. The digest is then
 * emitted using dctx->s as the nonce argument.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	unsigned int used = dctx->buflen;

	if (unlikely(used)) {
		dctx->buf[used++] = 1;
		memset(&dctx->buf[used], 0, POLY1305_BLOCK_SIZE - used);
		dctx->buflen = used;
		poly1305_blocks_arm(&dctx->h, dctx->buf,
				    POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_arm(&dctx->h, dst, dctx->s);

	/* Wipe the whole context, key-derived state included. */
	*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
/*
 * shash .final hook: returns -ENOKEY if the key was never completely
 * supplied (dctx->sset still false), otherwise finalises through
 * poly1305_final_arch() and returns 0.
 */
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *ctx = shash_desc_ctx(desc);

	if (likely(ctx->sset)) {
		poly1305_final_arch(ctx, dst);
		return 0;
	}

	return -ENOKEY;
}
/*
 * shash algorithms provided by this module.
 *
 * Entry 0 is the scalar driver and is always present. Entry 1 (NEON, higher
 * priority) exists only when CONFIG_KERNEL_MODE_NEON is set; the module init
 * code registers it only if the CPU actually reports NEON.
 *
 * The name strings must not carry surrounding whitespace: cra_name and
 * cra_driver_name are the exact strings users look the algorithm up by.
 */
static struct shash_alg arm_poly1305_algs[] = {{
	.init			= arm_poly1305_init,
	.update			= arm_poly1305_update,
	.final			= arm_poly1305_final,
	.digestsize		= POLY1305_DIGEST_SIZE,
	.descsize		= sizeof(struct poly1305_desc_ctx),

	.base.cra_name		= "poly1305",
	.base.cra_driver_name	= "poly1305-arm",
	.base.cra_priority	= 150,
	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
	.base.cra_module	= THIS_MODULE,
#ifdef CONFIG_KERNEL_MODE_NEON
}, {
	.init			= arm_poly1305_init,
	.update			= arm_poly1305_update_neon,
	.final			= arm_poly1305_final,
	.digestsize		= POLY1305_DIGEST_SIZE,
	.descsize		= sizeof(struct poly1305_desc_ctx),

	.base.cra_name		= "poly1305",
	.base.cra_driver_name	= "poly1305-neon",
	.base.cra_priority	= 200,
	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
	.base.cra_module	= THIS_MODULE,
#endif
}};
/*
 * Module init: enable the have_neon static key when kernel-mode NEON is
 * configured and the CPU reports HWCAP_NEON, then register the shash
 * algorithms if CONFIG_CRYPTO_HASH is reachable -- all entries with NEON,
 * only the first (scalar) entry without. Returns 0 when nothing needs to be
 * registered, otherwise the registration result.
 */
static int __init arm_poly1305_mod_init(void)
{
	bool neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		    (elf_hwcap & HWCAP_NEON);

	if (neon)
		static_branch_enable(&have_neon);

	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return 0;

	if (!neon) {
		/* register only the first entry */
		return crypto_register_shash(&arm_poly1305_algs[0]);
	}

	return crypto_register_shashes(arm_poly1305_algs,
				       ARRAY_SIZE(arm_poly1305_algs));
}
/*
 * Module exit: undo whatever arm_poly1305_mod_init() registered. Nothing was
 * registered if CONFIG_CRYPTO_HASH is not reachable; otherwise all entries
 * were registered when have_neon is enabled, and only the first one if not.
 */
static void __exit arm_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;

	if (static_branch_likely(&have_neon))
		crypto_unregister_shashes(arm_poly1305_algs,
					  ARRAY_SIZE(arm_poly1305_algs));
	else
		crypto_unregister_shash(&arm_poly1305_algs[0]);
}
module_init(arm_poly1305_mod_init);
module_exit(arm_poly1305_mod_exit);

/*
 * Module metadata. The strings must be exact: the licence identifier is
 * compared literally by the module loader, and the crypto aliases must match
 * the algorithm and driver names used for lookup, so no padding whitespace.
 */
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-arm");
MODULE_ALIAS_CRYPTO("poly1305-neon");