2019-11-08 13:22:31 +01:00
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
# include <crypto/internal/blake2s.h>
# include <linux/types.h>
# include <linux/jump_label.h>
# include <linux/kernel.h>
2020-08-19 21:58:20 +10:00
# include <linux/sizes.h>
2019-11-08 13:22:31 +01:00
# include <asm/cpufeature.h>
# include <asm/fpu/api.h>
# include <asm/processor.h>
# include <asm/simd.h>
/*
 * Accelerated compression routines. They are declared asmlinkage, so they
 * are presumably implemented in a separate assembly file (not visible here).
 * Both take the same arguments as blake2s_compress().
 */
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
				       const u8 *block, const size_t nblocks,
				       const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
					const u8 *block, const size_t nblocks,
					const u32 inc);

/*
 * Static keys selecting the backend used by blake2s_compress(). Both start
 * out false and are switched on by blake2s_mod_init() when the CPU feature
 * checks pass.
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
2021-12-22 14:56:58 +01:00
/*
 * Compress @nblocks consecutive blocks of BLAKE2S_BLOCK_SIZE bytes, starting
 * at @block, into @state. @inc is handed through unchanged to whichever
 * backend ends up doing the work.
 *
 * The generic implementation is used when the SSSE3 static key is off or
 * may_use_simd() reports that SIMD cannot be used in the current context.
 * Otherwise the AVX-512 routine is preferred (when built with
 * CONFIG_AS_AVX512 and its static key is on), with SSSE3 as the fallback.
 */
void blake2s_compress(struct blake2s_state *state, const u8 *block,
		      size_t nblocks, const u32 inc)
{
	/* Compile-time check: a 4 KiB chunk must hold at least 8 blocks. */
	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);

	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
		blake2s_compress_generic(state, block, nblocks, inc);
		return;
	}

	/*
	 * kernel_fpu_begin() disables preemption, so never keep the FPU
	 * section open for more than one 4 KiB chunk. The figure is a
	 * back-of-the-envelope bound: at 5 cycles/byte (an overestimate) on
	 * a 1 GHz CPU, 4 KiB keeps preemption off for at most about 20us.
	 * The limit is explicitly 4 KiB (SZ_4K), not PAGE_SIZE.
	 */
	do {
		const size_t blocks = min_t(size_t, nblocks,
					    SZ_4K / BLAKE2S_BLOCK_SIZE);

		kernel_fpu_begin();
		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    static_branch_likely(&blake2s_use_avx512))
			blake2s_compress_avx512(state, block, blocks, inc);
		else
			blake2s_compress_ssse3(state, block, blocks, inc);
		kernel_fpu_end();

		/* Advance past the chunk that was just consumed. */
		nblocks -= blocks;
		block += blocks * BLAKE2S_BLOCK_SIZE;
	} while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress);
2019-11-08 13:22:31 +01:00
/*
 * Boot-time backend selection: probe the CPU once and flip the static keys
 * that blake2s_compress() branches on. Always returns 0.
 */
static int __init blake2s_mod_init(void)
{
	if (boot_cpu_has(X86_FEATURE_SSSE3))
		static_branch_enable(&blake2s_use_ssse3);

	/*
	 * The AVX-512 path is enabled only when it was built
	 * (CONFIG_AS_AVX512), the CPU reports AVX, AVX2, AVX512F and
	 * AVX512VL, and cpu_has_xfeatures() confirms the SSE, YMM and
	 * AVX-512 xfeatures.
	 */
	if (IS_ENABLED(CONFIG_AS_AVX512) &&
	    boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    boot_cpu_has(X86_FEATURE_AVX512F) &&
	    boot_cpu_has(X86_FEATURE_AVX512VL) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
			      XFEATURE_MASK_AVX512, NULL))
		static_branch_enable(&blake2s_use_avx512);

	return 0;
}

subsys_initcall(blake2s_mod_init);