// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018, Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
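
/*
 * xor_arm64_neon_2 - XOR @p2 into @p1, 64 bytes (four 128-bit NEON
 * vectors) per loop iteration.
 */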
void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}
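
/* Same as xor_arm64_neon_2(), but also folds in a third source buffer @p3. */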
void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}
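
/* Same as xor_arm64_neon_2(), but folds in two more source buffers @p3 and @p4. */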
void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}
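
/* Same as xor_arm64_neon_2(), but folds in three more source buffers @p3..@p5. */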
void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4,
	const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
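
/*
 * Template describing the NEON routines above; exported so the generic
 * RAID XOR code can select and use this implementation.
 */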
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name = "__inner_neon__",
	.do_2 = xor_arm64_neon_2,
	.do_3 = xor_arm64_neon_3,
	.do_4 = xor_arm64_neon_4,
	.do_5 = xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);
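
/*
 * Three-way XOR in a single instruction: EOR3 is part of the ARMv8.2 SHA3
 * extension. It is emitted via inline asm with a local .arch_extension
 * override, so the rest of the file can still be built for the baseline
 * architecture; the routines below are only wired up at runtime when the
 * CPU advertises SHA3.
 */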
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));

	return res;
}
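
/*
 * EOR3-based variants of the 3/4/5-source routines: each eor3() combines
 * two XOR operations into a single instruction.
 */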
static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}
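
/* Four-source version: EOR3 for p1/p2/p3, then a plain XOR of p4. */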
static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}
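
/* Five-source version: two EOR3 passes cover p2..p5. */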
static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
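
/*
 * At init time, upgrade the 3/4/5-source hooks to the EOR3 variants when the
 * toolchain was able to assemble them (CONFIG_AS_HAS_SHA3) and the CPU
 * advertises the SHA3 feature.
 */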
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");