linux/lib/raid6/recov_neon_inner.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Intel Corporation
 * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <arm_neon.h>

#ifdef CONFIG_ARM
/*
 * Stand-in for the 128-bit table lookup intrinsic vqtbl1q_u8(), which
 * AArch32 does not provide because it does not implement the underlying
 * instruction. Only the 64-bit wide vtbl.8 lookup exists there, so the
 * 16-byte table 'a' is reinterpreted as a pair of 8-byte halves and the
 * lookup is issued once for the low half and once for the high half of the
 * index vector 'b'; the two 8-byte results are then joined again.
 */
static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
{
	/* View the single 128-bit table as the two-register table vtbl2 wants. */
	union {
		uint8x16_t	val;
		uint8x8x2_t	pair;
	} tbl = { a };
	uint8x8_t lo = vtbl2_u8(tbl.pair, vget_low_u8(b));
	uint8x8_t hi = vtbl2_u8(tbl.pair, vget_high_u8(b));

	return vcombine_u8(lo, hi);
}
#endif

/*
 * Vectorised inner loop of the two-data-block recovery, handling 16 bytes
 * per iteration and updating dp/dq in place.
 *
 * pbmul and qmul each point to a 32-byte lookup table: entries 0..15 are
 * indexed by the low nibble of a byte, entries 16..31 by its high nibble,
 * and the two looked-up values are XORed together (presumably a GF(2^8)
 * multiplication by a constant, split per nibble - confirm against the
 * caller that supplies the tables).
 *
 * Per byte, the loop is equivalent to:
 *
 *	px    = *p ^ *dp;
 *	qv    = *q ^ *dq;
 *	qx    = qmul[qv & 0x0f] ^ qmul[16 + (qv >> 4)];
 *	db    = pbmul[px & 0x0f] ^ pbmul[16 + (px >> 4)] ^ qx;
 *	*dq++ = db;
 *	*dp++ = db ^ px;
 *	p++; q++;
 *
 * The loop only stops when the remaining count reaches exactly zero, so
 * bytes must be a non-negative multiple of 16.
 */
void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			      uint8_t *dq, const uint8_t *pbmul,
			      const uint8_t *qmul)
{
	const uint8x16_t pb_lo = vld1q_u8(pbmul);
	const uint8x16_t pb_hi = vld1q_u8(pbmul + 16);
	const uint8x16_t q_lo = vld1q_u8(qmul);
	const uint8x16_t q_hi = vld1q_u8(qmul + 16);
	const uint8x16_t nibble_mask = vdupq_n_u8(0x0f);

	for (; bytes; bytes -= 16, p += 16, q += 16, dp += 16, dq += 16) {
		uint8x16_t pdelta = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
		uint8x16_t qdelta = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
		uint8x16_t qprod, pprod, rec;

		/* qmul lookup: low-nibble entry XOR high-nibble entry */
		qprod = veorq_u8(vqtbl1q_u8(q_lo,
					    vandq_u8(qdelta, nibble_mask)),
				 vqtbl1q_u8(q_hi, vshrq_n_u8(qdelta, 4)));

		/* pbmul lookup, same split-table scheme */
		pprod = veorq_u8(vqtbl1q_u8(pb_lo,
					    vandq_u8(pdelta, nibble_mask)),
				 vqtbl1q_u8(pb_hi, vshrq_n_u8(pdelta, 4)));

		rec = veorq_u8(pprod, qprod);

		vst1q_u8(dq, rec);
		vst1q_u8(dp, veorq_u8(rec, pdelta));
	}
}

/*
 * Vectorised inner loop of the data+P recovery, handling 16 bytes per
 * iteration and updating dq and p in place.
 *
 * qmul points to a 32-byte lookup table: entries 0..15 are indexed by the
 * low nibble of a byte, entries 16..31 by its high nibble, and the two
 * looked-up values are XORed together (presumably a GF(2^8) multiplication
 * by a constant, split per nibble - confirm against the caller that
 * supplies the table).
 *
 * Per byte, the loop is equivalent to:
 *
 *	v     = *q ^ *dq;
 *	r     = qmul[v & 0x0f] ^ qmul[16 + (v >> 4)];
 *	*dq++ = r;
 *	*p++ ^= r;
 *	q++;
 *
 * The loop only stops when the remaining count reaches exactly zero, so
 * bytes must be a non-negative multiple of 16.
 */
void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			      const uint8_t *qmul)
{
	const uint8x16_t tbl_lo = vld1q_u8(qmul);
	const uint8x16_t tbl_hi = vld1q_u8(qmul + 16);
	const uint8x16_t nibble_mask = vdupq_n_u8(0x0f);

	for (; bytes; bytes -= 16, p += 16, q += 16, dq += 16) {
		uint8x16_t delta = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
		uint8x16_t rec, new_p;

		/* qmul lookup: low-nibble entry XOR high-nibble entry */
		rec = veorq_u8(vqtbl1q_u8(tbl_lo,
					  vandq_u8(delta, nibble_mask)),
			       vqtbl1q_u8(tbl_hi, vshrq_n_u8(delta, 4)));

		/* Read p before anything is written back, as the loads come first. */
		new_p = veorq_u8(rec, vld1q_u8(p));

		vst1q_u8(dq, rec);
		vst1q_u8(p, new_p);
	}
}
treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 441 Based on 1 normalized pattern(s): this program is free software you can redistribute it and or modify it under the terms of the gnu general public license as published by the free software foundation version 2 of the license extracted by the scancode license scanner the SPDX license identifier GPL-2.0-only has been chosen to replace the boilerplate/reference in 315 file(s). Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Allison Randal <allison@lohutok.net> Reviewed-by: Armijn Hemel <armijn@tjaldur.nl> Cc: linux-spdx@vger.kernel.org Link: https://lkml.kernel.org/r/20190531190115.503150771@linutronix.de Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2019-06-01 11:08:55 +03:00			`// SPDX-License-Identifier: GPL-2.0-only`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`/*`
			`* Copyright (C) 2012 Intel Corporation`
			`* Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>`
			`*/`

			`#include <arm_neon.h>`

			`#ifdef CONFIG_ARM`
			`/*`
			`* AArch32 does not provide this intrinsic natively because it does not`
			`* implement the underlying instruction. AArch32 only provides a 64-bit`
			`* wide vtbl.8 instruction, so use that instead.`
			`*/`
			`static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)`
			`{`
			`union {`
			`uint8x16_t val;`
			`uint8x8x2_t pair;`
			`} __a = { a };`

			`return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),`
			`vtbl2_u8(__a.pair, vget_high_u8(b)));`
			`}`
			`#endif`

			`void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,`
			`uint8_t *dq, const uint8_t *pbmul,`
			`const uint8_t *qmul)`
			`{`
			`uint8x16_t pm0 = vld1q_u8(pbmul);`
			`uint8x16_t pm1 = vld1q_u8(pbmul + 16);`
			`uint8x16_t qm0 = vld1q_u8(qmul);`
			`uint8x16_t qm1 = vld1q_u8(qmul + 16);`
lib/raid6: use vdupq_n_u8 to avoid endianness warnings Clang warns: vector initializers are not compatible with NEON intrinsics in big endian mode [-Wnonportable-vector-initialization] While this is usually the case, it's not an issue for this case since we're initializing the uint8x16_t (16x uint8_t's) with the same value. Instead, use vdupq_n_u8 which both compilers lower into a single movi instruction: https://godbolt.org/z/vBrgzt This avoids the static storage for a constant value. Link: https://github.com/ClangBuiltLinux/linux/issues/214 Suggested-by: Nathan Chancellor <natechancellor@gmail.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 07:03:42 +03:00			`uint8x16_t x0f = vdupq_n_u8(0x0f);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00
			`/*`
			`* while ( bytes-- ) {`
			`* uint8_t px, qx, db;`
			`*`
			`* px = *p ^ *dp;`
			`* qx = qmul[*q ^ *dq];`
			`* *dq++ = db = pbmul[px] ^ qx;`
			`* *dp++ = db ^ px;`
			`* p++; q++;`
			`* }`
			`*/`

			`while (bytes) {`
			`uint8x16_t vx, vy, px, qx, db;`

			`px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));`
			`vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));`

lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vshrq_n_u8(vx, 4);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));`
lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vqtbl1q_u8(qm1, vy);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`qx = veorq_u8(vx, vy);`

lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vshrq_n_u8(px, 4);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));`
lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vqtbl1q_u8(pm1, vy);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`vx = veorq_u8(vx, vy);`
			`db = veorq_u8(vx, qx);`

			`vst1q_u8(dq, db);`
			`vst1q_u8(dp, veorq_u8(db, px));`

			`bytes -= 16;`
			`p += 16;`
			`q += 16;`
			`dp += 16;`
			`dq += 16;`
			`}`
			`}`

			`void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,`
			`const uint8_t *qmul)`
			`{`
			`uint8x16_t qm0 = vld1q_u8(qmul);`
			`uint8x16_t qm1 = vld1q_u8(qmul + 16);`
lib/raid6: use vdupq_n_u8 to avoid endianness warnings Clang warns: vector initializers are not compatible with NEON intrinsics in big endian mode [-Wnonportable-vector-initialization] While this is usually the case, it's not an issue for this case since we're initializing the uint8x16_t (16x uint8_t's) with the same value. Instead, use vdupq_n_u8 which both compilers lower into a single movi instruction: https://godbolt.org/z/vBrgzt This avoids the static storage for a constant value. Link: https://github.com/ClangBuiltLinux/linux/issues/214 Suggested-by: Nathan Chancellor <natechancellor@gmail.com> Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 07:03:42 +03:00			`uint8x16_t x0f = vdupq_n_u8(0x0f);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00
			`/*`
			`* while (bytes--) {`
			`* *p++ ^= *dq = qmul[*q ^ *dq];`
			`* q++; dq++;`
			`* }`
			`*/`

			`while (bytes) {`
			`uint8x16_t vx, vy;`

			`vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));`

lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vshrq_n_u8(vx, 4);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));`
lib/raid6: arm: optimize away a mask operation in NEON recovery routine The NEON recovery code was modeled after the x86 SIMD code, and for some reason, that code uses a 16 bit wide signed shift and a mask to perform what amounts to a 8 bit unsigned shift. So fold the ops together. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2019-02-26 14:36:18 +03:00			`vy = vqtbl1q_u8(qm1, vy);`
md/raid6: implement recovery using ARM NEON intrinsics Provide a NEON accelerated implementation of the recovery algorithm, which supersedes the default byte-by-byte one. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com> 2017-07-13 20:16:01 +03:00			`vx = veorq_u8(vx, vy);`
			`vy = veorq_u8(vx, vld1q_u8(p));`

			`vst1q_u8(dq, vx);`
			`vst1q_u8(p, vy);`

			`bytes -= 16;`
			`p += 16;`
			`q += 16;`
			`dq += 16;`
			`}`
			`}`