/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// This file is dual-licensed, meaning that you can use it under your
// choice of either of the following two licenses:
//
// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You can obtain
// a copy in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// or
//
// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
// Copyright 2024 Google LLC
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// The generated code of this file depends on the following RISC-V extensions:
// - RV64I
// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
// - RISC-V Vector AES block cipher extension ('Zvkned')
// - RISC-V Vector Bit-manipulation extension ('Zvbb')
// - RISC-V Vector GCM/GMAC extension ('Zvkg')

#include <linux/linkage.h>

.text
.option arch, +zvkned, +zvbb, +zvkg

#include "aes-macros.S"

#define KEYP		a0
#define INP		a1
#define OUTP		a2
#define LEN		a3
#define TWEAKP		a4

#define LEN32		a5
#define TAIL_LEN	a6
#define VL		a7
#define VLMAX		t4
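
// Note: KEYP through TWEAKP (a0-a4) simply mirror the five function arguments
// per the RISC-V calling convention (see the prototype comment at the bottom
// of this file); LEN32, TAIL_LEN, VL, and VLMAX hold values derived from them.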

// v1-v15 contain the AES round keys, but they are used for temporaries before
// the AES round keys have been loaded.
#define TWEAKS		v16	// LMUL=4 (most of the time)
#define TWEAKS_BREV	v20	// LMUL=4 (most of the time)
#define MULTS_BREV	v24	// LMUL=4 (most of the time)
#define TMP0		v28
#define TMP1		v29
#define TMP2		v30
#define TMP3		v31

// xts_init initializes the following values:
//
//	TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
//	TWEAKS_BREV: same as TWEAKS, but bit-reversed
//	MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
//
// N is the maximum number of blocks that will be processed per loop iteration,
// computed using vsetvli.
//
// The field convention used by XTS is the same as that of GHASH, but with the
// bits reversed within each byte.  The zvkg extension provides the vgmul
// instruction which does multiplication in this field.  Therefore, for tweak
// computation we use vgmul to do multiplications in parallel, instead of
// serially multiplying by x using shifting+xoring.  Note that for this to work,
// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
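//
// For example, with N = 4 and first tweak T, xts_init produces
// TWEAKS = [T, T*x, T*x^2, T*x^3], and MULTS_BREV holds x^4 (bit-reversed),
// so a single vgmul per loop iteration advances all four tweaks at once.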
.macro	xts_init

	// Load the first tweak T.
	vsetivli	zero, 4, e32, m1, ta, ma
	vle32.v		TWEAKS, (TWEAKP)

	// If there's only one block (or no blocks at all), then skip the tweak
	// sequence computation because (at most) T itself is needed.
	li		t0, 16
	ble		LEN, t0, .Linit_single_block\@

	// Save a copy of T bit-reversed in v12.
	vbrev8.v	v12, TWEAKS

	//
	// Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
	// that N <= 128.  Though, this code actually requires N < 64 (or
	// equivalently VLEN < 2048) due to the use of 64-bit intermediate
	// values here and in the x^N computation later.
	//
	vsetvli		VL, LEN32, e32, m4, ta, ma
	srli		t0, VL, 2	// t0 = N (num blocks)
	// Generate two sequences, each with N 32-bit values:
	// v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
	vsetvli		zero, t0, e32, m1, ta, ma
	vmv.v.i		v0, 1
	vid.v		v1
	// Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
	// as two sequences, each with 2*N 32-bit values:
	// v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
	vsetvli		zero, t0, e64, m2, ta, ma
	vzext.vf2	v2, v0
	vzext.vf2	v4, v1
	slli		t1, t0, 1	// t1 = 2*N
	vsetvli		zero, t1, e32, m2, ta, ma
	// Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
	// widening to 64 bits per element.  When reinterpreted as N 128-bit
	// values, this is the needed sequence of 128-bit values 1 << i (x^i).
	vwsll.vv	v8, v2, v4
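	// For example, with N = 2: v0=[1, 1] and v1=[0, 1], so v2=[1, 0, 1, 0]
	// and v4=[0, 0, 1, 0] as 32-bit elements; vwsll then yields the 64-bit
	// elements [1<<0, 0, 1<<1, 0], i.e. the 128-bit values 1 and 2, which
	// are exactly x^0 and x^1.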

	// Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
	// multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		TWEAKS_BREV, 0
	vaesz.vs	TWEAKS_BREV, v12
	vbrev8.v	v8, v8
	vgmul.vv	TWEAKS_BREV, v8

	// Save a copy of the sequence T*(x^i) with the bit reversal undone.
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
	li		t1, 1
	sll		t1, t1, t0	// t1 = 1 << N
	vsetivli	zero, 2, e64, m1, ta, ma
	vmv.v.i		v0, 0
	vsetivli	zero, 1, e64, m1, tu, ma
	vmv.v.x		v0, t1
	vbrev8.v	v0, v0
	vsetvli		zero, LEN32, e32, m4, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vaesz.vs	MULTS_BREV, v0
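	// (For example, with N = 4: t1 = 0x10, so v0 holds the 128-bit value
	// 0x10; vbrev8 turns its low byte 0b00010000 into 0b00001000, giving
	// the bit-reversed representation of x^4.)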

	j		.Linit_done\@

.Linit_single_block\@:
	vbrev8.v	TWEAKS_BREV, TWEAKS
.Linit_done\@:
.endm

// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
// the multiplier required to advance the tweak by one.
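// (x is the polynomial with only the x^1 coefficient set, i.e. the byte 0x02
// in the lowest byte; reversing the bits within that byte gives 0x40.)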
.macro	load_x
	li		t0, 0x40
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.i		MULTS_BREV, 0
	vsetivli	zero, 1, e8, m1, tu, ma
	vmv.v.x		MULTS_BREV, t0
.endm

.macro	__aes_xts_crypt	enc, keylen
	// With 16 < len <= 31, there's no main loop, just ciphertext stealing.
	beqz		LEN32, .Lcts_without_main_loop\@

	vsetvli		VLMAX, zero, e32, m4, ta, ma
1:
	vsetvli		VL, LEN32, e32, m4, ta, ma
2:
	// Encrypt or decrypt VL/4 blocks.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TWEAKS
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TWEAKS
	vse32.v		TMP0, (OUTP)
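	// (This is the XTS per-block formula C = AES(P ^ T) ^ T, applied to
	// VL/4 blocks at once, each with its own tweak from TWEAKS.)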

	// Update the pointers and the remaining length.
	slli		t0, VL, 2
	add		INP, INP, t0
	add		OUTP, OUTP, t0
	sub		LEN32, LEN32, VL

	// Check whether more blocks remain.
	beqz		LEN32, .Lmain_loop_done\@

	// Compute the next sequence of tweaks by multiplying the previous
	// sequence by x^N.  Store the result in both bit-reversed order and
	// regular order (i.e. with the bit reversal undone).
	vgmul.vv	TWEAKS_BREV, MULTS_BREV
	vbrev8.v	TWEAKS, TWEAKS_BREV

	// Since we compute the tweak multipliers x^N in advance, we require
	// that each iteration process the same length except possibly the last.
	// This conflicts slightly with the behavior allowed by the RISC-V
	// Vector Extension, where CPUs can select a lower length for both of
	// the last two iterations.  E.g., vl might take the sequence of values
	// [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
	// can use x^4 again instead of computing x^3.  Therefore, we explicitly
	// keep the vl at VLMAX if there is at least VLMAX remaining.
	bge		LEN32, VLMAX, 2b
	j		1b

.Lmain_loop_done\@:
	load_x

	// Compute the next tweak.
	addi		t0, VL, -4
	vsetivli	zero, 4, e32, m4, ta, ma
	vslidedown.vx	TWEAKS_BREV, TWEAKS_BREV, t0	// Extract last tweak
	vsetivli	zero, 4, e32, m1, ta, ma
	vgmul.vv	TWEAKS_BREV, MULTS_BREV		// Advance to next tweak

	bnez		TAIL_LEN, .Lcts\@

	// Update *TWEAKP to contain the next tweak.
	vbrev8.v	TWEAKS, TWEAKS_BREV
	vse32.v		TWEAKS, (TWEAKP)
	ret

.Lcts_without_main_loop\@:
	load_x
.Lcts\@:
	// TWEAKS_BREV now contains the next tweak.  Compute the one after that.
	vsetivli	zero, 4, e32, m1, ta, ma
	vmv.v.v		TMP0, TWEAKS_BREV
	vgmul.vv	TMP0, MULTS_BREV
	// Undo the bit reversal of the next two tweaks and store them in TMP1
	// and TMP2, such that TMP1 is the first needed and TMP2 the second.
.if \enc
	vbrev8.v	TMP1, TWEAKS_BREV
	vbrev8.v	TMP2, TMP0
.else
	vbrev8.v	TMP1, TMP0
	vbrev8.v	TMP2, TWEAKS_BREV
.endif

	// Encrypt/decrypt the last full block.
	vle32.v		TMP0, (INP)
	vxor.vv		TMP0, TMP0, TMP1
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP1

	// Swap the first TAIL_LEN bytes of the above result with the tail.
	// Note that to support in-place encryption/decryption, the load from
	// the input tail must happen before the store to the output tail.
	addi		t0, INP, 16
	addi		t1, OUTP, 16
	vmv.v.v		TMP3, TMP0
	vsetvli		zero, TAIL_LEN, e8, m1, tu, ma
	vle8.v		TMP0, (t0)
	vse8.v		TMP3, (t1)
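	// (The partial output block is thus the first TAIL_LEN bytes of the
	// first pass's result, and the remaining bytes stolen from that result
	// complete the block that is encrypted/decrypted again below, per the
	// XTS ciphertext stealing construction.)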

	// Encrypt/decrypt again and store the last full block.
	vsetivli	zero, 4, e32, m1, ta, ma
	vxor.vv		TMP0, TMP0, TMP2
	aes_crypt	TMP0, \enc, \keylen
	vxor.vv		TMP0, TMP0, TMP2
	vse32.v		TMP0, (OUTP)

	ret
.endm

.macro	aes_xts_crypt	enc

	// Check whether the length is a multiple of the AES block size.
	andi		TAIL_LEN, LEN, 15
	beqz		TAIL_LEN, 1f

	// The length isn't a multiple of the AES block size, so ciphertext
	// stealing will be required.  Ciphertext stealing involves special
	// handling of the partial block and the last full block, so subtract
	// the length of both from the length to be processed in the main loop.
	sub		LEN, LEN, TAIL_LEN
	addi		LEN, LEN, -16
1:
	srli		LEN32, LEN, 2
	// LEN and LEN32 now contain the total length of the blocks that will be
	// processed in the main loop, in bytes and 32-bit words respectively.
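	// (For example, len = 100 gives TAIL_LEN = 4; the main loop then
	// processes LEN = 100 - 4 - 16 = 80 bytes, i.e. LEN32 = 20 words,
	// leaving one full block plus the 4-byte tail for ciphertext stealing.)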

	xts_init
	aes_begin	KEYP, 128f, 192f
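	// aes_begin (from aes-macros.S) loads the AES round keys and branches
	// to 128f or 192f for AES-128 and AES-192 keys; AES-256 falls through.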
	__aes_xts_crypt	\enc, 256
128:
	__aes_xts_crypt	\enc, 128
192:
	__aes_xts_crypt	\enc, 192
.endm

// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
//					  const u8 *in, u8 *out, size_t len,
//					  u8 tweak[16]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
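//
// An illustrative (hypothetical) call, assuming |ctx| holds the expanded data
// key and |tweak| was produced by encrypting the IV with the tweak key:
//
//	aes_xts_encrypt_zvkned_zvbb_zvkg(ctx, src, dst, len, tweak);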
SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)

// Same prototype and calling convention as the encryption function
SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
	aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)