From 3a6dd5f614a13033a47eaf439ac34e7b6fbc7705 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Sun, 21 Jan 2024 06:33:11 +0900
Subject: [PATCH 001/496] riscv: remove unneeded #include

Commit 62694797f56b ("use linux/export.h rather than asm-generic/export.h")
replaced deprecated inclusions.

Commit c2a658d41924 ("riscv: lib: vectorize copy_to_user/copy_from_user")
introduced a new instance of #include .

arch/riscv/lib/uaccess_vector.S does not use EXPORT_SYMBOL, hence this
include directive is unneeded.

Signed-off-by: Masahiro Yamada
Link: https://lore.kernel.org/r/20240120213312.3033528-1-masahiroy@kernel.org
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/lib/uaccess_vector.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S
index 51ab5588e9ff..7c45f26de4f7 100644
--- a/arch/riscv/lib/uaccess_vector.S
+++ b/arch/riscv/lib/uaccess_vector.S
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #include
-#include
 #include
 #include
 #include

From bf3159c0ef1fd2bc999e3a4910b1c610949ae333 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano
Date: Wed, 27 Dec 2023 15:35:46 +0100
Subject: [PATCH 002/496] clocksource/drivers/imx: Fix -Wunused-but-set-variable warning

All warnings (new ones prefixed by >>):

   drivers/clocksource/timer-imx-gpt.c: In function 'mxc_timer_interrupt':
>> drivers/clocksource/timer-imx-gpt.c:279:18: warning: variable 'tstat' set but not used [-Wunused-but-set-variable]
     279 |	uint32_t tstat;
         |	         ^~~~~

vim +/tstat +279 drivers/clocksource/timer-imx-gpt.c

The change removes the tstat assignment but not the read of the register,
since the register may be ROR (Reset On Read), as is the case for the
driver's interrupt registers.

Fixes: df181e382816 ("clocksource/drivers/imx-gpt: Add support for ARM64")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202312231803.XzPddRa5-lkp@intel.com/
Signed-off-by: Daniel Lezcano
Link: https://lore.kernel.org/r/20231227143546.2823683-1-daniel.lezcano@linaro.org
---
 drivers/clocksource/timer-imx-gpt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/clocksource/timer-imx-gpt.c b/drivers/clocksource/timer-imx-gpt.c
index 6a878d227a13..489e69169ed4 100644
--- a/drivers/clocksource/timer-imx-gpt.c
+++ b/drivers/clocksource/timer-imx-gpt.c
@@ -258,9 +258,8 @@ static irqreturn_t mxc_timer_interrupt(int irq, void *dev_id)
 {
 	struct clock_event_device *ced = dev_id;
 	struct imx_timer *imxtm = to_imx_timer(ced);
-	uint32_t tstat;
-	tstat = readl_relaxed(imxtm->base + imxtm->gpt->reg_tstat);
+	readl_relaxed(imxtm->base + imxtm->gpt->reg_tstat);
 	imxtm->gpt->gpt_irq_acknowledge(imxtm);

From f253c9a1aa3360bd6d61407dbfc6ca002855caa3 Mon Sep 17 00:00:00 2001
From: Peter Griffin
Date: Fri, 22 Dec 2023 16:53:53 +0000
Subject: [PATCH 003/496] dt-bindings: timer: exynos4210-mct: Add google,gs101-mct compatible

Add dedicated google,gs101-mct compatible to the dt-schema for representing
the mct timer of the Google Tensor gs101 SoC.
Signed-off-by: Peter Griffin Reviewed-by: Krzysztof Kozlowski Reviewed-by: Sam Protsenko Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231222165355.1462740-2-peter.griffin@linaro.org --- .../devicetree/bindings/timer/samsung,exynos4210-mct.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml index 829bd2227f7c..774b7992a0ca 100644 --- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml +++ b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml @@ -26,6 +26,7 @@ properties: - items: - enum: - axis,artpec8-mct + - google,gs101-mct - samsung,exynos3250-mct - samsung,exynos5250-mct - samsung,exynos5260-mct @@ -127,6 +128,7 @@ allOf: contains: enum: - axis,artpec8-mct + - google,gs101-mct - samsung,exynos5260-mct - samsung,exynos5420-mct - samsung,exynos5433-mct From 906fed29f4527e60db30d4eaa4b5b06a92447c69 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Jan 2024 09:36:15 -0800 Subject: [PATCH 004/496] clocksource/drivers/stm32: Fix all kernel-doc warnings Add a "Returns:" section in one function description. Use the correct function name in another function description. These changes prevent 2 warnings: timer-stm32.c:79: warning: No description found for return value of 'stm32_timer_of_bits_get' timer-stm32.c:189: warning: expecting prototype for stm32_timer_width(). Prototype was for stm32_timer_set_width() instead Signed-off-by: Randy Dunlap Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Fabrice Gasnier Cc: linux-stm32@st-md-mailman.stormreply.com Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240120173615.14618-1-rdunlap@infradead.org --- drivers/clocksource/timer-stm32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c index c9a753f96ba1..0a4ea3288bfb 100644 --- a/drivers/clocksource/timer-stm32.c +++ b/drivers/clocksource/timer-stm32.c @@ -73,7 +73,7 @@ static void stm32_timer_of_bits_set(struct timer_of *to, int bits) * Accessor helper to get the number of bits in the timer-of private * structure. * - * Returns an integer corresponding to the number of bits. + * Returns: an integer corresponding to the number of bits. */ static int stm32_timer_of_bits_get(struct timer_of *to) { @@ -177,7 +177,7 @@ static irqreturn_t stm32_clock_event_handler(int irq, void *dev_id) } /** - * stm32_timer_width - Sort out the timer width (32/16) + * stm32_timer_set_width - Sort out the timer width (32/16) * @to: a pointer to a timer-of structure * * Write the 32-bit max value and read/return the result. If the timer From 702107ed5d89e50499aa1d65fe499a028073b25e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 20 Jan 2024 09:36:24 -0800 Subject: [PATCH 005/496] clocksource/drivers/ti-32K: Fix misuse of "/**" comment Change "/**" to a common "/*" comment in a non-kernel-doc comment to avoid a kernel-doc warning: timer-ti-32k.c:42: warning: expecting prototype for timer(). 
Prototype was for OMAP2_32KSYNCNT_REV_OFF() instead Signed-off-by: Randy Dunlap Cc: Daniel Lezcano Cc: Thomas Gleixner Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240120173624.16769-1-rdunlap@infradead.org --- drivers/clocksource/timer-ti-32k.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-ti-32k.c b/drivers/clocksource/timer-ti-32k.c index 59b0be482f32..a86529a70737 100644 --- a/drivers/clocksource/timer-ti-32k.c +++ b/drivers/clocksource/timer-ti-32k.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * timer-ti-32k.c - OMAP2 32k Timer Support * * Copyright (C) 2009 Nokia Corporation From 021d23428bdbae032294e8f4a29cb53cb50ae71c Mon Sep 17 00:00:00 2001 From: Wende Tan Date: Tue, 17 Oct 2023 15:21:04 -0700 Subject: [PATCH 006/496] RISC-V: build: Allow LTO to be selected Allow LTO to be selected for RISC-V, only when LLD >= 14, since there is an issue [1] in prior LLD versions that prevents LLD to generate proper machine code for RISC-V when writing `nop`s. To avoid boot failures in QEMU [2], '-mattr=+c' and '-mattr=+relax' need to be passed via '-mllvm' to ld.lld, as there appears to be an issue with LLVM's target-features and LTO [3], which can result in incorrect relocations to branch targets [4]. Once this is fixed in LLVM, it can be made conditional on affected ld.lld versions. Disable LTO for arch/riscv/kernel/pi, as llvm-objcopy expects an ELF object file when manipulating the files in that subfolder, rather than LLVM bitcode. [1] https://github.com/llvm/llvm-project/issues/50505, resolved by LLVM commit e63455d5e0e5 ("[MC] Use local MCSubtargetInfo in writeNops") [2] https://github.com/ClangBuiltLinux/linux/issues/1942 [3] https://github.com/llvm/llvm-project/issues/59350 [4] https://github.com/llvm/llvm-project/issues/65090 Tested-by: Wende Tan Signed-off-by: Wende Tan Co-developed-by: Nathan Chancellor Signed-off-by: Nathan Chancellor Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20231017-riscv-lto-v4-1-e7810b24e805@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 3 +++ arch/riscv/Makefile | 5 +++++ arch/riscv/kernel/pi/Makefile | 3 +++ 3 files changed, 11 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..60c8fd1a677f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -47,6 +47,9 @@ config RISCV select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGETLBFS if MMU + # LLD >= 14: https://github.com/llvm/llvm-project/issues/50505 + select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000 + select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 0b7d109258e7..252d63942f34 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -50,6 +50,11 @@ ifndef CONFIG_AS_IS_LLVM KBUILD_CFLAGS += -Wa,-mno-relax KBUILD_AFLAGS += -Wa,-mno-relax endif +# LLVM has an issue with target-features and LTO: https://github.com/llvm/llvm-project/issues/59350 +# Ensure it is aware of linker relaxation with LTO, otherwise relocations may +# be incorrect: https://github.com/llvm/llvm-project/issues/65090 +else ifeq ($(CONFIG_LTO_CLANG),y) + KBUILD_LDFLAGS += -mllvm -mattr=+c -mllvm -mattr=+relax endif ifeq ($(CONFIG_SHADOW_CALL_STACK),y) diff --git a/arch/riscv/kernel/pi/Makefile 
b/arch/riscv/kernel/pi/Makefile index 07915dc9279e..b75f150b923d 100644 --- a/arch/riscv/kernel/pi/Makefile +++ b/arch/riscv/kernel/pi/Makefile @@ -9,6 +9,9 @@ KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \ -fno-asynchronous-unwind-tables -fno-unwind-tables \ $(call cc-option,-fno-addrsig) +# Disable LTO +KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS)) + KBUILD_CFLAGS += -mcmodel=medany CFLAGS_cmdline_early.o += -D__NO_FORTIFY From df513ed49f0073ce1778eb469ab5db44bceade30 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sun, 21 Jan 2024 16:19:12 -0800 Subject: [PATCH 007/496] RISC-V: add helper function to read the vector VLEN VLEN describes the length of each vector register and some instructions need specific minimal VLENs to work correctly. The vector code already includes a variable riscv_v_vsize that contains the value of "32 vector registers with vlenb length" that gets filled during boot. vlenb is the value contained in the CSR_VLENB register and the value represents "VLEN / 8". So add riscv_vector_vlen() to return the actual VLEN value for in-kernel users when they need to check the available VLEN. Signed-off-by: Heiko Stuebner Reviewed-by: Eric Biggers Signed-off-by: Jerry Shih Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-2-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vector.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index 0cd6f0a027d1..731dcd0ed4de 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -284,4 +284,15 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; } #endif /* CONFIG_RISCV_ISA_V */ +/* + * Return the implementation's vlen value. + * + * riscv_v_vsize contains the value of "32 vector registers with vlenb length" + * so rebuild the vlen value in bits from it. + */ +static inline int riscv_vector_vlen(void) +{ + return riscv_v_vsize / 32 * 8; +} + #endif /* ! __ASM_RISCV_VECTOR_H */ From 34ca4ec628deb4f00da38c7d73486b8499e30dea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 21 Jan 2024 16:19:13 -0800 Subject: [PATCH 008/496] RISC-V: add TOOLCHAIN_HAS_VECTOR_CRYPTO Add a kconfig symbol that indicates whether the toolchain supports the vector crypto extensions. This is needed by the RISC-V crypto code. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-3-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..5613b2bb686e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -578,6 +578,13 @@ config TOOLCHAIN_HAS_ZBB depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900 depends on AS_HAS_OPTION_ARCH +# This symbol indicates that the toolchain supports all v1.0 vector crypto +# extensions, including Zvk*, Zvbb, and Zvbc. LLVM added all of these at once. +# binutils added all except Zvkb, then added Zvkb. So we just check for Zvkb. 
+config TOOLCHAIN_HAS_VECTOR_CRYPTO + def_bool $(as-instr, .option arch$(comma) +zvkb) + depends on AS_HAS_OPTION_ARCH + config RISCV_ISA_ZBB bool "Zbb extension support for bit manipulation instructions" depends on TOOLCHAIN_HAS_ZBB From 178f3856436c748485cb7f8c134be2471f6d539f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Sun, 21 Jan 2024 16:19:14 -0800 Subject: [PATCH 009/496] RISC-V: hook new crypto subdir into build-system Create a crypto subdirectory for added accelerated cryptography routines and hook it into the riscv Kbuild and the main crypto Kconfig. Signed-off-by: Heiko Stuebner Reviewed-by: Eric Biggers Signed-off-by: Jerry Shih Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-4-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kbuild | 1 + arch/riscv/crypto/Kconfig | 5 +++++ arch/riscv/crypto/Makefile | 1 + crypto/Kconfig | 3 +++ 4 files changed, 10 insertions(+) create mode 100644 arch/riscv/crypto/Kconfig create mode 100644 arch/riscv/crypto/Makefile diff --git a/arch/riscv/Kbuild b/arch/riscv/Kbuild index d25ad1c19f88..2c585f7a0b6e 100644 --- a/arch/riscv/Kbuild +++ b/arch/riscv/Kbuild @@ -2,6 +2,7 @@ obj-y += kernel/ mm/ net/ obj-$(CONFIG_BUILTIN_DTB) += boot/dts/ +obj-$(CONFIG_CRYPTO) += crypto/ obj-y += errata/ obj-$(CONFIG_KVM) += kvm/ diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig new file mode 100644 index 000000000000..10d60edc0110 --- /dev/null +++ b/arch/riscv/crypto/Kconfig @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Accelerated Cryptographic Algorithms for CPU (riscv)" + +endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile new file mode 100644 index 000000000000..a4e40e534e6a --- /dev/null +++ b/arch/riscv/crypto/Makefile @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0-only diff --git a/crypto/Kconfig b/crypto/Kconfig index 7d156c75f15f..00e4aa16bf2b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1496,6 +1496,9 @@ endif if PPC source "arch/powerpc/crypto/Kconfig" endif +if RISCV +source "arch/riscv/crypto/Kconfig" +endif if S390 source "arch/s390/crypto/Kconfig" endif From eb24af5d7a05bbcdebb0398f91af990719644093 Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:15 -0800 Subject: [PATCH 010/496] crypto: riscv - add vector crypto accelerated AES-{ECB,CBC,CTR,XTS} Add implementations of AES-ECB, AES-CBC, AES-CTR, and AES-XTS, as well as bare (single-block) AES, using the RISC-V vector crypto extensions. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using regular .S files instead of the so-called perlasm, using the assembler instead of bare '.inst', greatly reducing code duplication, supporting AES-192, and making the code use the same AES key structure as the C code. 
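
The modes added here are reached through the standard kernel crypto API rather
than any new interface. A minimal sketch (not code from this patch; the
function name is purely illustrative) of a caller allocating "xts(aes)", which
would transparently select the accelerated implementation once this driver is
registered at priority 300:

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/printk.h>

/*
 * Illustrative sketch only: allocate an "xts(aes)" transform and report which
 * implementation backs it. With the driver above registered at priority 300,
 * the crypto core would normally pick "xts-aes-riscv64-zvkned-zvbb-zvkg".
 */
static int xts_aes_example(void)
{
	struct crypto_skcipher *tfm;

	tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	pr_info("xts(aes): using %s\n",
		crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)));

	crypto_free_skcipher(tfm);
	return 0;
}

The same applies to the "ecb(aes)", "cbc(aes)" and "ctr(aes)" algorithms
registered below.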
Co-developed-by: Phoebe Chen Signed-off-by: Phoebe Chen Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-5-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 16 + arch/riscv/crypto/Makefile | 4 + arch/riscv/crypto/aes-macros.S | 156 +++++ arch/riscv/crypto/aes-riscv64-glue.c | 550 ++++++++++++++++++ .../crypto/aes-riscv64-zvkned-zvbb-zvkg.S | 312 ++++++++++ arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S | 146 +++++ arch/riscv/crypto/aes-riscv64-zvkned.S | 180 ++++++ 7 files changed, 1364 insertions(+) create mode 100644 arch/riscv/crypto/aes-macros.S create mode 100644 arch/riscv/crypto/aes-riscv64-glue.c create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S create mode 100644 arch/riscv/crypto/aes-riscv64-zvkned.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 10d60edc0110..ebe805fa3f5f 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -2,4 +2,20 @@ menu "Accelerated Cryptographic Algorithms for CPU (riscv)" +config CRYPTO_AES_RISCV64 + tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_ALGAPI + select CRYPTO_LIB_AES + select CRYPTO_SKCIPHER + help + Block cipher: AES cipher algorithms + Length-preserving ciphers: AES with ECB, CBC, CTR, XTS + + Architecture: riscv64 using: + - Zvkned vector crypto extension + - Zvbb vector extension (XTS) + - Zvkb vector crypto extension (CTR) + - Zvkg vector crypto extension (XTS) + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index a4e40e534e6a..44922df7d182 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -1 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_AES_RISCV64) += aes-riscv64.o +aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \ + aes-riscv64-zvkned-zvbb-zvkg.o aes-riscv64-zvkned-zvkb.o diff --git a/arch/riscv/crypto/aes-macros.S b/arch/riscv/crypto/aes-macros.S new file mode 100644 index 000000000000..d1a258d04bc7 --- /dev/null +++ b/arch/riscv/crypto/aes-macros.S @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file contains macros that are shared by the other aes-*.S files. The +// generated code of these macros depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector AES block cipher extension ('Zvkned') + +// Loads the AES round keys from \keyp into vector registers and jumps to code +// specific to the length of the key. Specifically: +// - If AES-128, loads round keys into v1-v11 and jumps to \label128. +// - If AES-192, loads round keys into v1-v13 and jumps to \label192. +// - If AES-256, loads round keys into v1-v15 and continues onwards. +// +// Also sets vl=4 and vtype=e32,m1,ta,ma. Clobbers t0 and t1. +.macro aes_begin keyp, label128, label192 + lwu t0, 480(\keyp) // t0 = key length in bytes + li t1, 24 // t1 = key length for AES-192 + vsetivli zero, 4, e32, m1, ta, ma + vle32.v v1, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v2, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v3, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v4, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v5, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v6, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v7, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v8, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v9, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v10, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v11, (\keyp) + blt t0, t1, \label128 // If AES-128, goto label128. + addi \keyp, \keyp, 16 + vle32.v v12, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v13, (\keyp) + beq t0, t1, \label192 // If AES-192, goto label192. + // Else, it's AES-256. + addi \keyp, \keyp, 16 + vle32.v v14, (\keyp) + addi \keyp, \keyp, 16 + vle32.v v15, (\keyp) +.endm + +// Encrypts \data using zvkned instructions, using the round keys loaded into +// v1-v11 (for AES-128), v1-v13 (for AES-192), or v1-v15 (for AES-256). \keylen +// is the AES key length in bits. vl and vtype must already be set +// appropriately. Note that if vl > 4, multiple blocks are encrypted. +.macro aes_encrypt data, keylen + vaesz.vs \data, v1 + vaesem.vs \data, v2 + vaesem.vs \data, v3 + vaesem.vs \data, v4 + vaesem.vs \data, v5 + vaesem.vs \data, v6 + vaesem.vs \data, v7 + vaesem.vs \data, v8 + vaesem.vs \data, v9 + vaesem.vs \data, v10 +.if \keylen == 128 + vaesef.vs \data, v11 +.elseif \keylen == 192 + vaesem.vs \data, v11 + vaesem.vs \data, v12 + vaesef.vs \data, v13 +.else + vaesem.vs \data, v11 + vaesem.vs \data, v12 + vaesem.vs \data, v13 + vaesem.vs \data, v14 + vaesef.vs \data, v15 +.endif +.endm + +// Same as aes_encrypt, but decrypts instead of encrypts. 
+.macro aes_decrypt data, keylen +.if \keylen == 128 + vaesz.vs \data, v11 +.elseif \keylen == 192 + vaesz.vs \data, v13 + vaesdm.vs \data, v12 + vaesdm.vs \data, v11 +.else + vaesz.vs \data, v15 + vaesdm.vs \data, v14 + vaesdm.vs \data, v13 + vaesdm.vs \data, v12 + vaesdm.vs \data, v11 +.endif + vaesdm.vs \data, v10 + vaesdm.vs \data, v9 + vaesdm.vs \data, v8 + vaesdm.vs \data, v7 + vaesdm.vs \data, v6 + vaesdm.vs \data, v5 + vaesdm.vs \data, v4 + vaesdm.vs \data, v3 + vaesdm.vs \data, v2 + vaesdf.vs \data, v1 +.endm + +// Expands to aes_encrypt or aes_decrypt according to \enc, which is 1 or 0. +.macro aes_crypt data, enc, keylen +.if \enc + aes_encrypt \data, \keylen +.else + aes_decrypt \data, \keylen +.endif +.endm diff --git a/arch/riscv/crypto/aes-riscv64-glue.c b/arch/riscv/crypto/aes-riscv64-glue.c new file mode 100644 index 000000000000..37bc6ef0be40 --- /dev/null +++ b/arch/riscv/crypto/aes-riscv64-glue.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AES using the RISC-V vector crypto extensions. Includes the bare block + * cipher and the ECB, CBC, CTR, and XTS modes. + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void aes_encrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 in[AES_BLOCK_SIZE], + u8 out[AES_BLOCK_SIZE]); +asmlinkage void aes_decrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 in[AES_BLOCK_SIZE], + u8 out[AES_BLOCK_SIZE]); + +asmlinkage void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len); +asmlinkage void aes_ecb_decrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len); + +asmlinkage void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + u8 iv[AES_BLOCK_SIZE]); +asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + u8 iv[AES_BLOCK_SIZE]); + +asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + u8 iv[AES_BLOCK_SIZE]); + +asmlinkage void aes_xts_encrypt_zvkned_zvbb_zvkg( + const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + u8 tweak[AES_BLOCK_SIZE]); + +asmlinkage void aes_xts_decrypt_zvkned_zvbb_zvkg( + const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + u8 tweak[AES_BLOCK_SIZE]); + +static int riscv64_aes_setkey(struct crypto_aes_ctx *ctx, + const u8 *key, unsigned int keylen) +{ + /* + * For now we just use the generic key expansion, for these reasons: + * + * - zvkned's key expansion instructions don't support AES-192. + * So, non-zvkned fallback code would be needed anyway. + * + * - Users of AES in Linux usually don't change keys frequently. + * So, key expansion isn't performance-critical. + * + * - For single-block AES exposed as a "cipher" algorithm, it's + * necessary to use struct crypto_aes_ctx and initialize its 'key_dec' + * field with the round keys for the Equivalent Inverse Cipher. This + * is because with "cipher", decryption can be requested from a + * context where the vector unit isn't usable, necessitating a + * fallback to aes_decrypt(). But, zvkned can only generate and use + * the normal round keys. Of course, it's preferable to not have + * special code just for "cipher", as e.g. 
XTS also uses a + * single-block AES encryption. It's simplest to just use + * struct crypto_aes_ctx and aes_expandkey() everywhere. + */ + return aes_expandkey(ctx, key, keylen); +} + +static int riscv64_aes_setkey_cipher(struct crypto_tfm *tfm, + const u8 *key, unsigned int keylen) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + return riscv64_aes_setkey(ctx, key, keylen); +} + +static int riscv64_aes_setkey_skcipher(struct crypto_skcipher *tfm, + const u8 *key, unsigned int keylen) +{ + struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + + return riscv64_aes_setkey(ctx, key, keylen); +} + +/* Bare AES, without a mode of operation */ + +static void riscv64_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + aes_encrypt_zvkned(ctx, src, dst); + kernel_vector_end(); + } else { + aes_encrypt(ctx, dst, src); + } +} + +static void riscv64_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + aes_decrypt_zvkned(ctx, src, dst); + kernel_vector_end(); + } else { + aes_decrypt(ctx, dst, src); + } +} + +/* AES-ECB */ + +static inline int riscv64_aes_ecb_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + kernel_vector_begin(); + if (enc) + aes_ecb_encrypt_zvkned(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + nbytes & ~(AES_BLOCK_SIZE - 1)); + else + aes_ecb_decrypt_zvkned(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + nbytes & ~(AES_BLOCK_SIZE - 1)); + kernel_vector_end(); + err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1)); + } + + return err; +} + +static int riscv64_aes_ecb_encrypt(struct skcipher_request *req) +{ + return riscv64_aes_ecb_crypt(req, true); +} + +static int riscv64_aes_ecb_decrypt(struct skcipher_request *req) +{ + return riscv64_aes_ecb_crypt(req, false); +} + +/* AES-CBC */ + +static inline int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + int err; + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + kernel_vector_begin(); + if (enc) + aes_cbc_encrypt_zvkned(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + nbytes & ~(AES_BLOCK_SIZE - 1), + walk.iv); + else + aes_cbc_decrypt_zvkned(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + nbytes & ~(AES_BLOCK_SIZE - 1), + walk.iv); + kernel_vector_end(); + err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1)); + } + + return err; +} + +static int riscv64_aes_cbc_encrypt(struct skcipher_request *req) +{ + return riscv64_aes_cbc_crypt(req, true); +} + +static int riscv64_aes_cbc_decrypt(struct skcipher_request *req) +{ + return riscv64_aes_cbc_crypt(req, false); +} + +/* AES-CTR */ + +static int riscv64_aes_ctr_crypt(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + unsigned int nbytes, p1_nbytes; + struct skcipher_walk walk; + u32 ctr32, 
nblocks; + int err; + + /* Get the low 32-bit word of the 128-bit big endian counter. */ + ctr32 = get_unaligned_be32(req->iv + 12); + + err = skcipher_walk_virt(&walk, req, false); + while ((nbytes = walk.nbytes) != 0) { + if (nbytes < walk.total) { + /* Not the end yet, so keep the length block-aligned. */ + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + nblocks = nbytes / AES_BLOCK_SIZE; + } else { + /* It's the end, so include any final partial block. */ + nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE); + } + ctr32 += nblocks; + + kernel_vector_begin(); + if (ctr32 >= nblocks) { + /* The low 32-bit word of the counter won't overflow. */ + aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, + req->iv); + } else { + /* + * The low 32-bit word of the counter will overflow. + * The assembly doesn't handle this case, so split the + * operation into two at the point where the overflow + * will occur. After the first part, add the carry bit. + */ + p1_nbytes = min_t(unsigned int, nbytes, + (nblocks - ctr32) * AES_BLOCK_SIZE); + aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr, + walk.dst.virt.addr, + p1_nbytes, req->iv); + crypto_inc(req->iv, 12); + + if (ctr32) { + aes_ctr32_crypt_zvkned_zvkb( + ctx, + walk.src.virt.addr + p1_nbytes, + walk.dst.virt.addr + p1_nbytes, + nbytes - p1_nbytes, req->iv); + } + } + kernel_vector_end(); + + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +/* AES-XTS */ + +struct riscv64_aes_xts_ctx { + struct crypto_aes_ctx ctx1; + struct crypto_aes_ctx ctx2; +}; + +static int riscv64_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key, + unsigned int keylen) +{ + struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + + return xts_verify_key(tfm, key, keylen) ?: + riscv64_aes_setkey(&ctx->ctx1, key, keylen / 2) ?: + riscv64_aes_setkey(&ctx->ctx2, key + keylen / 2, keylen / 2); +} + +static int riscv64_aes_xts_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm); + int tail = req->cryptlen % AES_BLOCK_SIZE; + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + /* Encrypt the IV with the tweak key to get the first tweak. */ + kernel_vector_begin(); + aes_encrypt_zvkned(&ctx->ctx2, req->iv, req->iv); + kernel_vector_end(); + + err = skcipher_walk_virt(&walk, req, false); + + /* + * If the message length isn't divisible by the AES block size and the + * full message isn't available in one step of the scatterlist walk, + * then separate off the last full block and the partial block. This + * ensures that they are processed in the same call to the assembly + * function, which is required for ciphertext stealing. 
+ */ + if (unlikely(tail > 0 && walk.nbytes < walk.total)) { + skcipher_walk_abort(&walk); + + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + req->cryptlen - tail - AES_BLOCK_SIZE, + req->iv); + req = &subreq; + err = skcipher_walk_virt(&walk, req, false); + } else { + tail = 0; + } + + while (walk.nbytes) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, AES_BLOCK_SIZE); + + kernel_vector_begin(); + if (enc) + aes_xts_encrypt_zvkned_zvbb_zvkg( + &ctx->ctx1, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, req->iv); + else + aes_xts_decrypt_zvkned_zvbb_zvkg( + &ctx->ctx1, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, req->iv); + kernel_vector_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + if (err || likely(!tail)) + return err; + + /* Do ciphertext stealing with the last full block and partial block. */ + + dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); + + skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, + req->iv); + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + + kernel_vector_begin(); + if (enc) + aes_xts_encrypt_zvkned_zvbb_zvkg( + &ctx->ctx1, walk.src.virt.addr, + walk.dst.virt.addr, walk.nbytes, req->iv); + else + aes_xts_decrypt_zvkned_zvbb_zvkg( + &ctx->ctx1, walk.src.virt.addr, + walk.dst.virt.addr, walk.nbytes, req->iv); + kernel_vector_end(); + + return skcipher_walk_done(&walk, 0); +} + +static int riscv64_aes_xts_encrypt(struct skcipher_request *req) +{ + return riscv64_aes_xts_crypt(req, true); +} + +static int riscv64_aes_xts_decrypt(struct skcipher_request *req) +{ + return riscv64_aes_xts_crypt(req, false); +} + +/* Algorithm definitions */ + +static struct crypto_alg riscv64_zvkned_aes_cipher_alg = { + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_priority = 300, + .cra_name = "aes", + .cra_driver_name = "aes-riscv64-zvkned", + .cra_cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = riscv64_aes_setkey_cipher, + .cia_encrypt = riscv64_aes_encrypt, + .cia_decrypt = riscv64_aes_decrypt, + }, + .cra_module = THIS_MODULE, +}; + +static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = { + { + .setkey = riscv64_aes_setkey_skcipher, + .encrypt = riscv64_aes_ecb_encrypt, + .decrypt = riscv64_aes_ecb_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .walksize = 8 * AES_BLOCK_SIZE, /* matches LMUL=8 */ + .base = { + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_priority = 300, + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-riscv64-zvkned", + .cra_module = THIS_MODULE, + }, + }, { + .setkey = riscv64_aes_setkey_skcipher, + .encrypt = riscv64_aes_cbc_encrypt, + .decrypt = riscv64_aes_cbc_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .base = { + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_priority = 300, + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-riscv64-zvkned", + .cra_module = THIS_MODULE, + }, + } +}; + +static struct skcipher_alg riscv64_zvkned_zvkb_aes_skcipher_alg = { + .setkey = 
riscv64_aes_setkey_skcipher, + .encrypt = riscv64_aes_ctr_crypt, + .decrypt = riscv64_aes_ctr_crypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */ + .base = { + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_priority = 300, + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-riscv64-zvkned-zvkb", + .cra_module = THIS_MODULE, + }, +}; + +static struct skcipher_alg riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg = { + .setkey = riscv64_aes_xts_setkey, + .encrypt = riscv64_aes_xts_encrypt, + .decrypt = riscv64_aes_xts_decrypt, + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .chunksize = AES_BLOCK_SIZE, + .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */ + .base = { + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct riscv64_aes_xts_ctx), + .cra_priority = 300, + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-riscv64-zvkned-zvbb-zvkg", + .cra_module = THIS_MODULE, + }, +}; + +static inline bool riscv64_aes_xts_supported(void) +{ + return riscv_isa_extension_available(NULL, ZVBB) && + riscv_isa_extension_available(NULL, ZVKG) && + riscv_vector_vlen() < 2048 /* Implementation limitation */; +} + +static int __init riscv64_aes_mod_init(void) +{ + int err = -ENODEV; + + if (riscv_isa_extension_available(NULL, ZVKNED) && + riscv_vector_vlen() >= 128) { + err = crypto_register_alg(&riscv64_zvkned_aes_cipher_alg); + if (err) + return err; + + err = crypto_register_skciphers( + riscv64_zvkned_aes_skcipher_algs, + ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs)); + if (err) + goto unregister_zvkned_cipher_alg; + + if (riscv_isa_extension_available(NULL, ZVKB)) { + err = crypto_register_skcipher( + &riscv64_zvkned_zvkb_aes_skcipher_alg); + if (err) + goto unregister_zvkned_skcipher_algs; + } + + if (riscv64_aes_xts_supported()) { + err = crypto_register_skcipher( + &riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg); + if (err) + goto unregister_zvkned_zvkb_skcipher_alg; + } + } + + return err; + +unregister_zvkned_zvkb_skcipher_alg: + if (riscv_isa_extension_available(NULL, ZVKB)) + crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg); +unregister_zvkned_skcipher_algs: + crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs, + ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs)); +unregister_zvkned_cipher_alg: + crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg); + return err; +} + +static void __exit riscv64_aes_mod_exit(void) +{ + if (riscv64_aes_xts_supported()) + crypto_unregister_skcipher(&riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg); + if (riscv_isa_extension_available(NULL, ZVKB)) + crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg); + crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs, + ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs)); + crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg); +} + +module_init(riscv64_aes_mod_init); +module_exit(riscv64_aes_mod_exit); + +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS (RISC-V accelerated)"); +MODULE_AUTHOR("Jerry Shih "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("aes"); +MODULE_ALIAS_CRYPTO("ecb(aes)"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("ctr(aes)"); +MODULE_ALIAS_CRYPTO("xts(aes)"); diff --git a/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S b/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S new file mode 100644 index 000000000000..146fc9cfb268 --- /dev/null 
+++ b/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048 +// - RISC-V Vector AES block cipher extension ('Zvkned') +// - RISC-V Vector Bit-manipulation extension ('Zvbb') +// - RISC-V Vector GCM/GMAC extension ('Zvkg') + +#include + +.text +.option arch, +zvkned, +zvbb, +zvkg + +#include "aes-macros.S" + +#define KEYP a0 +#define INP a1 +#define OUTP a2 +#define LEN a3 +#define TWEAKP a4 + +#define LEN32 a5 +#define TAIL_LEN a6 +#define VL a7 +#define VLMAX t4 + +// v1-v15 contain the AES round keys, but they are used for temporaries before +// the AES round keys have been loaded. +#define TWEAKS v16 // LMUL=4 (most of the time) +#define TWEAKS_BREV v20 // LMUL=4 (most of the time) +#define MULTS_BREV v24 // LMUL=4 (most of the time) +#define TMP0 v28 +#define TMP1 v29 +#define TMP2 v30 +#define TMP3 v31 + +// xts_init initializes the following values: +// +// TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1) +// TWEAKS_BREV: same as TWEAKS, but bit-reversed +// MULTS_BREV: N 128-bit values x^N, bit-reversed. Only if N > 1. +// +// N is the maximum number of blocks that will be processed per loop iteration, +// computed using vsetvli. +// +// The field convention used by XTS is the same as that of GHASH, but with the +// bits reversed within each byte. The zvkg extension provides the vgmul +// instruction which does multiplication in this field. 
Therefore, for tweak +// computation we use vgmul to do multiplications in parallel, instead of +// serially multiplying by x using shifting+xoring. Note that for this to work, +// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8). +.macro xts_init + + // Load the first tweak T. + vsetivli zero, 4, e32, m1, ta, ma + vle32.v TWEAKS, (TWEAKP) + + // If there's only one block (or no blocks at all), then skip the tweak + // sequence computation because (at most) T itself is needed. + li t0, 16 + ble LEN, t0, .Linit_single_block\@ + + // Save a copy of T bit-reversed in v12. + vbrev8.v v12, TWEAKS + + // + // Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming + // that N <= 128. Though, this code actually requires N < 64 (or + // equivalently VLEN < 2048) due to the use of 64-bit intermediate + // values here and in the x^N computation later. + // + vsetvli VL, LEN32, e32, m4, ta, ma + srli t0, VL, 2 // t0 = N (num blocks) + // Generate two sequences, each with N 32-bit values: + // v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...]. + vsetvli zero, t0, e32, m1, ta, ma + vmv.v.i v0, 1 + vid.v v1 + // Use vzext to zero-extend the sequences to 64 bits. Reinterpret them + // as two sequences, each with 2*N 32-bit values: + // v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...]. + vsetvli zero, t0, e64, m2, ta, ma + vzext.vf2 v2, v0 + vzext.vf2 v4, v1 + slli t1, t0, 1 // t1 = 2*N + vsetvli zero, t1, e32, m2, ta, ma + // Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...], + // widening to 64 bits per element. When reinterpreted as N 128-bit + // values, this is the needed sequence of 128-bit values 1 << i (x^i). + vwsll.vv v8, v2, v4 + + // Copy the bit-reversed T to all N elements of TWEAKS_BREV, then + // multiply by x^i. This gives the sequence T*(x^i), bit-reversed. + vsetvli zero, LEN32, e32, m4, ta, ma + vmv.v.i TWEAKS_BREV, 0 + vaesz.vs TWEAKS_BREV, v12 + vbrev8.v v8, v8 + vgmul.vv TWEAKS_BREV, v8 + + // Save a copy of the sequence T*(x^i) with the bit reversal undone. + vbrev8.v TWEAKS, TWEAKS_BREV + + // Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed. + li t1, 1 + sll t1, t1, t0 // t1 = 1 << N + vsetivli zero, 2, e64, m1, ta, ma + vmv.v.i v0, 0 + vsetivli zero, 1, e64, m1, tu, ma + vmv.v.x v0, t1 + vbrev8.v v0, v0 + vsetvli zero, LEN32, e32, m4, ta, ma + vmv.v.i MULTS_BREV, 0 + vaesz.vs MULTS_BREV, v0 + + j .Linit_done\@ + +.Linit_single_block\@: + vbrev8.v TWEAKS_BREV, TWEAKS +.Linit_done\@: +.endm + +// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed. This is +// the multiplier required to advance the tweak by one. +.macro load_x + li t0, 0x40 + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.i MULTS_BREV, 0 + vsetivli zero, 1, e8, m1, tu, ma + vmv.v.x MULTS_BREV, t0 +.endm + +.macro __aes_xts_crypt enc, keylen + // With 16 < len <= 31, there's no main loop, just ciphertext stealing. + beqz LEN32, .Lcts_without_main_loop\@ + + vsetvli VLMAX, zero, e32, m4, ta, ma +1: + vsetvli VL, LEN32, e32, m4, ta, ma +2: + // Encrypt or decrypt VL/4 blocks. + vle32.v TMP0, (INP) + vxor.vv TMP0, TMP0, TWEAKS + aes_crypt TMP0, \enc, \keylen + vxor.vv TMP0, TMP0, TWEAKS + vse32.v TMP0, (OUTP) + + // Update the pointers and the remaining length. + slli t0, VL, 2 + add INP, INP, t0 + add OUTP, OUTP, t0 + sub LEN32, LEN32, VL + + // Check whether more blocks remain. + beqz LEN32, .Lmain_loop_done\@ + + // Compute the next sequence of tweaks by multiplying the previous + // sequence by x^N. 
Store the result in both bit-reversed order and + // regular order (i.e. with the bit reversal undone). + vgmul.vv TWEAKS_BREV, MULTS_BREV + vbrev8.v TWEAKS, TWEAKS_BREV + + // Since we compute the tweak multipliers x^N in advance, we require + // that each iteration process the same length except possibly the last. + // This conflicts slightly with the behavior allowed by RISC-V Vector + // Extension, where CPUs can select a lower length for both of the last + // two iterations. E.g., vl might take the sequence of values + // [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we + // can use x^4 again instead of computing x^3. Therefore, we explicitly + // keep the vl at VLMAX if there is at least VLMAX remaining. + bge LEN32, VLMAX, 2b + j 1b + +.Lmain_loop_done\@: + load_x + + // Compute the next tweak. + addi t0, VL, -4 + vsetivli zero, 4, e32, m4, ta, ma + vslidedown.vx TWEAKS_BREV, TWEAKS_BREV, t0 // Extract last tweak + vsetivli zero, 4, e32, m1, ta, ma + vgmul.vv TWEAKS_BREV, MULTS_BREV // Advance to next tweak + + bnez TAIL_LEN, .Lcts\@ + + // Update *TWEAKP to contain the next tweak. + vbrev8.v TWEAKS, TWEAKS_BREV + vse32.v TWEAKS, (TWEAKP) + ret + +.Lcts_without_main_loop\@: + load_x +.Lcts\@: + // TWEAKS_BREV now contains the next tweak. Compute the one after that. + vsetivli zero, 4, e32, m1, ta, ma + vmv.v.v TMP0, TWEAKS_BREV + vgmul.vv TMP0, MULTS_BREV + // Undo the bit reversal of the next two tweaks and store them in TMP1 + // and TMP2, such that TMP1 is the first needed and TMP2 the second. +.if \enc + vbrev8.v TMP1, TWEAKS_BREV + vbrev8.v TMP2, TMP0 +.else + vbrev8.v TMP1, TMP0 + vbrev8.v TMP2, TWEAKS_BREV +.endif + + // Encrypt/decrypt the last full block. + vle32.v TMP0, (INP) + vxor.vv TMP0, TMP0, TMP1 + aes_crypt TMP0, \enc, \keylen + vxor.vv TMP0, TMP0, TMP1 + + // Swap the first TAIL_LEN bytes of the above result with the tail. + // Note that to support in-place encryption/decryption, the load from + // the input tail must happen before the store to the output tail. + addi t0, INP, 16 + addi t1, OUTP, 16 + vmv.v.v TMP3, TMP0 + vsetvli zero, TAIL_LEN, e8, m1, tu, ma + vle8.v TMP0, (t0) + vse8.v TMP3, (t1) + + // Encrypt/decrypt again and store the last full block. + vsetivli zero, 4, e32, m1, ta, ma + vxor.vv TMP0, TMP0, TMP2 + aes_crypt TMP0, \enc, \keylen + vxor.vv TMP0, TMP0, TMP2 + vse32.v TMP0, (OUTP) + + ret +.endm + +.macro aes_xts_crypt enc + + // Check whether the length is a multiple of the AES block size. + andi TAIL_LEN, LEN, 15 + beqz TAIL_LEN, 1f + + // The length isn't a multiple of the AES block size, so ciphertext + // stealing will be required. Ciphertext stealing involves special + // handling of the partial block and the last full block, so subtract + // the length of both from the length to be processed in the main loop. + sub LEN, LEN, TAIL_LEN + addi LEN, LEN, -16 +1: + srli LEN32, LEN, 2 + // LEN and LEN32 now contain the total length of the blocks that will be + // processed in the main loop, in bytes and 32-bit words respectively. + + xts_init + aes_begin KEYP, 128f, 192f + __aes_xts_crypt \enc, 256 +128: + __aes_xts_crypt \enc, 128 +192: + __aes_xts_crypt \enc, 192 +.endm + +// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key, +// const u8 *in, u8 *out, size_t len, +// u8 tweak[16]); +// +// |key| is the data key. |tweak| contains the next tweak; the encryption of +// the original IV with the tweak key was already done. 
This function supports +// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and +// |len| must be a multiple of 16 except on the last call. If |len| is a +// multiple of 16, then this function updates |tweak| to contain the next tweak. +SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg) + aes_xts_crypt 1 +SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg) + +// Same prototype and calling convention as the encryption function +SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg) + aes_xts_crypt 0 +SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg) diff --git a/arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S b/arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S new file mode 100644 index 000000000000..9962d4500587 --- /dev/null +++ b/arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector AES block cipher extension ('Zvkned') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvkned, +zvkb + +#include "aes-macros.S" + +#define KEYP a0 +#define INP a1 +#define OUTP a2 +#define LEN a3 +#define IVP a4 + +#define LEN32 a5 +#define VL_E32 a6 +#define VL_BLOCKS a7 + +.macro aes_ctr32_crypt keylen + // LEN32 = number of blocks, rounded up, in 32-bit words. + addi t0, LEN, 15 + srli t0, t0, 4 + slli LEN32, t0, 2 + + // Create a mask that selects the last 32-bit word of each 128-bit + // block. This is the word that contains the (big-endian) counter. + li t0, 0x88 + vsetvli t1, zero, e8, m1, ta, ma + vmv.v.x v0, t0 + + // Load the IV into v31. 
The last 32-bit word contains the counter. + vsetivli zero, 4, e32, m1, ta, ma + vle32.v v31, (IVP) + + // Convert the big-endian counter into little-endian. + vsetivli zero, 4, e32, m1, ta, mu + vrev8.v v31, v31, v0.t + + // Splat the IV to v16 (with LMUL=4). The number of copies is the + // maximum number of blocks that will be processed per iteration. + vsetvli zero, LEN32, e32, m4, ta, ma + vmv.v.i v16, 0 + vaesz.vs v16, v31 + + // v20 = [x, x, x, 0, x, x, x, 1, ...] + viota.m v20, v0, v0.t + // v16 = [IV0, IV1, IV2, counter+0, IV0, IV1, IV2, counter+1, ...] + vsetvli VL_E32, LEN32, e32, m4, ta, mu + vadd.vv v16, v16, v20, v0.t + + j 2f +1: + // Set the number of blocks to process in this iteration. vl=VL_E32 is + // the length in 32-bit words, i.e. 4 times the number of blocks. + vsetvli VL_E32, LEN32, e32, m4, ta, mu + + // Increment the counters by the number of blocks processed in the + // previous iteration. + vadd.vx v16, v16, VL_BLOCKS, v0.t +2: + // Prepare the AES inputs into v24. + vmv.v.v v24, v16 + vrev8.v v24, v24, v0.t // Convert counters back to big-endian. + + // Encrypt the AES inputs to create the next portion of the keystream. + aes_encrypt v24, \keylen + + // XOR the data with the keystream. + vsetvli t0, LEN, e8, m4, ta, ma + vle8.v v20, (INP) + vxor.vv v20, v20, v24 + vse8.v v20, (OUTP) + + // Advance the pointers and update the remaining length. + add INP, INP, t0 + add OUTP, OUTP, t0 + sub LEN, LEN, t0 + sub LEN32, LEN32, VL_E32 + srli VL_BLOCKS, VL_E32, 2 + + // Repeat if more data remains. + bnez LEN, 1b + + // Update *IVP to contain the next counter. + vsetivli zero, 4, e32, m1, ta, mu + vadd.vx v16, v16, VL_BLOCKS, v0.t + vrev8.v v16, v16, v0.t // Convert counters back to big-endian. + vse32.v v16, (IVP) + + ret +.endm + +// void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key, +// const u8 *in, u8 *out, size_t len, +// u8 iv[16]); +SYM_FUNC_START(aes_ctr32_crypt_zvkned_zvkb) + aes_begin KEYP, 128f, 192f + aes_ctr32_crypt 256 +128: + aes_ctr32_crypt 128 +192: + aes_ctr32_crypt 192 +SYM_FUNC_END(aes_ctr32_crypt_zvkned_zvkb) diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S new file mode 100644 index 000000000000..78d4e1186c07 --- /dev/null +++ b/arch/riscv/crypto/aes-riscv64-zvkned.S @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector AES block cipher extension ('Zvkned') + +#include + +.text +.option arch, +zvkned + +#include "aes-macros.S" + +#define KEYP a0 +#define INP a1 +#define OUTP a2 +#define LEN a3 +#define IVP a4 + +.macro __aes_crypt_zvkned enc, keylen + vle32.v v16, (INP) + aes_crypt v16, \enc, \keylen + vse32.v v16, (OUTP) + ret +.endm + +.macro aes_crypt_zvkned enc + aes_begin KEYP, 128f, 192f + __aes_crypt_zvkned \enc, 256 +128: + __aes_crypt_zvkned \enc, 128 +192: + __aes_crypt_zvkned \enc, 192 +.endm + +// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key, +// const u8 in[16], u8 out[16]); +SYM_FUNC_START(aes_encrypt_zvkned) + aes_crypt_zvkned 1 +SYM_FUNC_END(aes_encrypt_zvkned) + +// Same prototype and calling convention as the encryption function +SYM_FUNC_START(aes_decrypt_zvkned) + aes_crypt_zvkned 0 +SYM_FUNC_END(aes_decrypt_zvkned) + +.macro __aes_ecb_crypt enc, keylen + srli t0, LEN, 2 + // t0 is the remaining length in 32-bit words. It's a multiple of 4. +1: + vsetvli t1, t0, e32, m8, ta, ma + sub t0, t0, t1 // Subtract number of words processed + slli t1, t1, 2 // Words to bytes + vle32.v v16, (INP) + aes_crypt v16, \enc, \keylen + vse32.v v16, (OUTP) + add INP, INP, t1 + add OUTP, OUTP, t1 + bnez t0, 1b + + ret +.endm + +.macro aes_ecb_crypt enc + aes_begin KEYP, 128f, 192f + __aes_ecb_crypt \enc, 256 +128: + __aes_ecb_crypt \enc, 128 +192: + __aes_ecb_crypt \enc, 192 +.endm + +// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key, +// const u8 *in, u8 *out, size_t len); +// +// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE). 
+SYM_FUNC_START(aes_ecb_encrypt_zvkned) + aes_ecb_crypt 1 +SYM_FUNC_END(aes_ecb_encrypt_zvkned) + +// Same prototype and calling convention as the encryption function +SYM_FUNC_START(aes_ecb_decrypt_zvkned) + aes_ecb_crypt 0 +SYM_FUNC_END(aes_ecb_decrypt_zvkned) + +.macro aes_cbc_encrypt keylen + vle32.v v16, (IVP) // Load IV +1: + vle32.v v17, (INP) // Load plaintext block + vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block + aes_encrypt v16, \keylen // Encrypt + vse32.v v16, (OUTP) // Store ciphertext block + addi INP, INP, 16 + addi OUTP, OUTP, 16 + addi LEN, LEN, -16 + bnez LEN, 1b + + vse32.v v16, (IVP) // Store next IV + ret +.endm + +.macro aes_cbc_decrypt keylen + vle32.v v16, (IVP) // Load IV +1: + vle32.v v17, (INP) // Load ciphertext block + vmv.v.v v18, v17 // Save ciphertext block + aes_decrypt v17, \keylen // Decrypt + vxor.vv v17, v17, v16 // XOR with IV or prev ciphertext block + vse32.v v17, (OUTP) // Store plaintext block + vmv.v.v v16, v18 // Next "IV" is prev ciphertext block + addi INP, INP, 16 + addi OUTP, OUTP, 16 + addi LEN, LEN, -16 + bnez LEN, 1b + + vse32.v v16, (IVP) // Store next IV + ret +.endm + +// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key, +// const u8 *in, u8 *out, size_t len, u8 iv[16]); +// +// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE). +SYM_FUNC_START(aes_cbc_encrypt_zvkned) + aes_begin KEYP, 128f, 192f + aes_cbc_encrypt 256 +128: + aes_cbc_encrypt 128 +192: + aes_cbc_encrypt 192 +SYM_FUNC_END(aes_cbc_encrypt_zvkned) + +// Same prototype and calling convention as the encryption function +SYM_FUNC_START(aes_cbc_decrypt_zvkned) + aes_begin KEYP, 128f, 192f + aes_cbc_decrypt 256 +128: + aes_cbc_decrypt 128 +192: + aes_cbc_decrypt 192 +SYM_FUNC_END(aes_cbc_decrypt_zvkned) From bb54668837a073f18e173dd7be63f9ef5ee9f7ac Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:16 -0800 Subject: [PATCH 011/496] crypto: riscv - add vector crypto accelerated ChaCha20 Add an implementation of ChaCha20 using the Zvkb extension. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', and reducing code duplication. 
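For context, kernel users never call chacha20_zvkb() directly: the glue code in this patch registers the algorithm under the name "chacha20" (driver name "chacha20-riscv64-zvkb"), and callers reach it through the generic skcipher API. A minimal sketch of such a caller follows; it is illustrative only and not part of this patch, the function and buffer names are placeholders, and error handling is kept to the bare minimum.

#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>

/*
 * Illustrative only: encrypt @len bytes in place with ChaCha20 through the
 * crypto API. @buf must be a linearly mapped buffer (e.g. from kmalloc), not
 * a stack buffer, since it is wrapped in a scatterlist.
 */
static int chacha20_encrypt_example(const u8 key[32], u8 iv[16],
				    u8 *buf, unsigned int len)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	int err;

	/* Ask for a synchronous "chacha20"; the core picks the highest
	 * priority provider, which is this driver when Zvkb is usable. */
	tfm = crypto_alloc_skcipher("chacha20", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, 32);
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}
	skcipher_request_set_callback(req, 0, NULL, NULL);

	sg_init_one(&sg, buf, len);
	/* iv[] = 32-bit little-endian block counter, then the 96-bit nonce,
	 * matching the layout consumed by the glue code in this patch. */
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);
	err = crypto_skcipher_encrypt(req);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}
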
Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-6-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 11 + arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/chacha-riscv64-glue.c | 101 ++++++++ arch/riscv/crypto/chacha-riscv64-zvkb.S | 294 ++++++++++++++++++++++++ 4 files changed, 409 insertions(+) create mode 100644 arch/riscv/crypto/chacha-riscv64-glue.c create mode 100644 arch/riscv/crypto/chacha-riscv64-zvkb.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index ebe805fa3f5f..cb59e1d95495 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -18,4 +18,15 @@ config CRYPTO_AES_RISCV64 - Zvkb vector crypto extension (CTR) - Zvkg vector crypto extension (XTS) +config CRYPTO_CHACHA_RISCV64 + tristate "Ciphers: ChaCha" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_SKCIPHER + select CRYPTO_LIB_CHACHA_GENERIC + help + Length-preserving ciphers: ChaCha20 stream cipher algorithm + + Architecture: riscv64 using: + - Zvkb vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index 44922df7d182..ee994c8e6550 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -3,3 +3,6 @@ obj-$(CONFIG_CRYPTO_AES_RISCV64) += aes-riscv64.o aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \ aes-riscv64-zvkned-zvbb-zvkg.o aes-riscv64-zvkned-zvkb.o + +obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o +chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o diff --git a/arch/riscv/crypto/chacha-riscv64-glue.c b/arch/riscv/crypto/chacha-riscv64-glue.c new file mode 100644 index 000000000000..10b46f36375a --- /dev/null +++ b/arch/riscv/crypto/chacha-riscv64-glue.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ChaCha20 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include + +asmlinkage void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, + size_t len, const u32 iv[4]); + +static int riscv64_chacha20_crypt(struct skcipher_request *req) +{ + u32 iv[CHACHA_IV_SIZE / sizeof(u32)]; + u8 block_buffer[CHACHA_BLOCK_SIZE]; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; + unsigned int tail_bytes; + int err; + + iv[0] = get_unaligned_le32(req->iv); + iv[1] = get_unaligned_le32(req->iv + 4); + iv[2] = get_unaligned_le32(req->iv + 8); + iv[3] = get_unaligned_le32(req->iv + 12); + + err = skcipher_walk_virt(&walk, req, false); + while (walk.nbytes) { + nbytes = walk.nbytes & ~(CHACHA_BLOCK_SIZE - 1); + tail_bytes = walk.nbytes & (CHACHA_BLOCK_SIZE - 1); + kernel_vector_begin(); + if (nbytes) { + chacha20_zvkb(ctx->key, walk.src.virt.addr, + walk.dst.virt.addr, nbytes, iv); + iv[0] += nbytes / CHACHA_BLOCK_SIZE; + } + if (walk.nbytes == walk.total && tail_bytes > 0) { + memcpy(block_buffer, walk.src.virt.addr + nbytes, + tail_bytes); + chacha20_zvkb(ctx->key, block_buffer, block_buffer, + CHACHA_BLOCK_SIZE, iv); + memcpy(walk.dst.virt.addr + nbytes, block_buffer, + tail_bytes); + tail_bytes = 0; + } + kernel_vector_end(); + + err = skcipher_walk_done(&walk, tail_bytes); + } + + return err; +} + +static struct skcipher_alg riscv64_chacha_alg = { + .setkey = chacha20_setkey, + .encrypt = riscv64_chacha20_crypt, + .decrypt = riscv64_chacha20_crypt, + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .base = { + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct chacha_ctx), + .cra_priority = 300, + .cra_name = "chacha20", + .cra_driver_name = "chacha20-riscv64-zvkb", + .cra_module = THIS_MODULE, + }, +}; + +static int __init riscv64_chacha_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + return crypto_register_skcipher(&riscv64_chacha_alg); + + return -ENODEV; +} + +static void __exit riscv64_chacha_mod_exit(void) +{ + crypto_unregister_skcipher(&riscv64_chacha_alg); +} + +module_init(riscv64_chacha_mod_init); +module_exit(riscv64_chacha_mod_exit); + +MODULE_DESCRIPTION("ChaCha20 (RISC-V accelerated)"); +MODULE_AUTHOR("Jerry Shih "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/riscv/crypto/chacha-riscv64-zvkb.S b/arch/riscv/crypto/chacha-riscv64-zvkb.S new file mode 100644 index 000000000000..bf057737ac69 --- /dev/null +++ b/arch/riscv/crypto/chacha-riscv64-zvkb.S @@ -0,0 +1,294 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvkb + +#define KEYP a0 +#define INP a1 +#define OUTP a2 +#define LEN a3 +#define IVP a4 + +#define CONSTS0 a5 +#define CONSTS1 a6 +#define CONSTS2 a7 +#define CONSTS3 t0 +#define TMP t1 +#define VL t2 +#define STRIDE t3 +#define NROUNDS t4 +#define KEY0 s0 +#define KEY1 s1 +#define KEY2 s2 +#define KEY3 s3 +#define KEY4 s4 +#define KEY5 s5 +#define KEY6 s6 +#define KEY7 s7 +#define COUNTER s8 +#define NONCE0 s9 +#define NONCE1 s10 +#define NONCE2 s11 + +.macro chacha_round a0, b0, c0, d0, a1, b1, c1, d1, \ + a2, b2, c2, d2, a3, b3, c3, d3 + // a += b; d ^= a; d = rol(d, 16); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 16 + vror.vi \d1, \d1, 32 - 16 + vror.vi \d2, \d2, 32 - 16 + vror.vi \d3, \d3, 32 - 16 + + // c += d; b ^= c; b = rol(b, 12); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 12 + vror.vi \b1, \b1, 32 - 12 + vror.vi \b2, \b2, 32 - 12 + vror.vi \b3, \b3, 32 - 12 + + // a += b; d ^= a; d = rol(d, 8); + vadd.vv \a0, \a0, \b0 + vadd.vv \a1, \a1, \b1 + vadd.vv \a2, \a2, \b2 + vadd.vv \a3, \a3, \b3 + vxor.vv \d0, \d0, \a0 + vxor.vv \d1, \d1, \a1 + vxor.vv \d2, \d2, \a2 + vxor.vv \d3, \d3, \a3 + vror.vi \d0, \d0, 32 - 8 + vror.vi \d1, \d1, 32 - 8 + vror.vi \d2, \d2, 32 - 8 + vror.vi \d3, \d3, 32 - 8 + + // c += d; b ^= c; b = rol(b, 7); + vadd.vv \c0, \c0, \d0 + vadd.vv \c1, \c1, \d1 + vadd.vv \c2, \c2, \d2 + vadd.vv \c3, \c3, \d3 + vxor.vv \b0, \b0, \c0 + vxor.vv \b1, \b1, \c1 + vxor.vv \b2, \b2, \c2 + vxor.vv \b3, \b3, \c3 + vror.vi \b0, \b0, 32 - 7 + vror.vi \b1, \b1, 32 - 7 + vror.vi \b2, \b2, 32 - 7 + vror.vi \b3, \b3, 32 - 7 +.endm + +// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len, +// const u32 iv[4]); +// +// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE). +// The counter is treated as 32-bit, following the RFC7539 convention. 
+SYM_FUNC_START(chacha20_zvkb) + srli LEN, LEN, 6 // Bytes to blocks + + addi sp, sp, -96 + sd s0, 0(sp) + sd s1, 8(sp) + sd s2, 16(sp) + sd s3, 24(sp) + sd s4, 32(sp) + sd s5, 40(sp) + sd s6, 48(sp) + sd s7, 56(sp) + sd s8, 64(sp) + sd s9, 72(sp) + sd s10, 80(sp) + sd s11, 88(sp) + + li STRIDE, 64 + + // Set up the initial state matrix in scalar registers. + li CONSTS0, 0x61707865 // "expa" little endian + li CONSTS1, 0x3320646e // "nd 3" little endian + li CONSTS2, 0x79622d32 // "2-by" little endian + li CONSTS3, 0x6b206574 // "te k" little endian + lw KEY0, 0(KEYP) + lw KEY1, 4(KEYP) + lw KEY2, 8(KEYP) + lw KEY3, 12(KEYP) + lw KEY4, 16(KEYP) + lw KEY5, 20(KEYP) + lw KEY6, 24(KEYP) + lw KEY7, 28(KEYP) + lw COUNTER, 0(IVP) + lw NONCE0, 4(IVP) + lw NONCE1, 8(IVP) + lw NONCE2, 12(IVP) + +.Lblock_loop: + // Set vl to the number of blocks to process in this iteration. + vsetvli VL, LEN, e32, m1, ta, ma + + // Set up the initial state matrix for the next VL blocks in v0-v15. + // v{i} holds the i'th 32-bit word of the state matrix for all blocks. + // Note that only the counter word, at index 12, differs across blocks. + vmv.v.x v0, CONSTS0 + vmv.v.x v1, CONSTS1 + vmv.v.x v2, CONSTS2 + vmv.v.x v3, CONSTS3 + vmv.v.x v4, KEY0 + vmv.v.x v5, KEY1 + vmv.v.x v6, KEY2 + vmv.v.x v7, KEY3 + vmv.v.x v8, KEY4 + vmv.v.x v9, KEY5 + vmv.v.x v10, KEY6 + vmv.v.x v11, KEY7 + vid.v v12 + vadd.vx v12, v12, COUNTER + vmv.v.x v13, NONCE0 + vmv.v.x v14, NONCE1 + vmv.v.x v15, NONCE2 + + // Load the first half of the input data for each block into v16-v23. + // v{16+i} holds the i'th 32-bit word for all blocks. + vlsseg8e32.v v16, (INP), STRIDE + + li NROUNDS, 20 +.Lnext_doubleround: + addi NROUNDS, NROUNDS, -2 + // column round + chacha_round v0, v4, v8, v12, v1, v5, v9, v13, \ + v2, v6, v10, v14, v3, v7, v11, v15 + // diagonal round + chacha_round v0, v5, v10, v15, v1, v6, v11, v12, \ + v2, v7, v8, v13, v3, v4, v9, v14 + bnez NROUNDS, .Lnext_doubleround + + // Load the second half of the input data for each block into v24-v31. + // v{24+i} holds the {8+i}'th 32-bit word for all blocks. + addi TMP, INP, 32 + vlsseg8e32.v v24, (TMP), STRIDE + + // Finalize the first half of the keystream for each block. + vadd.vx v0, v0, CONSTS0 + vadd.vx v1, v1, CONSTS1 + vadd.vx v2, v2, CONSTS2 + vadd.vx v3, v3, CONSTS3 + vadd.vx v4, v4, KEY0 + vadd.vx v5, v5, KEY1 + vadd.vx v6, v6, KEY2 + vadd.vx v7, v7, KEY3 + + // Encrypt/decrypt the first half of the data for each block. + vxor.vv v16, v16, v0 + vxor.vv v17, v17, v1 + vxor.vv v18, v18, v2 + vxor.vv v19, v19, v3 + vxor.vv v20, v20, v4 + vxor.vv v21, v21, v5 + vxor.vv v22, v22, v6 + vxor.vv v23, v23, v7 + + // Store the first half of the output data for each block. + vssseg8e32.v v16, (OUTP), STRIDE + + // Finalize the second half of the keystream for each block. + vadd.vx v8, v8, KEY4 + vadd.vx v9, v9, KEY5 + vadd.vx v10, v10, KEY6 + vadd.vx v11, v11, KEY7 + vid.v v0 + vadd.vx v12, v12, COUNTER + vadd.vx v13, v13, NONCE0 + vadd.vx v14, v14, NONCE1 + vadd.vx v15, v15, NONCE2 + vadd.vv v12, v12, v0 + + // Encrypt/decrypt the second half of the data for each block. + vxor.vv v24, v24, v8 + vxor.vv v25, v25, v9 + vxor.vv v26, v26, v10 + vxor.vv v27, v27, v11 + vxor.vv v29, v29, v13 + vxor.vv v28, v28, v12 + vxor.vv v30, v30, v14 + vxor.vv v31, v31, v15 + + // Store the second half of the output data for each block. 
+ addi TMP, OUTP, 32 + vssseg8e32.v v24, (TMP), STRIDE + + // Update the counter, the remaining number of blocks, and the input and + // output pointers according to the number of blocks processed (VL). + add COUNTER, COUNTER, VL + sub LEN, LEN, VL + slli TMP, VL, 6 + add OUTP, OUTP, TMP + add INP, INP, TMP + bnez LEN, .Lblock_loop + + ld s0, 0(sp) + ld s1, 8(sp) + ld s2, 16(sp) + ld s3, 24(sp) + ld s4, 32(sp) + ld s5, 40(sp) + ld s6, 48(sp) + ld s7, 56(sp) + ld s8, 64(sp) + ld s9, 72(sp) + ld s10, 80(sp) + ld s11, 88(sp) + addi sp, sp, 96 + ret +SYM_FUNC_END(chacha20_zvkb) From 600a3853dfa007935220b3489e2be5ab8950b4b4 Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:17 -0800 Subject: [PATCH 012/496] crypto: riscv - add vector crypto accelerated GHASH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation of GHASH using the zvkg extension. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', reducing code duplication, and eliminating unnecessary endianness conversions. Co-developed-by: Christoph Müllner Signed-off-by: Christoph Müllner Co-developed-by: Heiko Stuebner Signed-off-by: Heiko Stuebner Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-7-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 10 ++ arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/ghash-riscv64-glue.c | 168 +++++++++++++++++++++++++ arch/riscv/crypto/ghash-riscv64-zvkg.S | 72 +++++++++++ 4 files changed, 253 insertions(+) create mode 100644 arch/riscv/crypto/ghash-riscv64-glue.c create mode 100644 arch/riscv/crypto/ghash-riscv64-zvkg.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index cb59e1d95495..676ba5af8f55 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -29,4 +29,14 @@ config CRYPTO_CHACHA_RISCV64 Architecture: riscv64 using: - Zvkb vector crypto extension +config CRYPTO_GHASH_RISCV64 + tristate "Hash functions: GHASH" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_GCM + help + GCM GHASH function (NIST SP 800-38D) + + Architecture: riscv64 using: + - Zvkg vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index ee994c8e6550..04c96b610748 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -6,3 +6,6 @@ aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \ obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o + +obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o +ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c new file mode 100644 index 000000000000..312e7891fd0a --- /dev/null +++ b/arch/riscv/crypto/ghash-riscv64-glue.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * GHASH using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data, + size_t len); + +struct riscv64_ghash_tfm_ctx { + be128 key; +}; + +struct riscv64_ghash_desc_ctx { + be128 accumulator; + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int riscv64_ghash_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) + return -EINVAL; + + memcpy(&tctx->key, key, GHASH_BLOCK_SIZE); + + return 0; +} + +static int riscv64_ghash_init(struct shash_desc *desc) +{ + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + *dctx = (struct riscv64_ghash_desc_ctx){}; + + return 0; +} + +static inline void +riscv64_ghash_blocks(const struct riscv64_ghash_tfm_ctx *tctx, + struct riscv64_ghash_desc_ctx *dctx, + const u8 *src, size_t srclen) +{ + /* The srclen is nonzero and a multiple of 16. */ + if (crypto_simd_usable()) { + kernel_vector_begin(); + ghash_zvkg(&dctx->accumulator, &tctx->key, src, srclen); + kernel_vector_end(); + } else { + do { + crypto_xor((u8 *)&dctx->accumulator, src, + GHASH_BLOCK_SIZE); + gf128mul_lle(&dctx->accumulator, &tctx->key); + src += GHASH_BLOCK_SIZE; + srclen -= GHASH_BLOCK_SIZE; + } while (srclen); + } +} + +static int riscv64_ghash_update(struct shash_desc *desc, const u8 *src, + unsigned int srclen) +{ + const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc); + unsigned int len; + + if (dctx->bytes) { + if (dctx->bytes + srclen < GHASH_BLOCK_SIZE) { + memcpy(dctx->buffer + dctx->bytes, src, srclen); + dctx->bytes += srclen; + return 0; + } + memcpy(dctx->buffer + dctx->bytes, src, + GHASH_BLOCK_SIZE - dctx->bytes); + riscv64_ghash_blocks(tctx, dctx, dctx->buffer, + GHASH_BLOCK_SIZE); + src += GHASH_BLOCK_SIZE - dctx->bytes; + srclen -= GHASH_BLOCK_SIZE - dctx->bytes; + dctx->bytes = 0; + } + + len = round_down(srclen, GHASH_BLOCK_SIZE); + if (len) { + riscv64_ghash_blocks(tctx, dctx, src, len); + src += len; + srclen -= len; + } + + if (srclen) { + memcpy(dctx->buffer, src, srclen); + dctx->bytes = srclen; + } + + return 0; +} + +static int riscv64_ghash_final(struct shash_desc *desc, u8 *out) +{ + const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc); + int i; + + if (dctx->bytes) { + for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++) + dctx->buffer[i] = 0; + + riscv64_ghash_blocks(tctx, dctx, dctx->buffer, + GHASH_BLOCK_SIZE); + } + + memcpy(out, &dctx->accumulator, GHASH_DIGEST_SIZE); + return 0; +} + +static struct shash_alg riscv64_ghash_alg = { + .init = riscv64_ghash_init, + .update = riscv64_ghash_update, + .final = riscv64_ghash_final, + .setkey = riscv64_ghash_setkey, + .descsize = sizeof(struct riscv64_ghash_desc_ctx), + .digestsize = GHASH_DIGEST_SIZE, + .base = { + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct riscv64_ghash_tfm_ctx), + .cra_priority = 300, + .cra_name = "ghash", + .cra_driver_name = "ghash-riscv64-zvkg", + .cra_module = THIS_MODULE, + }, +}; + +static int __init riscv64_ghash_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKG) && + riscv_vector_vlen() >= 128) + return crypto_register_shash(&riscv64_ghash_alg); + + return -ENODEV; +} + +static void __exit riscv64_ghash_mod_exit(void) +{ + 
crypto_unregister_shash(&riscv64_ghash_alg); +} + +module_init(riscv64_ghash_mod_init); +module_exit(riscv64_ghash_mod_exit); + +MODULE_DESCRIPTION("GHASH (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/riscv/crypto/ghash-riscv64-zvkg.S b/arch/riscv/crypto/ghash-riscv64-zvkg.S new file mode 100644 index 000000000000..f2b43fb4d434 --- /dev/null +++ b/arch/riscv/crypto/ghash-riscv64-zvkg.S @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector GCM/GMAC extension ('Zvkg') + +#include + +.text +.option arch, +zvkg + +#define ACCUMULATOR a0 +#define KEY a1 +#define DATA a2 +#define LEN a3 + +// void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data, +// size_t len); +// +// |len| must be nonzero and a multiple of 16 (GHASH_BLOCK_SIZE). +SYM_FUNC_START(ghash_zvkg) + vsetivli zero, 4, e32, m1, ta, ma + vle32.v v1, (ACCUMULATOR) + vle32.v v2, (KEY) +.Lnext_block: + vle32.v v3, (DATA) + vghsh.vv v1, v2, v3 + addi DATA, DATA, 16 + addi LEN, LEN, -16 + bnez LEN, .Lnext_block + + vse32.v v1, (ACCUMULATOR) + ret +SYM_FUNC_END(ghash_zvkg) From 8c8e40470ffeb7a279254c78c7779d7294a76ef1 Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:18 -0800 Subject: [PATCH 013/496] crypto: riscv - add vector crypto accelerated SHA-{256,224} Add an implementation of SHA-256 and SHA-224 using the Zvknha or Zvknhb extension. 
The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', and greatly reducing code duplication. Co-developed-by: Charalampos Mitrodimas Signed-off-by: Charalampos Mitrodimas Co-developed-by: Heiko Stuebner Signed-off-by: Heiko Stuebner Co-developed-by: Phoebe Chen Signed-off-by: Phoebe Chen Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-8-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 11 + arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/sha256-riscv64-glue.c | 137 +++++++++++ .../sha256-riscv64-zvknha_or_zvknhb-zvkb.S | 225 ++++++++++++++++++ 4 files changed, 376 insertions(+) create mode 100644 arch/riscv/crypto/sha256-riscv64-glue.c create mode 100644 arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 676ba5af8f55..687dbb71f7d5 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -39,4 +39,15 @@ config CRYPTO_GHASH_RISCV64 Architecture: riscv64 using: - Zvkg vector crypto extension +config CRYPTO_SHA256_RISCV64 + tristate "Hash functions: SHA-224 and SHA-256" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_SHA256 + help + SHA-224 and SHA-256 secure hash algorithm (FIPS 180) + + Architecture: riscv64 using: + - Zvknha or Zvknhb vector crypto extensions + - Zvkb vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index 04c96b610748..56064ea6e3ef 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -9,3 +9,6 @@ chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o + +obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o +sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o diff --git a/arch/riscv/crypto/sha256-riscv64-glue.c b/arch/riscv/crypto/sha256-riscv64-glue.c new file mode 100644 index 000000000000..71e051e40a64 --- /dev/null +++ b/arch/riscv/crypto/sha256-riscv64-glue.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-256 and SHA-224 using the RISC-V vector crypto extensions + * + * Copyright (C) 2022 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: the asm function only uses the 'state' field of struct sha256_state. + * It is assumed to be the first field. + */ +asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb( + struct sha256_state *state, const u8 *data, int num_blocks); + +static int riscv64_sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + /* + * Ensure struct sha256_state begins directly with the SHA-256 + * 256-bit internal state, as this is what the asm function expects. 
+ */ + BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + sha256_base_do_update(desc, data, len, + sha256_transform_zvknha_or_zvknhb_zvkb); + kernel_vector_end(); + } else { + crypto_sha256_update(desc, data, len); + } + return 0; +} + +static int riscv64_sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (crypto_simd_usable()) { + kernel_vector_begin(); + if (len) + sha256_base_do_update( + desc, data, len, + sha256_transform_zvknha_or_zvknhb_zvkb); + sha256_base_do_finalize( + desc, sha256_transform_zvknha_or_zvknhb_zvkb); + kernel_vector_end(); + + return sha256_base_finish(desc, out); + } + + return crypto_sha256_finup(desc, data, len, out); +} + +static int riscv64_sha256_final(struct shash_desc *desc, u8 *out) +{ + return riscv64_sha256_finup(desc, NULL, 0, out); +} + +static int riscv64_sha256_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha256_base_init(desc) ?: + riscv64_sha256_finup(desc, data, len, out); +} + +static struct shash_alg riscv64_sha256_algs[] = { + { + .init = sha256_base_init, + .update = riscv64_sha256_update, + .final = riscv64_sha256_final, + .finup = riscv64_sha256_finup, + .digest = riscv64_sha256_digest, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA256_DIGEST_SIZE, + .base = { + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_priority = 300, + .cra_name = "sha256", + .cra_driver_name = "sha256-riscv64-zvknha_or_zvknhb-zvkb", + .cra_module = THIS_MODULE, + }, + }, { + .init = sha224_base_init, + .update = riscv64_sha256_update, + .final = riscv64_sha256_final, + .finup = riscv64_sha256_finup, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA224_DIGEST_SIZE, + .base = { + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_priority = 300, + .cra_name = "sha224", + .cra_driver_name = "sha224-riscv64-zvknha_or_zvknhb-zvkb", + .cra_module = THIS_MODULE, + }, + }, +}; + +static int __init riscv64_sha256_mod_init(void) +{ + /* Both zvknha and zvknhb provide the SHA-256 instructions. */ + if ((riscv_isa_extension_available(NULL, ZVKNHA) || + riscv_isa_extension_available(NULL, ZVKNHB)) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + return crypto_register_shashes(riscv64_sha256_algs, + ARRAY_SIZE(riscv64_sha256_algs)); + + return -ENODEV; +} + +static void __exit riscv64_sha256_mod_exit(void) +{ + crypto_unregister_shashes(riscv64_sha256_algs, + ARRAY_SIZE(riscv64_sha256_algs)); +} + +module_init(riscv64_sha256_mod_init); +module_exit(riscv64_sha256_mod_exit); + +MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha224"); diff --git a/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S new file mode 100644 index 000000000000..8ebcc17de4dc --- /dev/null +++ b/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). 
You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvknha, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 + +#define MASK v0 +#define INDICES v1 +#define W0 v2 +#define W1 v3 +#define W2 v4 +#define W3 v5 +#define VTMP v6 +#define FEBA v7 +#define HGDC v8 +#define K0 v10 +#define K1 v11 +#define K2 v12 +#define K3 v13 +#define K4 v14 +#define K5 v15 +#define K6 v16 +#define K7 v17 +#define K8 v18 +#define K9 v19 +#define K10 v20 +#define K11 v21 +#define K12 v22 +#define K13 v23 +#define K14 v24 +#define K15 v25 +#define PREV_FEBA v26 +#define PREV_HGDC v27 + +// Do 4 rounds of SHA-256. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. 
+.macro sha256_4rounds last, k, w0, w1, w2, w3 + vadd.vv VTMP, \k, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha256_16rounds last, k0, k1, k2, k3 + sha256_4rounds \last, \k0, W0, W1, W2, W3 + sha256_4rounds \last, \k1, W1, W2, W3, W0 + sha256_4rounds \last, \k2, W2, W3, W0, W1 + sha256_4rounds \last, \k3, W3, W0, W1, W2 +.endm + +// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[8], const u8 *data, +// int num_blocks); +SYM_TYPED_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb) + + // Load the round constants into K0-K15. + vsetivli zero, 4, e32, m1, ta, ma + la t0, K256 + vle32.v K0, (t0) + addi t0, t0, 16 + vle32.v K1, (t0) + addi t0, t0, 16 + vle32.v K2, (t0) + addi t0, t0, 16 + vle32.v K3, (t0) + addi t0, t0, 16 + vle32.v K4, (t0) + addi t0, t0, 16 + vle32.v K5, (t0) + addi t0, t0, 16 + vle32.v K6, (t0) + addi t0, t0, 16 + vle32.v K7, (t0) + addi t0, t0, 16 + vle32.v K8, (t0) + addi t0, t0, 16 + vle32.v K9, (t0) + addi t0, t0, 16 + vle32.v K10, (t0) + addi t0, t0, 16 + vle32.v K11, (t0) + addi t0, t0, 16 + vle32.v K12, (t0) + addi t0, t0, 16 + vle32.v K13, (t0) + addi t0, t0, 16 + vle32.v K14, (t0) + addi t0, t0, 16 + vle32.v K15, (t0) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e32m1 and the index vtype + // is e8mf4. We use index-load with the i8 indices {20, 16, 4, 0}, + // loaded using the 32-bit little endian value 0x00041014. + li t0, 0x00041014 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 8 + vsetivli zero, 4, e32, m1, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 512-bit message block and endian-swap each 32-bit word. + vle32.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 16 + vle32.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 16 + vle32.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 16 + vle32.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 16 + + // Do the 64 rounds of SHA-256. + sha256_16rounds 0, K0, K1, K2, K3 + sha256_16rounds 0, K4, K5, K6, K7 + sha256_16rounds 0, K8, K9, K10, K11 + sha256_16rounds 1, K12, K13, K14, K15 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb) + +.section ".rodata" +.p2align 2 +.type K256, @object +K256: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.size K256, . - K256 From b3415925a08b13e468e8f3805bce86015475dd99 Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:19 -0800 Subject: [PATCH 014/496] crypto: riscv - add vector crypto accelerated SHA-{512,384} Add an implementation of SHA-512 and SHA-384 using the Zvknhb extension. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', and greatly reducing code duplication. Co-developed-by: Charalampos Mitrodimas Signed-off-by: Charalampos Mitrodimas Co-developed-by: Heiko Stuebner Signed-off-by: Heiko Stuebner Co-developed-by: Phoebe Chen Signed-off-by: Phoebe Chen Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-9-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 11 + arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/sha512-riscv64-glue.c | 133 ++++++++++++ .../riscv/crypto/sha512-riscv64-zvknhb-zvkb.S | 203 ++++++++++++++++++ 4 files changed, 350 insertions(+) create mode 100644 arch/riscv/crypto/sha512-riscv64-glue.c create mode 100644 arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 687dbb71f7d5..0fd0bf46c909 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -50,4 +50,15 @@ config CRYPTO_SHA256_RISCV64 - Zvknha or Zvknhb vector crypto extensions - Zvkb vector crypto extension +config CRYPTO_SHA512_RISCV64 + tristate "Hash functions: SHA-384 and SHA-512" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_SHA512 + help + SHA-384 and SHA-512 secure hash algorithm (FIPS 180) + + Architecture: riscv64 using: + - Zvknhb vector crypto extension + - Zvkb vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index 56064ea6e3ef..356a931db0eb 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -12,3 +12,6 @@ ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o + 
+obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o +sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c new file mode 100644 index 000000000000..43b56a08aeb5 --- /dev/null +++ b/arch/riscv/crypto/sha512-riscv64-glue.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-512 and SHA-384 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: the asm function only uses the 'state' field of struct sha512_state. + * It is assumed to be the first field. + */ +asmlinkage void sha512_transform_zvknhb_zvkb( + struct sha512_state *state, const u8 *data, int num_blocks); + +static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + /* + * Ensure struct sha512_state begins directly with the SHA-512 + * 512-bit internal state, as this is what the asm function expects. + */ + BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + sha512_base_do_update(desc, data, len, + sha512_transform_zvknhb_zvkb); + kernel_vector_end(); + } else { + crypto_sha512_update(desc, data, len); + } + return 0; +} + +static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (crypto_simd_usable()) { + kernel_vector_begin(); + if (len) + sha512_base_do_update(desc, data, len, + sha512_transform_zvknhb_zvkb); + sha512_base_do_finalize(desc, sha512_transform_zvknhb_zvkb); + kernel_vector_end(); + + return sha512_base_finish(desc, out); + } + + return crypto_sha512_finup(desc, data, len, out); +} + +static int riscv64_sha512_final(struct shash_desc *desc, u8 *out) +{ + return riscv64_sha512_finup(desc, NULL, 0, out); +} + +static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return sha512_base_init(desc) ?: + riscv64_sha512_finup(desc, data, len, out); +} + +static struct shash_alg riscv64_sha512_algs[] = { + { + .init = sha512_base_init, + .update = riscv64_sha512_update, + .final = riscv64_sha512_final, + .finup = riscv64_sha512_finup, + .digest = riscv64_sha512_digest, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA512_DIGEST_SIZE, + .base = { + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_priority = 300, + .cra_name = "sha512", + .cra_driver_name = "sha512-riscv64-zvknhb-zvkb", + .cra_module = THIS_MODULE, + }, + }, { + .init = sha384_base_init, + .update = riscv64_sha512_update, + .final = riscv64_sha512_final, + .finup = riscv64_sha512_finup, + .descsize = sizeof(struct sha512_state), + .digestsize = SHA384_DIGEST_SIZE, + .base = { + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_priority = 300, + .cra_name = "sha384", + .cra_driver_name = "sha384-riscv64-zvknhb-zvkb", + .cra_module = THIS_MODULE, + }, + }, +}; + +static int __init riscv64_sha512_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKNHB) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + return crypto_register_shashes(riscv64_sha512_algs, + ARRAY_SIZE(riscv64_sha512_algs)); + + return -ENODEV; +} + +static void __exit riscv64_sha512_mod_exit(void) +{ + crypto_unregister_shashes(riscv64_sha512_algs, + ARRAY_SIZE(riscv64_sha512_algs)); +} + 
+module_init(riscv64_sha512_mod_init); +module_exit(riscv64_sha512_mod_exit); + +MODULE_DESCRIPTION("SHA-512 (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("sha512"); +MODULE_ALIAS_CRYPTO("sha384"); diff --git a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S new file mode 100644 index 000000000000..3a9ae210f915 --- /dev/null +++ b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Phoebe Chen +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvknhb, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATEP_C a3 +#define K a4 + +#define MASK v0 +#define INDICES v1 +#define W0 v10 // LMUL=2 +#define W1 v12 // LMUL=2 +#define W2 v14 // LMUL=2 +#define W3 v16 // LMUL=2 +#define VTMP v20 // LMUL=2 +#define FEBA v22 // LMUL=2 +#define HGDC v24 // LMUL=2 +#define PREV_FEBA v26 // LMUL=2 +#define PREV_HGDC v28 // LMUL=2 + +// Do 4 rounds of SHA-512. w0 contains the current 4 message schedule words. +// +// If not all the message schedule words have been computed yet, then this also +// computes 4 more message schedule words. w1-w3 contain the next 3 groups of 4 +// message schedule words; this macro computes the group after w3 and writes it +// to w0. 
This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3, +// w0), so the caller must cycle through the registers accordingly. +.macro sha512_4rounds last, w0, w1, w2, w3 + vle64.v VTMP, (K) + addi K, K, 32 + vadd.vv VTMP, VTMP, \w0 + vsha2cl.vv HGDC, FEBA, VTMP + vsha2ch.vv FEBA, HGDC, VTMP +.if !\last + vmerge.vvm VTMP, \w2, \w1, MASK + vsha2ms.vv \w0, VTMP, \w3 +.endif +.endm + +.macro sha512_16rounds last + sha512_4rounds \last, W0, W1, W2, W3 + sha512_4rounds \last, W1, W2, W3, W0 + sha512_4rounds \last, W2, W3, W0, W1 + sha512_4rounds \last, W3, W0, W1, W2 +.endm + +// void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data, +// int num_blocks); +SYM_TYPED_FUNC_START(sha512_transform_zvknhb_zvkb) + + // Setup mask for the vmerge to replace the first word (idx==0) in + // message scheduling. There are 4 words, so an 8-bit mask suffices. + vsetivli zero, 1, e8, m1, ta, ma + vmv.v.i MASK, 0x01 + + // Load the state. The state is stored as {a,b,c,d,e,f,g,h}, but we + // need {f,e,b,a},{h,g,d,c}. The dst vtype is e64m2 and the index vtype + // is e8mf4. We use index-load with the i8 indices {40, 32, 8, 0}, + // loaded using the 32-bit little endian value 0x00082028. + li t0, 0x00082028 + vsetivli zero, 1, e32, m1, ta, ma + vmv.v.x INDICES, t0 + addi STATEP_C, STATEP, 16 + vsetivli zero, 4, e64, m2, ta, ma + vluxei8.v FEBA, (STATEP), INDICES + vluxei8.v HGDC, (STATEP_C), INDICES + +.Lnext_block: + la K, K512 + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_FEBA, FEBA + vmv.v.v PREV_HGDC, HGDC + + // Load the next 1024-bit message block and endian-swap each 64-bit word + vle64.v W0, (DATA) + vrev8.v W0, W0 + addi DATA, DATA, 32 + vle64.v W1, (DATA) + vrev8.v W1, W1 + addi DATA, DATA, 32 + vle64.v W2, (DATA) + vrev8.v W2, W2 + addi DATA, DATA, 32 + vle64.v W3, (DATA) + vrev8.v W3, W3 + addi DATA, DATA, 32 + + // Do the 80 rounds of SHA-512. + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 0 + sha512_16rounds 1 + + // Add the previous state. + vadd.vv FEBA, FEBA, PREV_FEBA + vadd.vv HGDC, HGDC, PREV_HGDC + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. 
+ vsuxei8.v FEBA, (STATEP), INDICES + vsuxei8.v HGDC, (STATEP_C), INDICES + ret +SYM_FUNC_END(sha512_transform_zvknhb_zvkb) + +.section ".rodata" +.p2align 3 +.type K512, @object +K512: + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +.size K512, . - K512 From 563a5255afa237c961c5c8c8c552425c519b88da Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:20 -0800 Subject: [PATCH 015/496] crypto: riscv - add vector crypto accelerated SM3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation of SM3 using the Zvksh extension. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', and greatly reducing code duplication. 
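For context, nothing calls sm3_transform_zvksh_zvkb() directly; users go through the generic shash API under the algorithm name "sm3", which resolves to the "sm3-riscv64-zvksh-zvkb" driver registered by this patch when it has the highest priority. A minimal sketch of such a caller follows; it is illustrative only and not part of this patch, and the function name is a placeholder.

#include <crypto/hash.h>
#include <linux/err.h>

/*
 * Illustrative only: one-shot SM3 digest of a linear buffer.
 * The SM3 digest size is 32 bytes.
 */
static int sm3_digest_example(const u8 *data, unsigned int len, u8 out[32])
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("sm3", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		err = crypto_shash_digest(desc, data, len, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}
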
Co-developed-by: Christoph Müllner Signed-off-by: Christoph Müllner Co-developed-by: Heiko Stuebner Signed-off-by: Heiko Stuebner Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-10-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 12 ++ arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/sm3-riscv64-glue.c | 112 +++++++++++++++++++ arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S | 123 +++++++++++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 arch/riscv/crypto/sm3-riscv64-glue.c create mode 100644 arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 0fd0bf46c909..179d09df8e0c 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -61,4 +61,16 @@ config CRYPTO_SHA512_RISCV64 - Zvknhb vector crypto extension - Zvkb vector crypto extension +config CRYPTO_SM3_RISCV64 + tristate "Hash functions: SM3 (ShangMi 3)" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_HASH + select CRYPTO_SM3 + help + SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012) + + Architecture: riscv64 using: + - Zvksh vector crypto extension + - Zvkb vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index 356a931db0eb..da48977e96f5 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -15,3 +15,6 @@ sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o + +obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o +sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o diff --git a/arch/riscv/crypto/sm3-riscv64-glue.c b/arch/riscv/crypto/sm3-riscv64-glue.c new file mode 100644 index 000000000000..e1737a970c7c --- /dev/null +++ b/arch/riscv/crypto/sm3-riscv64-glue.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SM3 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. + * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Note: the asm function only uses the 'state' field of struct sm3_state. + * It is assumed to be the first field. + */ +asmlinkage void sm3_transform_zvksh_zvkb( + struct sm3_state *state, const u8 *data, int num_blocks); + +static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + /* + * Ensure struct sm3_state begins directly with the SM3 + * 256-bit internal state, as this is what the asm function expects. 
+ */ + BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + sm3_base_do_update(desc, data, len, sm3_transform_zvksh_zvkb); + kernel_vector_end(); + } else { + sm3_update(shash_desc_ctx(desc), data, len); + } + return 0; +} + +static int riscv64_sm3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sm3_state *ctx; + + if (crypto_simd_usable()) { + kernel_vector_begin(); + if (len) + sm3_base_do_update(desc, data, len, + sm3_transform_zvksh_zvkb); + sm3_base_do_finalize(desc, sm3_transform_zvksh_zvkb); + kernel_vector_end(); + + return sm3_base_finish(desc, out); + } + + ctx = shash_desc_ctx(desc); + if (len) + sm3_update(ctx, data, len); + sm3_final(ctx, out); + + return 0; +} + +static int riscv64_sm3_final(struct shash_desc *desc, u8 *out) +{ + return riscv64_sm3_finup(desc, NULL, 0, out); +} + +static struct shash_alg riscv64_sm3_alg = { + .init = sm3_base_init, + .update = riscv64_sm3_update, + .final = riscv64_sm3_final, + .finup = riscv64_sm3_finup, + .descsize = sizeof(struct sm3_state), + .digestsize = SM3_DIGEST_SIZE, + .base = { + .cra_blocksize = SM3_BLOCK_SIZE, + .cra_priority = 300, + .cra_name = "sm3", + .cra_driver_name = "sm3-riscv64-zvksh-zvkb", + .cra_module = THIS_MODULE, + }, +}; + +static int __init riscv64_sm3_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKSH) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + return crypto_register_shash(&riscv64_sm3_alg); + + return -ENODEV; +} + +static void __exit riscv64_sm3_mod_exit(void) +{ + crypto_unregister_shash(&riscv64_sm3_alg); +} + +module_init(riscv64_sm3_mod_init); +module_exit(riscv64_sm3_mod_exit); + +MODULE_DESCRIPTION("SM3 (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("sm3"); diff --git a/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S b/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S new file mode 100644 index 000000000000..a2b65d961c04 --- /dev/null +++ b/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SM3 Secure Hash extension ('Zvksh') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvksh, +zvkb + +#define STATEP a0 +#define DATA a1 +#define NUM_BLOCKS a2 + +#define STATE v0 // LMUL=2 +#define PREV_STATE v2 // LMUL=2 +#define W0 v4 // LMUL=2 +#define W1 v6 // LMUL=2 +#define VTMP v8 // LMUL=2 + +.macro sm3_8rounds i, w0, w1 + // Do 4 rounds using W_{0+i}..W_{7+i}. + vsm3c.vi STATE, \w0, \i + 0 + vslidedown.vi VTMP, \w0, 2 + vsm3c.vi STATE, VTMP, \i + 1 + + // Compute W_{4+i}..W_{11+i}. + vslidedown.vi VTMP, \w0, 4 + vslideup.vi VTMP, \w1, 4 + + // Do 4 rounds using W_{4+i}..W_{11+i}. + vsm3c.vi STATE, VTMP, \i + 2 + vslidedown.vi VTMP, VTMP, 2 + vsm3c.vi STATE, VTMP, \i + 3 + +.if \i < 28 + // Compute W_{16+i}..W_{23+i}. + vsm3me.vv \w0, \w1, \w0 +.endif + // For the next 8 rounds, w0 and w1 are swapped. +.endm + +// void sm3_transform_zvksh_zvkb(u32 state[8], const u8 *data, int num_blocks); +SYM_TYPED_FUNC_START(sm3_transform_zvksh_zvkb) + + // Load the state and endian-swap each 32-bit word. + vsetivli zero, 8, e32, m2, ta, ma + vle32.v STATE, (STATEP) + vrev8.v STATE, STATE + +.Lnext_block: + addi NUM_BLOCKS, NUM_BLOCKS, -1 + + // Save the previous state, as it's needed later. + vmv.v.v PREV_STATE, STATE + + // Load the next 512-bit message block into W0-W1. + vle32.v W0, (DATA) + addi DATA, DATA, 32 + vle32.v W1, (DATA) + addi DATA, DATA, 32 + + // Do the 64 rounds of SM3. + sm3_8rounds 0, W0, W1 + sm3_8rounds 4, W1, W0 + sm3_8rounds 8, W0, W1 + sm3_8rounds 12, W1, W0 + sm3_8rounds 16, W0, W1 + sm3_8rounds 20, W1, W0 + sm3_8rounds 24, W0, W1 + sm3_8rounds 28, W1, W0 + + // XOR in the previous state. + vxor.vv STATE, STATE, PREV_STATE + + // Repeat if more blocks remain. + bnez NUM_BLOCKS, .Lnext_block + + // Store the new state and return. + vrev8.v STATE, STATE + vse32.v STATE, (STATEP) + ret +SYM_FUNC_END(sm3_transform_zvksh_zvkb) From b8d06352bbf397608a262c9d5f2b03ce32a3544a Mon Sep 17 00:00:00 2001 From: Jerry Shih Date: Sun, 21 Jan 2024 16:19:21 -0800 Subject: [PATCH 016/496] crypto: riscv - add vector crypto accelerated SM4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an implementation of SM4 using the Zvksed extension. The assembly code is derived from OpenSSL code (openssl/openssl#21923) that was dual-licensed so that it could be reused in the kernel. Nevertheless, the assembly has been significantly reworked for integration with the kernel, for example by using a regular .S file instead of the so-called perlasm, using the assembler instead of bare '.inst', and greatly reducing code duplication. 
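Illustrative only, not part of the patch: when the vector unit cannot be used, the glue code below falls back to the generic SM4 library, and those library helpers can also be called directly. sm4_expandkey() and sm4_crypt_block() are the existing <crypto/sm4.h> routines that this patch already relies on.

#include <crypto/sm4.h>

static int sm4_one_block_example(const u8 key[SM4_KEY_SIZE],
				 const u8 in[SM4_BLOCK_SIZE],
				 u8 out[SM4_BLOCK_SIZE])
{
	struct sm4_ctx ctx;
	int err;

	err = sm4_expandkey(&ctx, key, SM4_KEY_SIZE);	/* derives the 32 round keys */
	if (err)
		return err;

	sm4_crypt_block(ctx.rkey_enc, out, in);		/* encrypts one 16-byte block */
	return 0;
}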
Co-developed-by: Christoph Müllner Signed-off-by: Christoph Müllner Co-developed-by: Heiko Stuebner Signed-off-by: Heiko Stuebner Signed-off-by: Jerry Shih Co-developed-by: Eric Biggers Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240122002024.27477-11-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 17 +++ arch/riscv/crypto/Makefile | 3 + arch/riscv/crypto/sm4-riscv64-glue.c | 107 ++++++++++++++++++ arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S | 117 ++++++++++++++++++++ 4 files changed, 244 insertions(+) create mode 100644 arch/riscv/crypto/sm4-riscv64-glue.c create mode 100644 arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 179d09df8e0c..2ad44e1d464a 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -73,4 +73,21 @@ config CRYPTO_SM3_RISCV64 - Zvksh vector crypto extension - Zvkb vector crypto extension +config CRYPTO_SM4_RISCV64 + tristate "Ciphers: SM4 (ShangMi 4)" + depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO + select CRYPTO_ALGAPI + select CRYPTO_SM4 + help + SM4 block cipher algorithm (OSCCA GB/T 32907-2016, + ISO/IEC 18033-3:2010/Amd 1:2021) + + SM4 (GBT.32907-2016) is a cryptographic standard issued by the + Organization of State Commercial Administration of China (OSCCA) + as an authorized cryptographic algorithm for use within China. + + Architecture: riscv64 using: + - Zvksed vector crypto extension + - Zvkb vector crypto extension + endmenu diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile index da48977e96f5..247c7bc7288c 100644 --- a/arch/riscv/crypto/Makefile +++ b/arch/riscv/crypto/Makefile @@ -18,3 +18,6 @@ sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o + +obj-$(CONFIG_CRYPTO_SM4_RISCV64) += sm4-riscv64.o +sm4-riscv64-y := sm4-riscv64-glue.o sm4-riscv64-zvksed-zvkb.o diff --git a/arch/riscv/crypto/sm4-riscv64-glue.c b/arch/riscv/crypto/sm4-riscv64-glue.c new file mode 100644 index 000000000000..47fb84ebe577 --- /dev/null +++ b/arch/riscv/crypto/sm4-riscv64-glue.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * SM4 using the RISC-V vector crypto extensions + * + * Copyright (C) 2023 VRULL GmbH + * Author: Heiko Stuebner + * + * Copyright (C) 2023 SiFive, Inc. 
+ * Author: Jerry Shih + */ + +#include +#include +#include +#include +#include +#include +#include + +asmlinkage void sm4_expandkey_zvksed_zvkb(const u8 user_key[SM4_KEY_SIZE], + u32 rkey_enc[SM4_RKEY_WORDS], + u32 rkey_dec[SM4_RKEY_WORDS]); +asmlinkage void sm4_crypt_zvksed_zvkb(const u32 rkey[SM4_RKEY_WORDS], + const u8 in[SM4_BLOCK_SIZE], + u8 out[SM4_BLOCK_SIZE]); + +static int riscv64_sm4_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (crypto_simd_usable()) { + if (keylen != SM4_KEY_SIZE) + return -EINVAL; + kernel_vector_begin(); + sm4_expandkey_zvksed_zvkb(key, ctx->rkey_enc, ctx->rkey_dec); + kernel_vector_end(); + return 0; + } + return sm4_expandkey(ctx, key, keylen); +} + +static void riscv64_sm4_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + sm4_crypt_zvksed_zvkb(ctx->rkey_enc, src, dst); + kernel_vector_end(); + } else { + sm4_crypt_block(ctx->rkey_enc, dst, src); + } +} + +static void riscv64_sm4_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); + + if (crypto_simd_usable()) { + kernel_vector_begin(); + sm4_crypt_zvksed_zvkb(ctx->rkey_dec, src, dst); + kernel_vector_end(); + } else { + sm4_crypt_block(ctx->rkey_dec, dst, src); + } +} + +static struct crypto_alg riscv64_sm4_alg = { + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = SM4_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct sm4_ctx), + .cra_priority = 300, + .cra_name = "sm4", + .cra_driver_name = "sm4-riscv64-zvksed-zvkb", + .cra_cipher = { + .cia_min_keysize = SM4_KEY_SIZE, + .cia_max_keysize = SM4_KEY_SIZE, + .cia_setkey = riscv64_sm4_setkey, + .cia_encrypt = riscv64_sm4_encrypt, + .cia_decrypt = riscv64_sm4_decrypt, + }, + .cra_module = THIS_MODULE, +}; + +static int __init riscv64_sm4_mod_init(void) +{ + if (riscv_isa_extension_available(NULL, ZVKSED) && + riscv_isa_extension_available(NULL, ZVKB) && + riscv_vector_vlen() >= 128) + return crypto_register_alg(&riscv64_sm4_alg); + + return -ENODEV; +} + +static void __exit riscv64_sm4_mod_exit(void) +{ + crypto_unregister_alg(&riscv64_sm4_alg); +} + +module_init(riscv64_sm4_mod_init); +module_exit(riscv64_sm4_mod_exit); + +MODULE_DESCRIPTION("SM4 (RISC-V accelerated)"); +MODULE_AUTHOR("Heiko Stuebner "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CRYPTO("sm4"); diff --git a/arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S b/arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S new file mode 100644 index 000000000000..fae62179a4a3 --- /dev/null +++ b/arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ +// +// This file is dual-licensed, meaning that you can use it under your +// choice of either of the following two licenses: +// +// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License 2.0 (the "License"). You can obtain +// a copy in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html +// +// or +// +// Copyright (c) 2023, Christoph Müllner +// Copyright (c) 2023, Jerry Shih +// Copyright 2024 Google LLC +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// The generated code of this file depends on the following RISC-V extensions: +// - RV64I +// - RISC-V Vector ('V') with VLEN >= 128 +// - RISC-V Vector SM4 Block Cipher extension ('Zvksed') +// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb') + +#include + +.text +.option arch, +zvksed, +zvkb + +// void sm4_expandkey_zksed_zvkb(const u8 user_key[16], u32 rkey_enc[32], +// u32 rkey_dec[32]); +SYM_FUNC_START(sm4_expandkey_zvksed_zvkb) + vsetivli zero, 4, e32, m1, ta, ma + + // Load the user key. + vle32.v v1, (a0) + vrev8.v v1, v1 + + // XOR the user key with the family key. + la t0, FAMILY_KEY + vle32.v v2, (t0) + vxor.vv v1, v1, v2 + + // Compute the round keys. Store them in forwards order in rkey_enc + // and in reverse order in rkey_dec. + addi a2, a2, 31*4 + li t0, -4 + .set i, 0 +.rept 8 + vsm4k.vi v1, v1, i + vse32.v v1, (a1) // Store to rkey_enc. + vsse32.v v1, (a2), t0 // Store to rkey_dec. +.if i < 7 + addi a1, a1, 16 + addi a2, a2, -16 +.endif + .set i, i + 1 +.endr + + ret +SYM_FUNC_END(sm4_expandkey_zvksed_zvkb) + +// void sm4_crypt_zvksed_zvkb(const u32 rkey[32], const u8 in[16], u8 out[16]); +SYM_FUNC_START(sm4_crypt_zvksed_zvkb) + vsetivli zero, 4, e32, m1, ta, ma + + // Load the input data. + vle32.v v1, (a1) + vrev8.v v1, v1 + + // Do the 32 rounds of SM4, 4 at a time. + .set i, 0 +.rept 8 + vle32.v v2, (a0) + vsm4r.vs v1, v2 +.if i < 7 + addi a0, a0, 16 +.endif + .set i, i + 1 +.endr + + // Store the output data (in reverse element order). + vrev8.v v1, v1 + li t0, -4 + addi a2, a2, 12 + vsse32.v v1, (a2), t0 + + ret +SYM_FUNC_END(sm4_crypt_zvksed_zvkb) + +.section ".rodata" +.p2align 2 +.type FAMILY_KEY, @object +FAMILY_KEY: + .word 0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC +.size FAMILY_KEY, . - FAMILY_KEY From d38e2e7bcb3e27d1d1433e5f7480f2a1ff6bcd98 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 5 Sep 2023 15:09:45 +0800 Subject: [PATCH 017/496] clocksource: extend the max_delta_ns of timer-riscv and timer-clint to ULONG_MAX When registering the riscv-timer or clint-timer as a clock_event device, the driver needs to specify the value of max_delta_ticks. This value directly influences the max_delta_ns, which represents the maximum time interval for configuring subsequent clock events. Currently, both riscv-timer and clint-timer are set with a max_delta_ticks value of 0x7fff_ffff. 
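As a back-of-the-envelope check (not part of the patch): at a 1GHz timebase, 0x7fffffff ticks is about 2.1 seconds (2147483647 / 10^9 ticks per second), whereas ULONG_MAX (2^64 - 1) ticks at the same rate is roughly 585 years, so the larger limit is effectively unbounded in practice.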
When the timer operates at a high frequency, this values limists the system to sleep only for a short time. For the 1GHz case, the sleep cannot exceed two seconds. To address this limitation, refer to other timer implementations to extend it to 2^(bit-width of the timer) - 1. Because the bit-width of $mtimecmp is 64bit, this value becomes ULONG_MAX (0xffff_ffff_ffff_ffff). Signed-off-by: Vincent Chen Link: https://lore.kernel.org/r/20230905070945.404653-1-vincent.chen@sifive.com Signed-off-by: Palmer Dabbelt --- drivers/clocksource/timer-clint.c | 2 +- drivers/clocksource/timer-riscv.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-clint.c b/drivers/clocksource/timer-clint.c index 9a55e733ae99..09fd292eb83d 100644 --- a/drivers/clocksource/timer-clint.c +++ b/drivers/clocksource/timer-clint.c @@ -131,7 +131,7 @@ static int clint_timer_starting_cpu(unsigned int cpu) struct clock_event_device *ce = per_cpu_ptr(&clint_clock_event, cpu); ce->cpumask = cpumask_of(cpu); - clockevents_config_and_register(ce, clint_timer_freq, 100, 0x7fffffff); + clockevents_config_and_register(ce, clint_timer_freq, 100, ULONG_MAX); enable_percpu_irq(clint_timer_irq, irq_get_trigger_type(clint_timer_irq)); diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index e66dcbd66566..87a7ac0ce6ce 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -114,7 +114,7 @@ static int riscv_timer_starting_cpu(unsigned int cpu) ce->features |= CLOCK_EVT_FEAT_C3STOP; if (static_branch_likely(&riscv_sstc_available)) ce->rating = 450; - clockevents_config_and_register(ce, riscv_timebase, 100, 0x7fffffff); + clockevents_config_and_register(ce, riscv_timebase, 100, ULONG_MAX); enable_percpu_irq(riscv_clock_event_irq, irq_get_trigger_type(riscv_clock_event_irq)); From e2d6b54b935a98c7d83f7e27597738be903d6703 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Wed, 2 Aug 2023 12:12:53 +0100 Subject: [PATCH 018/496] Revert "RISC-V: mark hibernation as nonportable" Revert commit ed309ce52218 ("RISC-V: mark hibernation as nonportable") as it appears the broken versions of OpenSBI have not made it to production on any systems that support hibernation. Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230802-chef-throng-d9de8b672a49@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 61826240926d..a82bc8bed503 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -1011,11 +1011,8 @@ menu "Power management options" source "kernel/power/Kconfig" -# Hibernation is only possible on systems where the SBI implementation has -# marked its reserved memory as not accessible from, or does not run -# from the same memory as, Linux config ARCH_HIBERNATION_POSSIBLE - def_bool NONPORTABLE + def_bool y config ARCH_HIBERNATION_HEADER def_bool HIBERNATION From 71a5849aedaa9ea028fc51ee74576cad61954743 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 29 Sep 2023 21:11:57 +0000 Subject: [PATCH 019/496] mm: Change mmap_rnd_bits_max to __ro_after_init Allow mmap_rnd_bits_max to be updated on architectures that determine virtual address space size at runtime instead of relying on Kconfig options by changing it from const to __ro_after_init. 
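Illustrative only, not part of the patch (the variable and function below are made up): __ro_after_init data stays writable while init code runs and becomes read-only once init finishes, which is what lets an architecture adjust the limit during early boot yet keep it immutable afterwards.

static int example_limit __ro_after_init = 24;

static int __init example_limit_init(void)
{
	example_limit = 33;	/* allowed: the init phase is still running */
	return 0;
}
early_initcall(example_limit_init);

/* After mark_rodata_ro(), any further write to example_limit would fault. */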
Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230929211155.3910949-5-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- include/linux/mm.h | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f5a97dec5169..2488c0c5a288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -86,7 +86,7 @@ extern int sysctl_legacy_va_layout; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; -extern const int mmap_rnd_bits_max; +extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS diff --git a/mm/mmap.c b/mm/mmap.c index b78e83d351d2..8f47011de22e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -64,7 +64,7 @@ #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; -const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS From 7df1ff5a5cd615815bc6fb4a3a981e9746935e59 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 29 Sep 2023 21:11:58 +0000 Subject: [PATCH 020/496] riscv: mm: Update mmap_rnd_bits_max ARCH_MMAP_RND_BITS_MAX is based on Sv39, which leaves a few potential bits of mmap randomness on the table if we end up enabling 4/5-level paging. Update mmap_rnd_bits_max to take the final address space size into account. This increases mmap_rnd_bits_max from 24 to 33 with Sv48/57. Signed-off-by: Sami Tolvanen Reviewed-by: Kees Cook Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Link: https://lore.kernel.org/r/20230929211155.3910949-6-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 32cad6a65ccd..c55915554836 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -767,6 +767,11 @@ static int __init print_no5lvl(char *p) } early_param("no5lvl", print_no5lvl); +static void __init set_mmap_rnd_bits_max(void) +{ + mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3; +} + /* * There is a simple way to determine if 4-level is supported by the * underlying hardware: establish 1:1 mapping in 4-level page table mode @@ -1081,6 +1086,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) set_satp_mode(dtb_pa); + set_mmap_rnd_bits_max(); #endif /* From 40d1bb92a49313b3e0dc5513fdd2578362c40312 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 20 Dec 2023 01:50:44 +0800 Subject: [PATCH 021/496] riscv: tlb: convert __p*d_free_tlb() to inline functions This is to prepare for enabling MMU_GATHER_RCU_TABLE_FREE. No functionality changes. 
Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231219175046.2496-3-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgalloc.h | 54 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index c80bb9990d32..3c5e3bd15f46 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -95,13 +95,16 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) __pud_free(mm, pud); } -#define __pud_free_tlb(tlb, pud, addr) \ -do { \ - if (pgtable_l4_enabled) { \ - pagetable_pud_dtor(virt_to_ptdesc(pud)); \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pud)); \ - } \ -} while (0) +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + if (pgtable_l4_enabled) { + struct ptdesc *ptdesc = virt_to_ptdesc(pud); + + pagetable_pud_dtor(ptdesc); + tlb_remove_page_ptdesc(tlb, ptdesc); + } +} #define p4d_alloc_one p4d_alloc_one static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -130,11 +133,12 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) __p4d_free(mm, p4d); } -#define __p4d_free_tlb(tlb, p4d, addr) \ -do { \ - if (pgtable_l5_enabled) \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(p4d)); \ -} while (0) +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + if (pgtable_l5_enabled) + tlb_remove_page_ptdesc(tlb, virt_to_ptdesc(p4d)); +} #endif /* __PAGETABLE_PMD_FOLDED */ static inline void sync_kernel_mappings(pgd_t *pgd) @@ -159,19 +163,25 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #ifndef __PAGETABLE_PMD_FOLDED -#define __pmd_free_tlb(tlb, pmd, addr) \ -do { \ - pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ - tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ -} while (0) +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); + + pagetable_pmd_dtor(ptdesc); + tlb_remove_page_ptdesc(tlb, ptdesc); +} #endif /* __PAGETABLE_PMD_FOLDED */ -#define __pte_free_tlb(tlb, pte, buf) \ -do { \ - pagetable_pte_dtor(page_ptdesc(pte)); \ - tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ -} while (0) +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, + unsigned long addr) +{ + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); + tlb_remove_page_ptdesc(tlb, ptdesc); +} #endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_PGALLOC_H */ From 69be3fb111e73bd025ce6d2322371da5aa497c70 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 20 Dec 2023 01:50:45 +0800 Subject: [PATCH 022/496] riscv: enable MMU_GATHER_RCU_TABLE_FREE for SMP && MMU In order to implement fast gup we need to ensure that the page table walker is protected from page table pages being freed from under it. riscv situation is more complicated than other architectures: some riscv platforms may use IPI to perform TLB shootdown, for example, those platforms which support AIA, usually the riscv_ipi_for_rfence is true on these platforms; some riscv platforms may rely on the SBI to perform TLB shootdown, usually the riscv_ipi_for_rfence is false on these platforms. To keep software pagetable walkers safe in this case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). 
See the comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h for more details. This patch enables MMU_GATHER_RCU_TABLE_FREE, then use *tlb_remove_page_ptdesc() for those platforms which use IPI to perform TLB shootdown; *tlb_remove_ptdesc() for those platforms which use SBI to perform TLB shootdown; Both case mean that disabling interrupts will block the free and protect the fast gup page walker. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231219175046.2496-4-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgalloc.h | 23 ++++++++++++++++++----- arch/riscv/include/asm/tlb.h | 18 ++++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 95a2a06acc6a..a0d9b8f5371d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -147,6 +147,7 @@ config RISCV select IRQ_FORCED_THREADING select KASAN_VMALLOC if KASAN select LOCK_MM_AND_FIND_VMA + select MMU_GATHER_RCU_TABLE_FREE if SMP && MMU select MODULES_USE_ELF_RELA if MODULES select MODULE_SECTIONS if MODULES select OF diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 3c5e3bd15f46..deaf971253a2 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -102,7 +102,10 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, struct ptdesc *ptdesc = virt_to_ptdesc(pud); pagetable_pud_dtor(ptdesc); - tlb_remove_page_ptdesc(tlb, ptdesc); + if (riscv_use_ipi_for_rfence()) + tlb_remove_page_ptdesc(tlb, ptdesc); + else + tlb_remove_ptdesc(tlb, ptdesc); } } @@ -136,8 +139,12 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr) { - if (pgtable_l5_enabled) - tlb_remove_page_ptdesc(tlb, virt_to_ptdesc(p4d)); + if (pgtable_l5_enabled) { + if (riscv_use_ipi_for_rfence()) + tlb_remove_page_ptdesc(tlb, virt_to_ptdesc(p4d)); + else + tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); + } } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -169,7 +176,10 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, struct ptdesc *ptdesc = virt_to_ptdesc(pmd); pagetable_pmd_dtor(ptdesc); - tlb_remove_page_ptdesc(tlb, ptdesc); + if (riscv_use_ipi_for_rfence()) + tlb_remove_page_ptdesc(tlb, ptdesc); + else + tlb_remove_ptdesc(tlb, ptdesc); } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -180,7 +190,10 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, struct ptdesc *ptdesc = page_ptdesc(pte); pagetable_pte_dtor(ptdesc); - tlb_remove_page_ptdesc(tlb, ptdesc); + if (riscv_use_ipi_for_rfence()) + tlb_remove_page_ptdesc(tlb, ptdesc); + else + tlb_remove_ptdesc(tlb, ptdesc); } #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 1eb5682b2af6..a0b8b853503f 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -10,6 +10,24 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); +#ifdef CONFIG_MMU +#include + +/* + * While riscv platforms with riscv_ipi_for_rfence as true require an IPI to + * perform TLB shootdown, some platforms with riscv_ipi_for_rfence as false use + * SBI to perform TLB shootdown. To keep software pagetable walkers safe in this + * case we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). 
See the + * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h + * for more details. + */ +static inline void __tlb_remove_table(void *table) +{ + free_page_and_swap_cache(table); +} + +#endif /* CONFIG_MMU */ + #define tlb_flush tlb_flush #include From 3f910b7a522e064d7261f31a00d9c9dca31d902a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 20 Dec 2023 01:50:46 +0800 Subject: [PATCH 023/496] riscv: enable HAVE_FAST_GUP if MMU Activate the fast gup for riscv mmu platforms. Here are some GUP_FAST_BENCHMARK performance numbers: Before the patch: GUP_FAST_BENCHMARK: Time: get:53203 put:5085 us After the patch: GUP_FAST_BENCHMARK: Time: get:17711 put:5060 us The get time is reduced by 66.7%! IOW, 3x get speed! Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231219175046.2496-5-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a0d9b8f5371d..9e1e7c74b1f9 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -119,6 +119,7 @@ config RISCV select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION select HAVE_EBPF_JIT if MMU + select HAVE_FAST_GUP if MMU select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_ERROR_INJECTION select HAVE_GCC_PLUGINS diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 294044429e8e..a76e86e1cb4c 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -673,6 +673,12 @@ static inline int pmd_write(pmd_t pmd) return pte_write(pmd_pte(pmd)); } +#define pud_write pud_write +static inline int pud_write(pud_t pud) +{ + return pte_write(pud_pte(pud)); +} + static inline int pmd_dirty(pmd_t pmd) { return pte_dirty(pmd_pte(pmd)); From 5014396af9bbac0f28d9afee7eae405206d01ee7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Wed, 4 Oct 2023 15:10:09 +0200 Subject: [PATCH 024/496] riscv: blacklist assembly symbols for kprobe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding kprobes on some assembly functions (mainly exception handling) will result in crashes (either recursive trap or panic). To avoid such errors, add an ASM_NOKPROBE() macro, which allows adding specific symbols into the __kprobe_blacklist section, and use it to blacklist the following symbols that proved to be problematic: - handle_exception() - ret_from_exception() - handle_kernel_stack_overflow() Signed-off-by: Clément Léger Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231004131009.409193-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/asm.h | 10 ++++++++++ arch/riscv/kernel/entry.S | 3 +++ 2 files changed, 13 insertions(+) diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index b0487b39e674..776354895b81 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -183,6 +183,16 @@ REG_L x31, PT_T6(sp) .endm +/* Annotate a function as being unsuitable for kprobes.
*/ +#ifdef CONFIG_KPROBES +#define ASM_NOKPROBE(name) \ + .pushsection "_kprobe_blacklist", "aw"; \ + RISCV_PTR name; \ + .popsection +#else +#define ASM_NOKPROBE(name) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_RISCV_ASM_H */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 9d1a305d5508..68a24cf9481a 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -111,6 +111,7 @@ SYM_CODE_START(handle_exception) 1: tail do_trap_unknown SYM_CODE_END(handle_exception) +ASM_NOKPROBE(handle_exception) /* * The ret_from_exception must be called with interrupt disabled. Here is the @@ -184,6 +185,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception) sret #endif SYM_CODE_END(ret_from_exception) +ASM_NOKPROBE(ret_from_exception) #ifdef CONFIG_VMAP_STACK SYM_CODE_START_LOCAL(handle_kernel_stack_overflow) @@ -219,6 +221,7 @@ SYM_CODE_START_LOCAL(handle_kernel_stack_overflow) move a0, sp tail handle_bad_stack SYM_CODE_END(handle_kernel_stack_overflow) +ASM_NOKPROBE(handle_kernel_stack_overflow) #endif SYM_CODE_START(ret_from_fork) From dded618c07fd786f781c3f3529d8253e31e2c7d6 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 31 Oct 2023 08:40:18 +0800 Subject: [PATCH 025/496] RISC-V: Remove duplicated include in smpboot.c ./arch/riscv/kernel/smpboot.c: asm/cpufeature.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7086 Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20231031004018.45074-1-yang.lee@linux.alibaba.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 519b6bd946e5..cfbe4b840d42 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -28,7 +28,6 @@ #include #include -#include #include #include #include From 05d450aabd7386246c5aafc341fe9febe5855967 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Thu, 9 Nov 2023 21:37:51 +0800 Subject: [PATCH 026/496] riscv: Support RANDOMIZE_KSTACK_OFFSET Inspired by arm64's implementation -- commit 70918779aec9 ("arm64: entry: Enable random_kstack_offset support") Add support for kernel stack offset randomization while handling syscalls; by default, the offset is limited by KSTACK_OFFSET_MAX() (i.e. 10 bits). In order to avoid triggering stack canaries (due to __builtin_alloca) and slowing down the entry path, use the __no_stack_protector attribute to disable the stack protector for do_trap_ecall_u() at the function level.
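Illustrative only -- a simplified sketch of the idea behind the generic helpers, not the actual include/linux/randomize_kstack.h code: before the syscall handler runs, the stack pointer is moved down by a small per-syscall random amount, and the 16-byte stack alignment then throws away the low bits, leaving about 6 usable bits of entropy (SP[9:4]) as noted in the comment added below.

static __always_inline void example_offset_kernel_stack(u32 entropy)
{
	/* Keep at most 10 bits of offset, as KSTACK_OFFSET_MAX() does. */
	u8 *gap = __builtin_alloca(entropy & 0x3ff);

	/* Prevent the compiler from optimising the alloca away. */
	asm volatile("" : : "r" (gap) : "memory");
}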
Acked-by: Palmer Dabbelt Reviewed-by: Kees Cook Signed-off-by: Song Shuai Link: https://lore.kernel.org/r/20231109133751.212079-1-songshuaishuai@tinylab.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/kernel/traps.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index cd9fc635857e..b49016bb5077 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -109,6 +109,7 @@ config RISCV select HAVE_ARCH_KGDB_QXFER_PKT select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index a1b9be3c4332..868d6280cf66 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -310,7 +311,8 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs) } } -asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs) +asmlinkage __visible __trap_section __no_stack_protector +void do_trap_ecall_u(struct pt_regs *regs) { if (user_mode(regs)) { long syscall = regs->a7; @@ -322,10 +324,23 @@ asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs) syscall = syscall_enter_from_user_mode(regs, syscall); + add_random_kstack_offset(); + if (syscall >= 0 && syscall < NR_syscalls) syscall_handler(regs, syscall); else if (syscall != -1) regs->a0 = -ENOSYS; + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * so the maximum stack offset is 1k bytes (10 bits). + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: 16-byte (i.e. 4-bit) aligned + * for RV32I or RV64I. + * + * The resulting 6 bits of entropy is seen in SP[9:4]. + */ + choose_random_kstack_offset(get_random_u16()); syscall_exit_to_user_mode(regs); } else { From cb4ede926134a65bc3bf90ed58dace8451d7e759 Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Sun, 12 Nov 2023 17:44:21 +0800 Subject: [PATCH 027/496] riscv: Avoid code duplication with generic bitops implementation There's code duplication between the fallback implementation for bitops __ffs/__fls/ffs/fls API and the generic C implementation in include/asm-generic/bitops/. To avoid this duplication, this patch renames the generic C implementation by adding a "generic_" prefix to them, then we can use these generic APIs as fallback. 
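For reference (not part of the patch), the semantics that the Zbb fast paths and the generic_*() fallbacks must agree on, spelled out with concrete values:

/*
 *   ffs(0) == 0    ffs(1) == 1    ffs(0x8) == 4            (1-based; 0 means "no bit set")
 *   fls(0) == 0    fls(1) == 1    fls(0x80000000u) == 32
 *   __ffs(0x8) == 3               __fls(0x80000000ul) == 31   (0-based; undefined for 0)
 */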
Suggested-by: Geert Uytterhoeven Signed-off-by: Xiao Wang Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20231112094421.4014931-1-xiao.w.wang@intel.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/bitops.h | 138 +++++------------------------ include/asm-generic/bitops/__ffs.h | 8 +- include/asm-generic/bitops/__fls.h | 8 +- include/asm-generic/bitops/ffs.h | 8 +- include/asm-generic/bitops/fls.h | 8 +- 5 files changed, 48 insertions(+), 122 deletions(-) diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 9ffc35537024..c4c2173dfe99 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -22,6 +22,16 @@ #include #else +#define __HAVE_ARCH___FFS +#define __HAVE_ARCH___FLS +#define __HAVE_ARCH_FFS +#define __HAVE_ARCH_FLS + +#include +#include +#include +#include + #include #include @@ -37,8 +47,6 @@ static __always_inline unsigned long variable__ffs(unsigned long word) { - int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -52,32 +60,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word) return word; legacy: - num = 0; -#if BITS_PER_LONG == 64 - if ((word & 0xffffffff) == 0) { - num += 32; - word >>= 32; - } -#endif - if ((word & 0xffff) == 0) { - num += 16; - word >>= 16; - } - if ((word & 0xff) == 0) { - num += 8; - word >>= 8; - } - if ((word & 0xf) == 0) { - num += 4; - word >>= 4; - } - if ((word & 0x3) == 0) { - num += 2; - word >>= 2; - } - if ((word & 0x1) == 0) - num += 1; - return num; + return generic___ffs(word); } /** @@ -93,8 +76,6 @@ legacy: static __always_inline unsigned long variable__fls(unsigned long word) { - int num; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); @@ -108,32 +89,7 @@ static __always_inline unsigned long variable__fls(unsigned long word) return BITS_PER_LONG - 1 - word; legacy: - num = BITS_PER_LONG - 1; -#if BITS_PER_LONG == 64 - if (!(word & (~0ul << 32))) { - num -= 32; - word <<= 32; - } -#endif - if (!(word & (~0ul << (BITS_PER_LONG - 16)))) { - num -= 16; - word <<= 16; - } - if (!(word & (~0ul << (BITS_PER_LONG - 8)))) { - num -= 8; - word <<= 8; - } - if (!(word & (~0ul << (BITS_PER_LONG - 4)))) { - num -= 4; - word <<= 4; - } - if (!(word & (~0ul << (BITS_PER_LONG - 2)))) { - num -= 2; - word <<= 2; - } - if (!(word & (~0ul << (BITS_PER_LONG - 1)))) - num -= 1; - return num; + return generic___fls(word); } /** @@ -149,46 +105,23 @@ legacy: static __always_inline int variable_ffs(int x) { - int r; - - if (!x) - return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); + if (!x) + return 0; + asm volatile (".option push\n" ".option arch,+zbb\n" CTZW "%0, %1\n" ".option pop\n" - : "=r" (r) : "r" (x) :); + : "=r" (x) : "r" (x) :); - return r + 1; + return x + 1; legacy: - r = 1; - if (!(x & 0xffff)) { - x >>= 16; - r += 16; - } - if (!(x & 0xff)) { - x >>= 8; - r += 8; - } - if (!(x & 0xf)) { - x >>= 4; - r += 4; - } - if (!(x & 3)) { - x >>= 2; - r += 2; - } - if (!(x & 1)) { - x >>= 1; - r += 1; - } - return r; + return generic_ffs(x); } /** @@ -204,46 +137,23 @@ legacy: static __always_inline int variable_fls(unsigned int x) { - int r; - - if (!x) - return 0; - asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) : : : : legacy); + if (!x) + return 0; + asm volatile (".option push\n" ".option arch,+zbb\n" CLZW "%0, %1\n" ".option pop\n" - : "=r" (r) : "r" (x) 
:); + : "=r" (x) : "r" (x) :); - return 32 - r; + return 32 - x; legacy: - r = 32; - if (!(x & 0xffff0000u)) { - x <<= 16; - r -= 16; - } - if (!(x & 0xff000000u)) { - x <<= 8; - r -= 8; - } - if (!(x & 0xf0000000u)) { - x <<= 4; - r -= 4; - } - if (!(x & 0xc0000000u)) { - x <<= 2; - r -= 2; - } - if (!(x & 0x80000000u)) { - x <<= 1; - r -= 1; - } - return r; + return generic_fls(x); } /** diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 39e56e1c7203..446fea6dda78 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -5,12 +5,12 @@ #include /** - * __ffs - find first bit in word. + * generic___ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long generic___ffs(unsigned long word) { int num = 0; @@ -41,4 +41,8 @@ static __always_inline unsigned long __ffs(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FFS +#define __ffs(word) generic___ffs(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FFS_H_ */ diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index 03f721a8a2b1..54ccccf96e21 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -5,12 +5,12 @@ #include /** - * __fls - find last (most-significant) set bit in a long word + * generic___fls - find last (most-significant) set bit in a long word * @word: the word to search * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long generic___fls(unsigned long word) { int num = BITS_PER_LONG - 1; @@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word) return num; } +#ifndef __HAVE_ARCH___FLS +#define __fls(word) generic___fls(word) +#endif + #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */ diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h index 323fd5d6ae26..4c43f242daeb 100644 --- a/include/asm-generic/bitops/ffs.h +++ b/include/asm-generic/bitops/ffs.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FFS_H_ /** - * ffs - find first bit set + * generic_ffs - find first bit set * @x: the word to search * * This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from ffz (man ffs). */ -static inline int ffs(int x) +static inline int generic_ffs(int x) { int r = 1; @@ -39,4 +39,8 @@ static inline int ffs(int x) return r; } +#ifndef __HAVE_ARCH_FFS +#define ffs(x) generic_ffs(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FFS_H_ */ diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index b168bb10e1be..26f3ce1dd6e4 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -3,14 +3,14 @@ #define _ASM_GENERIC_BITOPS_FLS_H_ /** - * fls - find last (most-significant) bit set + * generic_fls - find last (most-significant) bit set * @x: the word to search * * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 
*/ -static __always_inline int fls(unsigned int x) +static __always_inline int generic_fls(unsigned int x) { int r = 32; @@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x) return r; } +#ifndef __HAVE_ARCH_FLS +#define fls(x) generic_fls(x) +#endif + #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */ From 72fee6b0a3a4ad6c5131d4c20e8ab7253b16e38b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 31 Jan 2024 17:08:23 +0100 Subject: [PATCH 028/496] fbdev: Restrict FB_SH_MOBILE_LCDC to SuperH Since commit f402f7a02af6956d ("staging: board: Remove Armadillo-800-EVA board staging code"), there are no more users of the legacy SuperH Mobile LCDC framebuffer driver on Renesas ARM platforms. All former users on these platforms have been converted to the SH-Mobile DRM driver, using DT. Suggested-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven Acked-by: Javier Martinez Canillas Reviewed-by: Wolfram Sang Signed-off-by: Helge Deller --- drivers/video/fbdev/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 2d0bcc1d786e..b688900bb67e 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -1554,7 +1554,7 @@ config FB_FSL_DIU config FB_SH_MOBILE_LCDC tristate "SuperH Mobile LCDC framebuffer support" depends on FB && HAVE_CLK && HAS_IOMEM - depends on SUPERH || ARCH_RENESAS || COMPILE_TEST + depends on SUPERH || COMPILE_TEST depends on FB_DEVICE select FB_BACKLIGHT select FB_DEFERRED_IO From 0076a37a426b6c850a0b41b814952760e4a70fcf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 31 Jan 2024 17:11:45 +0100 Subject: [PATCH 029/496] dt-bindings: timer: renesas,tmu: Document input capture interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Timer Unit (TMU) instances with 3 channels support a fourth interrupt: an input capture interrupt for the third channel. While at it, document the meaning of the four interrupts, and add "interrupt-names" for clarity. Update the example to match reality. Inspired by a patch by Yoshinori Sato for SH. 
Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Reviewed-by: Wolfram Sang Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/8cb38b5236213a467c6c0073f97ccc4bfd5a39ff.1706717378.git.geert+renesas@glider.be --- .../devicetree/bindings/timer/renesas,tmu.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml index a67e427a9e7e..84bbe15028a1 100644 --- a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml @@ -46,7 +46,19 @@ properties: interrupts: minItems: 2 - maxItems: 3 + items: + - description: Underflow interrupt, channel 0 + - description: Underflow interrupt, channel 1 + - description: Underflow interrupt, channel 2 + - description: Input capture interrupt, channel 2 + + interrupt-names: + minItems: 2 + items: + - const: tuni0 + - const: tuni1 + - const: tuni2 + - const: ticpi2 clocks: maxItems: 1 @@ -100,7 +112,9 @@ examples: reg = <0xffd80000 0x30>; interrupts = , , - ; + , + ; + interrupt-names = "tuni0", "tuni1", "tuni2", "ticpi2"; clocks = <&mstp0_clks R8A7779_CLK_TMU0>; clock-names = "fck"; power-domains = <&sysc R8A7779_PD_ALWAYS_ON>; From 69518264da6298fb1a490ca8adcfcb798c16e15c Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 15 Nov 2023 21:29:08 +0000 Subject: [PATCH 030/496] dt-bindings: timer: renesas: ostm: Document RZ/Five SoC The OSTM block on the RZ/Five SoC is identical to one found on the RZ/G2UL SoC. "renesas,r9a07g043-ostm" compatible string will be used on the RZ/Five SoC so to make this clear and to keep this file consistent, update the comment to include RZ/Five SoC. No driver changes are required as generic compatible string "renesas,ostm" will be used as a fallback on RZ/Five SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Acked-by: Conor Dooley Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231115212908.33131-1-prabhakar.mahadev-lad.rj@bp.renesas.com --- Documentation/devicetree/bindings/timer/renesas,ostm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,ostm.yaml b/Documentation/devicetree/bindings/timer/renesas,ostm.yaml index 7207929e5cd6..8b06a681764e 100644 --- a/Documentation/devicetree/bindings/timer/renesas,ostm.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,ostm.yaml @@ -23,7 +23,7 @@ properties: - enum: - renesas,r7s72100-ostm # RZ/A1H - renesas,r7s9210-ostm # RZ/A2M - - renesas,r9a07g043-ostm # RZ/G2UL + - renesas,r9a07g043-ostm # RZ/G2UL and RZ/Five - renesas,r9a07g044-ostm # RZ/G2{L,LC} - renesas,r9a07g054-ostm # RZ/V2L - const: renesas,ostm # Generic From d6cfd1770f20392d7009ae1fdb04733794514fa9 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:33 +0100 Subject: [PATCH 031/496] membarrier: riscv: Add full memory barrier in switch_mm() The membarrier system call requires a full memory barrier after storing to rq->curr, before going back to user-space. The barrier is only needed when switching between processes: the barrier is implied by mmdrop() when switching from kernel to userspace, and it's not needed when switching from userspace to kernel. Rely on the feature/mechanism ARCH_HAS_MEMBARRIER_CALLBACKS and on the primitive membarrier_arch_switch_mm(), already adopted by the PowerPC architecture, to insert the required barrier. 
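Illustrative only, not part of the patch: the user-space pattern whose ordering guarantee the added barrier backs up on RISC-V. The command names come from the membarrier(2) UAPI; error handling is omitted.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

static void expedited_barrier_example(void)
{
	/* Once per process, before the first expedited barrier. */
	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);

	/* Later: acts as if every thread of the process issued a full barrier. */
	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
}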
Fixes: fab957c11efe2f ("RISC-V: Atomic and Locking Code") Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-2-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- MAINTAINERS | 2 +- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/membarrier.h | 31 +++++++++++++++++++++++++++++ arch/riscv/mm/context.c | 2 ++ kernel/sched/core.c | 5 +++-- 5 files changed, 38 insertions(+), 3 deletions(-) create mode 100644 arch/riscv/include/asm/membarrier.h diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a69..2450e88d3e2c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14039,7 +14039,7 @@ M: Mathieu Desnoyers M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported -F: arch/powerpc/include/asm/membarrier.h +F: arch/*/include/asm/membarrier.h F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..087abf9e51c6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -27,6 +27,7 @@ config RISCV select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV + select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h new file mode 100644 index 000000000000..6c016ebb5020 --- /dev/null +++ b/arch/riscv/include/asm/membarrier.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_RISCV_MEMBARRIER_H +#define _ASM_RISCV_MEMBARRIER_H + +static inline void membarrier_arch_switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + /* + * Only need the full barrier when switching between processes. + * Barrier when switching from kernel to userspace is not + * required here, given that it is implied by mmdrop(). Barrier + * when switching from userspace to kernel is not needed after + * store to rq->curr. + */ + if (IS_ENABLED(CONFIG_SMP) && + likely(!(atomic_read(&next->membarrier_state) & + (MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) + return; + + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * Matches a full barrier in the proximity of the membarrier + * system call entry. + */ + smp_mb(); +} + +#endif /* _ASM_RISCV_MEMBARRIER_H */ diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 217fd4de6134..ba8eb3944687 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (unlikely(prev == next)) return; + membarrier_arch_switch_mm(prev, next, task); + /* * Mark the current MM context as inactive, and the next as * active. This is at least used by the icache flushing diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..c4ca8085885a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6709,8 +6709,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * Here are the schemes providing that barrier on the * various architectures: - * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. - * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC, + * RISC-V. switch_mm() relies on membarrier_arch_switch_mm() + * on PowerPC and on RISC-V. 
* - finish_lock_switch() for weakly-ordered * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock From a14d11a0f5f4105e0df96811dfa81dc5f79fecba Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:34 +0100 Subject: [PATCH 032/496] membarrier: Create Documentation/scheduler/membarrier.rst To gather the architecture requirements of the "private/global expedited" membarrier commands. The file will be expanded to integrate further information about the membarrier syscall (as needed/desired in the future). While at it, amend some related inline comments in the membarrier codebase. Suggested-by: Mathieu Desnoyers Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-3-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- Documentation/scheduler/index.rst | 1 + Documentation/scheduler/membarrier.rst | 39 ++++++++++++++++++++++++++ MAINTAINERS | 1 + kernel/sched/core.c | 7 ++++- kernel/sched/membarrier.c | 8 +++--- 5 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 Documentation/scheduler/membarrier.rst diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst index 3170747226f6..43bd8a145b7a 100644 --- a/Documentation/scheduler/index.rst +++ b/Documentation/scheduler/index.rst @@ -7,6 +7,7 @@ Scheduler completion + membarrier sched-arch sched-bwc sched-deadline diff --git a/Documentation/scheduler/membarrier.rst b/Documentation/scheduler/membarrier.rst new file mode 100644 index 000000000000..2387804b1c63 --- /dev/null +++ b/Documentation/scheduler/membarrier.rst @@ -0,0 +1,39 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +membarrier() System Call +======================== + +MEMBARRIER_CMD_{PRIVATE,GLOBAL}_EXPEDITED - Architecture requirements +===================================================================== + +Memory barriers before updating rq->curr +---------------------------------------- + +The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED +require each architecture to have a full memory barrier after coming from +user-space, before updating rq->curr. This barrier is implied by the sequence +rq_lock(); smp_mb__after_spinlock() in __schedule(). The barrier matches a full +barrier in the proximity of the membarrier system call exit, cf. +membarrier_{private,global}_expedited(). + +Memory barriers after updating rq->curr +--------------------------------------- + +The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED +require each architecture to have a full memory barrier after updating rq->curr, +before returning to user-space. The schemes providing this barrier on the various +architectures are as follows. + + - alpha, arc, arm, hexagon, mips rely on the full barrier implied by + spin_unlock() in finish_lock_switch(). + + - arm64 relies on the full barrier implied by switch_to(). + + - powerpc, riscv, s390, sparc, x86 rely on the full barrier implied by + switch_mm(), if mm is not NULL; they rely on the full barrier implied + by mmdrop(), otherwise. On powerpc and riscv, switch_mm() relies on + membarrier_arch_switch_mm(). + +The barrier matches a full barrier in the proximity of the membarrier system call +entry, cf. membarrier_{private,global}_expedited(). diff --git a/MAINTAINERS b/MAINTAINERS index 2450e88d3e2c..78c835cc69c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14039,6 +14039,7 @@ M: Mathieu Desnoyers M: "Paul E. 
McKenney" L: linux-kernel@vger.kernel.org S: Supported +F: Documentation/scheduler/membarrier.rst F: arch/*/include/asm/membarrier.h F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c4ca8085885a..a972628e7756 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6638,7 +6638,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * if (signal_pending_state()) if (p->state & @state) * * Also, the membarrier system call requires a full memory barrier - * after coming from user-space, before storing to rq->curr. + * after coming from user-space, before storing to rq->curr; this + * barrier matches a full barrier in the proximity of the membarrier + * system call exit. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -6716,6 +6718,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) * architectures where spin_unlock is a full barrier, * - switch_to() for arm64 (weakly-ordered, spin_unlock * is a RELEASE barrier), + * + * The barrier matches a full barrier in the proximity of + * the membarrier system call entry. */ ++*switch_count; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 2ad881d07752..f3d91628d6b8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -251,7 +251,7 @@ static int membarrier_global_expedited(void) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. */ smp_mb(); /* system call entry is not a mb. */ @@ -300,7 +300,7 @@ static int membarrier_global_expedited(void) /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ @@ -339,7 +339,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) return 0; /* - * Matches memory barriers around rq->curr modification in + * Matches memory barriers after rq->curr modification in * scheduler. */ smp_mb(); /* system call entry is not a mb. */ @@ -415,7 +415,7 @@ out: /* * Memory barrier on the caller thread _after_ we finished - * waiting for the last IPI. Matches memory barriers around + * waiting for the last IPI. Matches memory barriers before * rq->curr modification in scheduler. */ smp_mb(); /* exit from system call is not a mb */ From 4ff4c745a16c4c151a71863420811e7f406c3ec2 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:35 +0100 Subject: [PATCH 033/496] locking: Introduce prepare_sync_core_cmd() Introduce an architecture function that architectures can use to set up ("prepare") SYNC_CORE commands. The function will be used by RISC-V to update its "deferred icache- flush" data structures (icache_stale_mask). Architectures defining prepare_sync_core_cmd() static inline need to select ARCH_HAS_PREPARE_SYNC_CORE_CMD. 
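For illustration only, an architecture opting in would select ARCH_HAS_PREPARE_SYNC_CORE_CMD and provide the hook in its asm/sync_core.h. The sketch below is hypothetical and simply mirrors the shape of the RISC-V implementation added later in this series; the per-mm icache_stale_mask is an assumption specific to such an architecture, not something this patch introduces.

#include <linux/cpumask.h>
#include <linux/mm_types.h>

/* Hypothetical arch/<arch>/include/asm/sync_core.h sketch: defer the
 * core-serializing instruction to the next switch_mm() by marking the
 * icache stale on every CPU for this mm. */
static inline void prepare_sync_core_cmd(struct mm_struct *mm)
{
	cpumask_setall(&mm->context.icache_stale_mask);
}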
Suggested-by: Mathieu Desnoyers Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-4-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- include/linux/sync_core.h | 16 +++++++++++++++- init/Kconfig | 3 +++ kernel/sched/membarrier.c | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h index 013da4b8b327..67bb9794b875 100644 --- a/include/linux/sync_core.h +++ b/include/linux/sync_core.h @@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void) } #endif -#endif /* _LINUX_SYNC_CORE_H */ +#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD +#include +#else +/* + * This is a dummy prepare_sync_core_cmd() implementation that can be used on + * all architectures which provide unconditional core serializing instructions + * in switch_mm(). + * If your architecture doesn't provide such core serializing instructions in + * switch_mm(), you may need to write your own functions. + */ +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ +} +#endif +#endif /* _LINUX_SYNC_CORE_H */ diff --git a/init/Kconfig b/init/Kconfig index 8df18f3a9748..c3994b92333d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1970,6 +1970,9 @@ source "kernel/Kconfig.locks" config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE bool +config ARCH_HAS_PREPARE_SYNC_CORE_CMD + bool + config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index f3d91628d6b8..6d1f31b3a967 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -320,6 +320,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; ipi_func = ipi_sync_core; + prepare_sync_core_cmd(mm); } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; From cd9b29014dc69609489261efe351d0c7709ae8bf Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Wed, 31 Jan 2024 15:49:36 +0100 Subject: [PATCH 034/496] membarrier: riscv: Provide core serializing command RISC-V uses xRET instructions on return from interrupt and to go back to user-space; the xRET instruction is not core serializing. Use FENCE.I for providing core serialization as follows: - by calling sync_core_before_usermode() on return from interrupt (cf. ipi_sync_core()), - via switch_mm() and sync_core_before_usermode() (respectively, for uthread->uthread and kthread->uthread transitions) before returning to user-space. On RISC-V, the serialization in switch_mm() is activated by resetting the icache_stale_mask of the mm at prepare_sync_core_cmd(). 
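From user-space, the point of this work is that a multi-threaded program which rewrites executable code (a JIT, for instance) can now rely on the SYNC_CORE membarrier commands on RISC-V. A hedged sketch of such a caller, using only the existing membarrier(2) interface; the jit_init()/jit_publish() names are illustrative:

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

/* Once at startup: register intent to use the private SYNC_CORE command. */
static int jit_init(void)
{
	return membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE,
			  0, 0);
}

/* After patching code another thread may run: every running thread of this
 * process executes a core-serializing instruction (FENCE.I on RISC-V)
 * before returning to user-space. */
static int jit_publish(void)
{
	return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
}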
Suggested-by: Palmer Dabbelt Signed-off-by: Andrea Parri Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/20240131144936.29190-5-parri.andrea@gmail.com Signed-off-by: Palmer Dabbelt --- .../membarrier-sync-core/arch-support.txt | 18 +++++++++++- MAINTAINERS | 1 + arch/riscv/Kconfig | 3 ++ arch/riscv/include/asm/membarrier.h | 19 ++++++++++++ arch/riscv/include/asm/sync_core.h | 29 +++++++++++++++++++ kernel/sched/core.c | 4 +++ kernel/sched/membarrier.c | 4 +++ 7 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/sync_core.h diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index d96b778b87ed..7425d2b994a3 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -10,6 +10,22 @@ # Rely on implicit context synchronization as a result of exception return # when returning from IPI handler, and when returning to user-space. # +# * riscv +# +# riscv uses xRET as return from interrupt and to return to user-space. +# +# Given that xRET is not core serializing, we rely on FENCE.I for providing +# core serialization: +# +# - by calling sync_core_before_usermode() on return from interrupt (cf. +# ipi_sync_core()), +# +# - via switch_mm() and sync_core_before_usermode() (respectively, for +# uthread->uthread and kthread->uthread transitions) before returning +# to user-space. +# +# The serialization in switch_mm() is activated by prepare_sync_core_cmd(). +# # * x86 # # x86-32 uses IRET as return from interrupt, which takes care of the IPI. @@ -43,7 +59,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/MAINTAINERS b/MAINTAINERS index 78c835cc69c9..cc80968ec355 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14041,6 +14041,7 @@ L: linux-kernel@vger.kernel.org S: Supported F: Documentation/scheduler/membarrier.rst F: arch/*/include/asm/membarrier.h +F: arch/*/include/asm/sync_core.h F: include/uapi/linux/membarrier.h F: kernel/sched/membarrier.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 087abf9e51c6..70836381b948 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,14 +28,17 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS + select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MMIOWB select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API + select ARCH_HAS_PREPARE_SYNC_CORE_CMD select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU select ARCH_HAS_SET_MEMORY if MMU select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h index 6c016ebb5020..47b240d0d596 100644 --- a/arch/riscv/include/asm/membarrier.h +++ b/arch/riscv/include/asm/membarrier.h @@ -22,6 +22,25 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, /* * The membarrier system call requires a full memory barrier * after storing to rq->curr, before going back to user-space. 
+ * + * This barrier is also needed for the SYNC_CORE command when + * switching between processes; in particular, on a transition + * from a thread belonging to another mm to a thread belonging + * to the mm for which a membarrier SYNC_CORE is done on CPU0: + * + * - [CPU0] sets all bits in the mm icache_stale_mask (in + * prepare_sync_core_cmd()); + * + * - [CPU1] stores to rq->curr (by the scheduler); + * + * - [CPU0] loads rq->curr within membarrier and observes + * cpu_rq(1)->curr->mm != mm, so the IPI is skipped on + * CPU1; this means membarrier relies on switch_mm() to + * issue the sync-core; + * + * - [CPU1] switch_mm() loads icache_stale_mask; if the bit + * is zero, switch_mm() may incorrectly skip the sync-core. + * * Matches a full barrier in the proximity of the membarrier * system call entry. */ diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h new file mode 100644 index 000000000000..9153016da8f1 --- /dev/null +++ b/arch/riscv/include/asm/sync_core.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_SYNC_CORE_H +#define _ASM_RISCV_SYNC_CORE_H + +/* + * RISC-V implements return to user-space through an xRET instruction, + * which is not core serializing. + */ +static inline void sync_core_before_usermode(void) +{ + asm volatile ("fence.i" ::: "memory"); +} + +#ifdef CONFIG_SMP +/* + * Ensure the next switch_mm() on every CPU issues a core serializing + * instruction for the given @mm. + */ +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ + cpumask_setall(&mm->context.icache_stale_mask); +} +#else +static inline void prepare_sync_core_cmd(struct mm_struct *mm) +{ +} +#endif /* CONFIG_SMP */ + +#endif /* _ASM_RISCV_SYNC_CORE_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a972628e7756..e4a87bcf28d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6721,6 +6721,10 @@ static void __sched notrace __schedule(unsigned int sched_mode) * * The barrier matches a full barrier in the proximity of * the membarrier system call entry. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ ++*switch_count; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 6d1f31b3a967..703e8d80a576 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -342,6 +342,10 @@ static int membarrier_private_expedited(int flags, int cpu_id) /* * Matches memory barriers after rq->curr modification in * scheduler. + * + * On RISC-V, this barrier pairing is also needed for the + * SYNC_CORE command when switching between processes, cf. + * the inline comments in membarrier_arch_switch_mm(). */ smp_mb(); /* system call entry is not a mb. */ From 45e0b0fd6dc574101825ac2738b890da024e4cda Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Wed, 6 Dec 2023 00:09:21 -0800 Subject: [PATCH 035/496] riscv: defconfig: Enable mmc and dma drivers for T-Head TH1520 Enable the mmc controller driver and dma controller driver needed for T-Head TH1520 based boards, like the LicheePi 4A and BeagleV-Ahead, to boot from eMMC storage. 
Reviewed-by: Guo Ren Signed-off-by: Drew Fustini Reviewed-by: Emil Renner Berthing Reviewed-by: Jisheng Zhang Link: https://lore.kernel.org/r/20231206-th1520_mmc_dts-v8-1-69220e373e8f@baylibre.com Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index eaf34e871e30..89a009a580fe 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -215,6 +215,7 @@ CONFIG_MMC=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SDHCI_CADENCE=y +CONFIG_MMC_SDHCI_OF_DWCMSHC=y CONFIG_MMC_SPI=y CONFIG_MMC_DW=y CONFIG_MMC_DW_STARFIVE=y @@ -224,6 +225,7 @@ CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_SUN6I=y CONFIG_DMADEVICES=y CONFIG_DMA_SUN6I=m +CONFIG_DW_AXI_DMAC=y CONFIG_RZ_DMAC=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BALLOON=y From 8ec11bd89e15f7858359f2c4c9eed1d7de739c3c Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 5 Feb 2024 11:17:57 +0800 Subject: [PATCH 036/496] dt-bindings: timer: nxp,sysctr-timer: support i.MX95 Add i.MX95 System counter module compatible string, the SCMI firmware blocks access to control register, so should not add "nxp,sysctr-timer" as fallback. Acked-by: Krzysztof Kozlowski Signed-off-by: Peng Fan Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240205-imx-sysctr-v4-1-ca5a6e1552e7@nxp.com --- Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml index 2b9653dafab8..891cca009528 100644 --- a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml +++ b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml @@ -18,7 +18,9 @@ description: | properties: compatible: - const: nxp,sysctr-timer + enum: + - nxp,imx95-sysctr-timer + - nxp,sysctr-timer reg: maxItems: 1 From 418062b548b138a1e6a9fde3a8ddf7fa77c44c9e Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 5 Feb 2024 11:17:58 +0800 Subject: [PATCH 037/496] clocksource/drivers/imx-sysctr: Drop use global variables Clean up code to not use global variables and introduce sysctr_private structure to prepare the support for i.MX95. Signed-off-by: Peng Fan Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240205-imx-sysctr-v4-2-ca5a6e1552e7@nxp.com --- drivers/clocksource/timer-imx-sysctr.c | 76 +++++++++++++++----------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/drivers/clocksource/timer-imx-sysctr.c b/drivers/clocksource/timer-imx-sysctr.c index 5a7a951c4efc..c075ea89a214 100644 --- a/drivers/clocksource/timer-imx-sysctr.c +++ b/drivers/clocksource/timer-imx-sysctr.c @@ -4,6 +4,7 @@ #include #include +#include #include "timer-of.h" @@ -20,32 +21,39 @@ #define SYS_CTR_CLK_DIV 0x3 -static void __iomem *sys_ctr_base __ro_after_init; -static u32 cmpcr __ro_after_init; +struct sysctr_private { + u32 cmpcr; +}; -static void sysctr_timer_enable(bool enable) +static void sysctr_timer_enable(struct clock_event_device *evt, bool enable) { - writel(enable ? cmpcr | SYS_CTR_EN : cmpcr, sys_ctr_base + CMPCR); + struct timer_of *to = to_timer_of(evt); + struct sysctr_private *priv = to->private_data; + void __iomem *base = timer_of_base(to); + + writel(enable ? 
priv->cmpcr | SYS_CTR_EN : priv->cmpcr, base + CMPCR); } -static void sysctr_irq_acknowledge(void) +static void sysctr_irq_acknowledge(struct clock_event_device *evt) { /* * clear the enable bit(EN =0) will clear * the status bit(ISTAT = 0), then the interrupt * signal will be negated(acknowledged). */ - sysctr_timer_enable(false); + sysctr_timer_enable(evt, false); } -static inline u64 sysctr_read_counter(void) +static inline u64 sysctr_read_counter(struct clock_event_device *evt) { + struct timer_of *to = to_timer_of(evt); + void __iomem *base = timer_of_base(to); u32 cnt_hi, tmp_hi, cnt_lo; do { - cnt_hi = readl_relaxed(sys_ctr_base + CNTCV_HI); - cnt_lo = readl_relaxed(sys_ctr_base + CNTCV_LO); - tmp_hi = readl_relaxed(sys_ctr_base + CNTCV_HI); + cnt_hi = readl_relaxed(base + CNTCV_HI); + cnt_lo = readl_relaxed(base + CNTCV_LO); + tmp_hi = readl_relaxed(base + CNTCV_HI); } while (tmp_hi != cnt_hi); return ((u64) cnt_hi << 32) | cnt_lo; @@ -54,22 +62,24 @@ static inline u64 sysctr_read_counter(void) static int sysctr_set_next_event(unsigned long delta, struct clock_event_device *evt) { + struct timer_of *to = to_timer_of(evt); + void __iomem *base = timer_of_base(to); u32 cmp_hi, cmp_lo; u64 next; - sysctr_timer_enable(false); + sysctr_timer_enable(evt, false); - next = sysctr_read_counter(); + next = sysctr_read_counter(evt); next += delta; cmp_hi = (next >> 32) & 0x00fffff; cmp_lo = next & 0xffffffff; - writel_relaxed(cmp_hi, sys_ctr_base + CMPCV_HI); - writel_relaxed(cmp_lo, sys_ctr_base + CMPCV_LO); + writel_relaxed(cmp_hi, base + CMPCV_HI); + writel_relaxed(cmp_lo, base + CMPCV_LO); - sysctr_timer_enable(true); + sysctr_timer_enable(evt, true); return 0; } @@ -81,7 +91,7 @@ static int sysctr_set_state_oneshot(struct clock_event_device *evt) static int sysctr_set_state_shutdown(struct clock_event_device *evt) { - sysctr_timer_enable(false); + sysctr_timer_enable(evt, false); return 0; } @@ -90,7 +100,7 @@ static irqreturn_t sysctr_timer_interrupt(int irq, void *dev_id) { struct clock_event_device *evt = dev_id; - sysctr_irq_acknowledge(); + sysctr_irq_acknowledge(evt); evt->event_handler(evt); @@ -117,34 +127,36 @@ static struct timer_of to_sysctr = { }, }; -static void __init sysctr_clockevent_init(void) -{ - to_sysctr.clkevt.cpumask = cpu_possible_mask; - - clockevents_config_and_register(&to_sysctr.clkevt, - timer_of_rate(&to_sysctr), - 0xff, 0x7fffffff); -} - static int __init sysctr_timer_init(struct device_node *np) { - int ret = 0; + struct sysctr_private *priv; + void __iomem *base; + int ret; + + priv = kzalloc(sizeof(struct sysctr_private), GFP_KERNEL); + if (!priv) + return -ENOMEM; ret = timer_of_init(np, &to_sysctr); - if (ret) + if (ret) { + kfree(priv); return ret; + } if (!of_property_read_bool(np, "nxp,no-divider")) { /* system counter clock is divided by 3 internally */ to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; } - sys_ctr_base = timer_of_base(&to_sysctr); - cmpcr = readl(sys_ctr_base + CMPCR); - cmpcr &= ~SYS_CTR_EN; + to_sysctr.clkevt.cpumask = cpu_possible_mask; + to_sysctr.private_data = priv; - sysctr_clockevent_init(); + base = timer_of_base(&to_sysctr); + priv->cmpcr = readl(base + CMPCR) & ~SYS_CTR_EN; + clockevents_config_and_register(&to_sysctr.clkevt, + timer_of_rate(&to_sysctr), + 0xff, 0x7fffffff); return 0; } TIMER_OF_DECLARE(sysctr_timer, "nxp,sysctr-timer", sysctr_timer_init); From b67686e971b06eee1be363b89863bd1217f65190 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 5 Feb 2024 11:17:59 +0800 Subject: [PATCH 038/496] 
clocksource/drivers/imx-sysctr: Add i.MX95 support To i.MX95 System counter module, we use Read register space to get the counter, not the Control register space to get the counter, because System Manager firmware not allow Linux to read Control register space, so add a new TIMER_OF_DECLARE entry for i.MX95. Signed-off-by: Peng Fan Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240205-imx-sysctr-v4-3-ca5a6e1552e7@nxp.com --- drivers/clocksource/timer-imx-sysctr.c | 53 ++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/timer-imx-sysctr.c b/drivers/clocksource/timer-imx-sysctr.c index c075ea89a214..44525813be1e 100644 --- a/drivers/clocksource/timer-imx-sysctr.c +++ b/drivers/clocksource/timer-imx-sysctr.c @@ -9,12 +9,15 @@ #include "timer-of.h" #define CMP_OFFSET 0x10000 +#define RD_OFFSET 0x20000 #define CNTCV_LO 0x8 #define CNTCV_HI 0xc #define CMPCV_LO (CMP_OFFSET + 0x20) #define CMPCV_HI (CMP_OFFSET + 0x24) #define CMPCR (CMP_OFFSET + 0x2c) +#define CNTCV_LO_IMX95 (RD_OFFSET + 0x8) +#define CNTCV_HI_IMX95 (RD_OFFSET + 0xc) #define SYS_CTR_EN 0x1 #define SYS_CTR_IRQ_MASK 0x2 @@ -23,6 +26,8 @@ struct sysctr_private { u32 cmpcr; + u32 lo_off; + u32 hi_off; }; static void sysctr_timer_enable(struct clock_event_device *evt, bool enable) @@ -47,13 +52,14 @@ static void sysctr_irq_acknowledge(struct clock_event_device *evt) static inline u64 sysctr_read_counter(struct clock_event_device *evt) { struct timer_of *to = to_timer_of(evt); + struct sysctr_private *priv = to->private_data; void __iomem *base = timer_of_base(to); u32 cnt_hi, tmp_hi, cnt_lo; do { - cnt_hi = readl_relaxed(base + CNTCV_HI); - cnt_lo = readl_relaxed(base + CNTCV_LO); - tmp_hi = readl_relaxed(base + CNTCV_HI); + cnt_hi = readl_relaxed(base + priv->hi_off); + cnt_lo = readl_relaxed(base + priv->lo_off); + tmp_hi = readl_relaxed(base + priv->hi_off); } while (tmp_hi != cnt_hi); return ((u64) cnt_hi << 32) | cnt_lo; @@ -127,7 +133,7 @@ static struct timer_of to_sysctr = { }, }; -static int __init sysctr_timer_init(struct device_node *np) +static int __init __sysctr_timer_init(struct device_node *np) { struct sysctr_private *priv; void __iomem *base; @@ -154,9 +160,48 @@ static int __init sysctr_timer_init(struct device_node *np) base = timer_of_base(&to_sysctr); priv->cmpcr = readl(base + CMPCR) & ~SYS_CTR_EN; + return 0; +} + +static int __init sysctr_timer_init(struct device_node *np) +{ + struct sysctr_private *priv; + int ret; + + ret = __sysctr_timer_init(np); + if (ret) + return ret; + + priv = to_sysctr.private_data; + priv->lo_off = CNTCV_LO; + priv->hi_off = CNTCV_HI; + clockevents_config_and_register(&to_sysctr.clkevt, timer_of_rate(&to_sysctr), 0xff, 0x7fffffff); + return 0; } + +static int __init sysctr_timer_imx95_init(struct device_node *np) +{ + struct sysctr_private *priv; + int ret; + + ret = __sysctr_timer_init(np); + if (ret) + return ret; + + priv = to_sysctr.private_data; + priv->lo_off = CNTCV_LO_IMX95; + priv->hi_off = CNTCV_HI_IMX95; + + clockevents_config_and_register(&to_sysctr.clkevt, + timer_of_rate(&to_sysctr), + 0xff, 0x7fffffff); + + return 0; +} + TIMER_OF_DECLARE(sysctr_timer, "nxp,sysctr-timer", sysctr_timer_init); +TIMER_OF_DECLARE(sysctr_timer_imx95, "nxp,imx95-sysctr-timer", sysctr_timer_imx95_init); From b34b9547cee41575a4fddf390f615570759dc999 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 18 Feb 2024 18:41:37 +0100 Subject: [PATCH 039/496] clocksource/drivers/arm_global_timer: Fix maximum prescaler 
value The prescaler in the "Global Timer Control Register bit assignments" is documented to use bits [15:8], which means that the maximum prescaler register value is 0xff. Fixes: 171b45a4a70e ("clocksource/drivers/arm_global_timer: Implement rate compensation whenever source clock changes") Signed-off-by: Martin Blumenstingl Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240218174138.1942418-2-martin.blumenstingl@googlemail.com --- drivers/clocksource/arm_global_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 44a61dc6f932..e1c773bb5535 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -32,7 +32,7 @@ #define GT_CONTROL_IRQ_ENABLE BIT(2) /* banked */ #define GT_CONTROL_AUTO_INC BIT(3) /* banked */ #define GT_CONTROL_PRESCALER_SHIFT 8 -#define GT_CONTROL_PRESCALER_MAX 0xF +#define GT_CONTROL_PRESCALER_MAX 0xFF #define GT_CONTROL_PRESCALER_MASK (GT_CONTROL_PRESCALER_MAX << \ GT_CONTROL_PRESCALER_SHIFT) From 9256cec7b4f3293c11585326401325b1f81670e1 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 18 Feb 2024 18:41:38 +0100 Subject: [PATCH 040/496] clocksource/drivers/arm_global_timer: Remove stray tab Remove a stray tab in global_timer_of_register() which is different from the coding style in the rest of the driver. Signed-off-by: Martin Blumenstingl Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240218174138.1942418-3-martin.blumenstingl@googlemail.com --- drivers/clocksource/arm_global_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index e1c773bb5535..8dd1e46b7176 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -411,7 +411,7 @@ static int __init global_timer_of_register(struct device_node *np) err = gt_clocksource_init(); if (err) goto out_irq; - + err = cpuhp_setup_state(CPUHP_AP_ARM_GLOBAL_TIMER_STARTING, "clockevents/arm/global_timer:starting", gt_starting_cpu, gt_dying_cpu); From 97454a65d56b240013ed8d396427b85e34384238 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Jan 2024 21:36:41 -0800 Subject: [PATCH 041/496] clocksource: arm_global_timer: fix non-kernel-doc comment Use a common C comment "/*" instead of a kernel-doc marker "/**" to prevent kernel-doc warnings: arm_global_timer.c:92: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * To ensure that updates to comparator value register do not set the arm_global_timer.c:92: warning: missing initial short description on line: * To ensure that updates to comparator value register do not set the Signed-off-by: Randy Dunlap Cc: Patrice Chotard Cc: linux-arm-kernel@lists.infradead.org Cc: Daniel Lezcano Cc: Thomas Gleixner Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240115053641.29129-1-rdunlap@infradead.org --- drivers/clocksource/arm_global_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 8dd1e46b7176..d749dee1d41d 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -88,7 +88,7 @@ static u64 gt_counter_read(void) return _gt_counter_read(); } -/** +/* * To ensure that updates to comparator value register do not set the * Interrupt Status Register proceed as follows: * 1. Clear the Comp Enable bit in the Timer Control Register. From ec64db6955c5ad7e8fd03f7a52f8df84f943a9b8 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Tue, 12 Dec 2023 10:34:43 +0100 Subject: [PATCH 042/496] dt-bindings: timer: add Ralink SoCs system tick counter Add YAML doc for the system tick counter which is present on Ralink SoCs. cc: Daniel Lezcano Reviewed-by: Rob Herring Signed-off-by: Sergio Paracuellos Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231212093443.1898591-1-sergio.paracuellos@gmail.com --- .../bindings/timer/ralink,cevt-systick.yaml | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Documentation/devicetree/bindings/timer/ralink,cevt-systick.yaml diff --git a/Documentation/devicetree/bindings/timer/ralink,cevt-systick.yaml b/Documentation/devicetree/bindings/timer/ralink,cevt-systick.yaml new file mode 100644 index 000000000000..59d97feddf4e --- /dev/null +++ b/Documentation/devicetree/bindings/timer/ralink,cevt-systick.yaml @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/timer/ralink,cevt-systick.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: System tick counter present in Ralink family SoCs + +maintainers: + - Sergio Paracuellos + +properties: + compatible: + const: ralink,cevt-systick + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + systick@d00 { + compatible = "ralink,cevt-systick"; + reg = <0xd00 0x10>; + + interrupt-parent = <&cpuintc>; + interrupts = <7>; + }; +... From 154c56d80b8f64da92f10d94531edcd070b1af72 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Jan 2024 06:30:34 +0100 Subject: [PATCH 043/496] ARM: 9334/1: mm: init: remove misuse of kernel-doc comment Change the "/**" beginning of comment to the common "/*" comment since the comment is not in kernel-doc format. This prevents a kernel-doc warning: arch/arm/mm/init.c:422: warning: This comment starts with '/**', but isn't a kernel-doc comment. 
Refer Documentation/doc-guide/kernel-doc.rst * update_sections_early intended to be called only through stop_machine Signed-off-by: Randy Dunlap Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Signed-off-by: Russell King (Oracle) --- arch/arm/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index a42e4cd11db2..4634db84c5af 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -418,7 +418,7 @@ static void set_section_perms(struct section_perm *perms, int n, bool set, } -/** +/* * update_sections_early intended to be called only through stop_machine * framework and executed by only one CPU while all other CPUs will spin and * wait, so no locking is required in this function. From daa559570d4b81e98e5a77a5c0f2a88879a9f245 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Feb 2024 18:28:06 +0100 Subject: [PATCH 044/496] ARM: 9349/1: unwind: Add missing "Call trace:" line Every other architecture in Linux includes the line "Call trace:" before backtraces. In some cases ARM would print "Backtrace:", but this was only via 1 specific call path, and wasn't included in CPU Oops nor things like KASAN, UBSAN, etc that called dump_stack(). Regularize this line so CI systems and other things (like LKDTM) that depend on parsing "Call trace:" out of dmesg will see it for ARM. Before this patch: UBSAN: array-index-out-of-bounds in ../drivers/misc/lkdtm/bugs.c:376:16 index 8 is out of range for type 'char [8]' CPU: 0 PID: 1402 Comm: cat Not tainted 6.7.0-rc2 #1 Hardware name: Generic DT based system dump_backtrace from show_stack+0x20/0x24 r7:00000042 r6:00000000 r5:60070013 r4:80cf5d7c show_stack from dump_stack_lvl+0x88/0x98 dump_stack_lvl from dump_stack+0x18/0x1c r7:00000042 r6:00000008 r5:00000008 r4:80fab118 dump_stack from ubsan_epilogue+0x10/0x3c ubsan_epilogue from __ubsan_handle_out_of_bounds+0x80/0x84 ... After this patch: UBSAN: array-index-out-of-bounds in ../drivers/misc/lkdtm/bugs.c:376:16 index 8 is out of range for type 'char [8]' CPU: 0 PID: 1402 Comm: cat Not tainted 6.7.0-rc2 #1 Hardware name: Generic DT based system Call trace: dump_backtrace from show_stack+0x20/0x24 r7:00000042 r6:00000000 r5:60070013 r4:80cf5d7c show_stack from dump_stack_lvl+0x88/0x98 dump_stack_lvl from dump_stack+0x18/0x1c r7:00000042 r6:00000008 r5:00000008 r4:80fab118 dump_stack from ubsan_epilogue+0x10/0x3c ubsan_epilogue from __ubsan_handle_out_of_bounds+0x80/0x84 ... 
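As a hedged illustration of the parsing argument above (not part of the patch), a CI harness only needs a trivial scan once the marker is uniform across architectures:

#include <stdio.h>
#include <string.h>

/* Count backtraces in a kernel log fed on stdin, e.g. "dmesg | ./scan". */
int main(void)
{
	char line[4096];
	unsigned long hits = 0;

	while (fgets(line, sizeof(line), stdin))
		if (strstr(line, "Call trace:"))
			hits++;

	printf("backtraces seen: %lu\n", hits);
	return hits ? 1 : 0;
}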
Link: https://lore.kernel.org/r/20240110215554.work.460-kees@kernel.org Reported-by: Mark Brown Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Linus Walleij Cc: Vladimir Murzin Cc: Zhen Lei Cc: Keith Packard Cc: Haibo Li Cc: Reviewed-by: Mark Brown Reviewed-by: Linus Walleij Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Kees Cook Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/traps.c | 2 +- arch/arm/kernel/unwind.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 3bad79db5d6e..72c82a4d63ac 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -220,7 +220,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, unsigned int fp, mode; int ok = 1; - printk("%sBacktrace: ", loglvl); + printk("%sCall trace: ", loglvl); if (!tsk) tsk = current; diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index 9d2192156087..f60547dadc93 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -524,6 +524,8 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk, { struct stackframe frame; + printk("%sCall trace: ", loglvl); + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) From 169f9102f9198b04afffa6164372a4ba4070f412 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Feb 2024 18:32:58 +0100 Subject: [PATCH 045/496] ARM: 9350/1: fault: Implement copy_from_kernel_nofault_allowed() Under PAN emulation when dumping backtraces from things like the LKDTM EXEC_USERSPACE test[1], a double fault (which would hang a CPU) would happen because of dump_instr() attempting to read a userspace address. Make sure copy_from_kernel_nofault() does not attempt this any more. Closes: https://lava.sirena.org.uk/scheduler/job/497571 Link: https://lore.kernel.org/all/202401181125.D48DCB4C@keescook/ [1] Reported-by: Mark Brown Suggested-by: Russell King (Oracle) Reviewed-by: Ard Biesheuvel Tested-by: Mark Brown Cc: Wang Kefeng Cc: Andrew Morton Cc: Ben Hutchings Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Kees Cook Signed-off-by: Russell King (Oracle) --- arch/arm/mm/fault.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index e96fb40b9cc3..ec16907a551c 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -25,6 +25,13 @@ #include "fault.h" +bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) +{ + unsigned long addr = (unsigned long)unsafe_src; + + return addr >= TASK_SIZE && ULONG_MAX - addr >= size; +} + #ifdef CONFIG_MMU /* From 8f09b8b4fa58e99cbfd9a650b31d65cdbd8e4276 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Feb 2024 18:32:23 +0100 Subject: [PATCH 046/496] ARM: 9351/1: fault: Add "cut here" line for prefetch aborts The common pattern in arm is to emit a "8<--- cut here ---" line for faults, but it was missing for do_PrefetchAbort(). Add it. 
Cc: Wang Kefeng Cc: Ben Hutchings Cc: linux-arm-kernel@lists.infradead.org Acked-by: Ard Biesheuvel Signed-off-by: Kees Cook Signed-off-by: Russell King (Oracle) --- arch/arm/mm/fault.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index ec16907a551c..bc5b959b6f90 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -593,6 +593,7 @@ do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs) if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs)) return; + pr_alert("8<--- cut here ---\n"); pr_alert("Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n", inf->name, ifsr, addr); From c8c178e0aef1eb16c267500f702675a46ab2aace Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 15 Feb 2024 14:55:25 +0100 Subject: [PATCH 047/496] ARM: 9353/1: remove unneeded entry for CONFIG_FRAME_POINTER This is no-op. FRAME_POINTER is defined in lib/Kconfig.debug. Signed-off-by: Masahiro Yamada Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig.debug | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5fbbac1b708b..7374483591f6 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -90,9 +90,6 @@ config BACKTRACE_VERBOSE In most cases, say N here, unless you are intending to debug the kernel and have access to the kernel binary image. -config FRAME_POINTER - bool - config DEBUG_USER bool "Verbose user fault messages" help From 723012cab779eee8228376754e22c6594229bf8f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:44 +0000 Subject: [PATCH 048/496] ubifs: Set page uptodate in the correct place Page cache reads are lockless, so setting the freshly allocated page uptodate before we've overwritten it with the data it's supposed to have in it will allow a simultaneous reader to see old data. Move the call to SetPageUptodate into ubifs_write_end(), which is after we copied the new data into the page. Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5029eb3390a5..d0694b83dd02 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -261,9 +261,6 @@ static int write_begin_slow(struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } if (PagePrivate(page)) @@ -463,9 +460,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return err; } } - - SetPageUptodate(page); - ClearPageError(page); } err = allocate_budget(c, page, ui, appending); @@ -475,10 +469,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * If we skipped reading the page because we were going to * write all of it, then it is not up to date. 
*/ - if (skipped_read) { + if (skipped_read) ClearPageChecked(page); - ClearPageUptodate(page); - } /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -569,6 +561,9 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, goto out; } + if (len == PAGE_SIZE) + SetPageUptodate(page); + if (!PagePrivate(page)) { attach_page_private(page, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); From 0df030d082d5b226984733b2e7386fa9760a7ca1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:45 +0000 Subject: [PATCH 049/496] ubifs: Convert from writepage to writepages This is a simplistic conversion to separate out any effects of no longer having a writepage method. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d0694b83dd02..2022a31006df 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1001,8 +1001,10 @@ static int do_writepage(struct page *page, int len) * on the page lock and it would not write the truncated inode node to the * journal before we have finished. */ -static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); @@ -1074,6 +1076,12 @@ out_unlock: return err; } +static int ubifs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return write_cache_pages(mapping, wbc, ubifs_writepage, NULL); +} + /** * do_attr_changes - change inode attributes. * @inode: inode to change attributes for @@ -1643,7 +1651,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap, const struct address_space_operations ubifs_file_address_operations = { .read_folio = ubifs_read_folio, - .writepage = ubifs_writepage, + .writepages = ubifs_writepages, .write_begin = ubifs_write_begin, .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, From c35acef383f4a2f2cfc30a5d8d3b0d49a86a1f7f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:46 +0000 Subject: [PATCH 050/496] ubifs: Convert ubifs_writepage to use a folio We still pass the page down to do_writepage(), but ubifs_writepage() itself is now large folio safe. It also contains far fewer hidden calls to compound_head(). 
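The conversion below leans on byte-based folio helpers instead of page-index arithmetic. As a small hedged sketch (the helper names are illustrative, not from the patch), the i_size checks reduce to:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Illustrative helpers: folio_pos()/folio_size() keep these checks correct
 * even when the folio spans more than one page. */
static bool folio_fully_outside_isize(struct folio *folio, loff_t i_size)
{
	return folio_pos(folio) >= i_size;
}

static bool folio_fully_inside_isize(struct folio *folio, loff_t i_size)
{
	return folio_pos(folio) + folio_size(folio) <= i_size;
}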
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2022a31006df..a4e8bec6c03c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1004,21 +1004,18 @@ static int do_writepage(struct page *page, int len) static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); loff_t i_size = i_size_read(inode), synced_i_size; - pgoff_t end_index = i_size >> PAGE_SHIFT; - int err, len = i_size & (PAGE_SIZE - 1); - void *kaddr; + int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, page->index, page->flags); - ubifs_assert(c, PagePrivate(page)); + inode->i_ino, folio->index, folio->flags); + ubifs_assert(c, folio->private != NULL); - /* Is the page fully outside @i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !len)) { + /* Is the folio fully outside @i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { err = 0; goto out_unlock; } @@ -1027,9 +1024,9 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, synced_i_size = ui->synced_i_size; spin_unlock(&ui->ui_lock); - /* Is the page fully inside @i_size? */ - if (page->index < end_index) { - if (page->index >= synced_i_size >> PAGE_SHIFT) { + /* Is the folio fully inside i_size? */ + if (folio_pos(folio) + len <= i_size) { + if (folio_pos(folio) >= synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); if (err) goto out_redirty; @@ -1042,20 +1039,18 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. */ } - return do_writepage(page, PAGE_SIZE); + return do_writepage(&folio->page, len); } /* - * The page straddles @i_size. It must be zeroed out on each and every + * The folio straddles @i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - kaddr = kmap_atomic(page); - memset(kaddr + len, 0, PAGE_SIZE - len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + len = i_size - folio_pos(folio); + folio_zero_segment(folio, len, folio_size(folio)); if (i_size > synced_i_size) { err = inode->i_sb->s_op->write_inode(inode, NULL); @@ -1063,16 +1058,16 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(page, len); + return do_writepage(&folio->page, len); out_redirty: /* - * redirty_page_for_writepage() won't call ubifs_dirty_inode() because + * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so * there is no need to do space budget for dirty inode. 
*/ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); out_unlock: - unlock_page(page); + folio_unlock(folio); return err; } From 783d074167711d8109131b66dd0b70743bfc2918 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:47 +0000 Subject: [PATCH 051/496] ubifs: Use a folio in do_truncation() Convert from the old page APIs to the new folio APIs which saves a few hidden calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a4e8bec6c03c..7039c9006f24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1153,11 +1153,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (offset) { pgoff_t index = new_size >> PAGE_SHIFT; - struct page *page; + struct folio *folio; - page = find_lock_page(inode->i_mapping, index); - if (page) { - if (PageDirty(page)) { + folio = filemap_lock_folio(inode->i_mapping, index); + if (!IS_ERR(folio)) { + if (folio_test_dirty(folio)) { /* * 'ubifs_jnl_truncate()' will try to truncate * the last data node, but it contains @@ -1166,14 +1166,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * 'ubifs_jnl_truncate()' will see an already * truncated (and up to date) data node. */ - ubifs_assert(c, PagePrivate(page)); + ubifs_assert(c, folio->private != NULL); - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); if (UBIFS_BLOCKS_PER_PAGE_SHIFT) - offset = new_size & - (PAGE_SIZE - 1); - err = do_writepage(page, offset); - put_page(page); + offset = offset_in_folio(folio, + new_size); + err = do_writepage(&folio->page, offset); + folio_put(folio); if (err) goto out_budg; /* @@ -1186,8 +1186,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, * to 'ubifs_jnl_truncate()' to save it from * having to read it. */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } } } From 0c2d140c1f73f610c7e3f71888e1a50d770117e0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:48 +0000 Subject: [PATCH 052/496] ubifs: Convert do_writepage() to take a folio Replace the call to SetPageError() with a call to mapping_set_error(). Support large folios by using kmap_local_folio() and remapping each time we cross a page boundary. Saves a lot of hidden calls to compound_head(). 
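The remapping mentioned above follows a general pattern: kmap_local_folio() maps one page of a highmem folio at a time, so the walker must re-map whenever the cursor crosses a page boundary. A reduced, hedged sketch of that pattern follows; the emit() callback and the assumption that len > 0 are illustrative, not taken from the patch.

#include <linux/highmem.h>
#include <linux/mm.h>

static void walk_folio_blocks(struct folio *folio, size_t len, size_t blen,
			      void (*emit)(void *addr, size_t n))
{
	size_t offset = 0;
	void *addr = kmap_local_folio(folio, offset);

	for (;;) {
		size_t n = len < blen ? len : blen;

		emit(addr, n);
		len -= n;
		if (!len)
			break;
		addr += n;
		/* Crossed into the next page of a highmem folio: remap. */
		if (folio_test_highmem(folio) && !offset_in_page(addr)) {
			kunmap_local(addr - n);
			offset += PAGE_SIZE;
			addr = kmap_local_folio(folio, offset);
		}
	}
	kunmap_local(addr);
}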
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 56 ++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 7039c9006f24..70d391b72897 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -898,60 +898,64 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) return 0; } -static int do_writepage(struct page *page, int len) +static int do_writepage(struct folio *folio, size_t len) { - int err = 0, i, blen; + int err = 0, blen; unsigned int block; void *addr; + size_t offset = 0; union ubifs_key key; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; #ifdef UBIFS_DEBUG struct ubifs_inode *ui = ubifs_inode(inode); spin_lock(&ui->ui_lock); - ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT); + ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT); spin_unlock(&ui->ui_lock); #endif - /* Update radix tree tags */ - set_page_writeback(page); + folio_start_writeback(folio); - addr = kmap(page); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; - i = 0; - while (len) { - blen = min_t(int, len, UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, offset); + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + for (;;) { + blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); err = ubifs_jnl_write_data(c, inode, &key, addr, blen); if (err) break; - if (++i >= UBIFS_BLOCKS_PER_PAGE) + len -= blen; + if (!len) break; block += 1; addr += blen; - len -= blen; + if (folio_test_highmem(folio) && !offset_in_page(addr)) { + kunmap_local(addr - blen); + offset += PAGE_SIZE; + addr = kmap_local_folio(folio, offset); + } } + kunmap_local(addr); if (err) { - SetPageError(page); - ubifs_err(c, "cannot write page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); + mapping_set_error(folio->mapping, err); + ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } - ubifs_assert(c, PagePrivate(page)); - if (PageChecked(page)) + ubifs_assert(c, folio->private != NULL); + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); atomic_long_dec(&c->dirty_pg_cnt); - detach_page_private(page); - ClearPageChecked(page); + folio_detach_private(folio); + folio_clear_checked(folio); - kunmap(page); - unlock_page(page); - end_page_writeback(page); + folio_unlock(folio); + folio_end_writeback(folio); return err; } @@ -1039,7 +1043,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, * with this. 
*/ } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); } /* @@ -1058,7 +1062,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc, goto out_redirty; } - return do_writepage(&folio->page, len); + return do_writepage(folio, len); out_redirty: /* * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because @@ -1172,7 +1176,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, if (UBIFS_BLOCKS_PER_PAGE_SHIFT) offset = offset_in_folio(folio, new_size); - err = do_writepage(&folio->page, offset); + err = do_writepage(folio, offset); folio_put(folio); if (err) goto out_budg; From 85ffbf555794c2484b9ab440d3bc23638a0cb315 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:49 +0000 Subject: [PATCH 053/496] ubifs: Convert ubifs_vm_page_mkwrite() to use a folio Replace six implicit calls to compound_head() with one. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- Documentation/mm/page_cache.rst | 10 +++++++++ fs/ubifs/file.c | 36 ++++++++++++++++----------------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Documentation/mm/page_cache.rst b/Documentation/mm/page_cache.rst index 75eba7c431b2..138d61f869df 100644 --- a/Documentation/mm/page_cache.rst +++ b/Documentation/mm/page_cache.rst @@ -3,3 +3,13 @@ ========== Page Cache ========== + +The page cache is the primary way that the user and the rest of the kernel +interact with filesystems. It can be bypassed (e.g. with O_DIRECT), +but normal reads, writes and mmaps go through the page cache. + +Folios +====== + +The folio is the unit of memory management within the page cache. +Operations diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 70d391b72897..3c4a98476c23 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1514,14 +1514,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) */ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; struct timespec64 now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1529,17 +1529,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* -EROFS */ /* - * We have not locked @page so far so we may budget for changing the - * page. Note, we cannot do this after we locked the page, because + * We have not locked @folio so far so we may budget for changing the + * folio. Note, we cannot do this after we locked the folio, because * budgeting may cause write-back which would cause deadlock. * - * At the moment we do not know whether the page is dirty or not, so we - * assume that it is not and budget for a new page. We could look at + * At the moment we do not know whether the folio is dirty or not, so we + * assume that it is not and budget for a new folio. We could look at * the @PG_private flag and figure this out, but we may race with write - * back and the page state may change by the time we lock it, so this + * back and the folio state may change by the time we lock it, so this * would need additional care. 
We do not bother with this at the * moment, although it might be good idea to do. Instead, we allocate - * budget for a new page and amend it later on if the page was in fact + * budget for a new folio and amend it later on if the folio was in fact * dirty. * * The budgeting-related logic of this function is similar to what we @@ -1562,21 +1562,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping || - page_offset(page) > i_size_read(inode))) { - /* Page got truncated out from underneath us */ + folio_lock(folio); + if (unlikely(folio->mapping != inode->i_mapping || + folio_pos(folio) >= i_size_read(inode))) { + /* Folio got truncated out from underneath us */ goto sigbus; } - if (PagePrivate(page)) + if (folio->private) release_new_page_budget(c); else { - if (!PageChecked(page)) + if (!folio_test_checked(folio)) ubifs_convert_page_budget(c); - attach_page_private(page, (void *)1); + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(folio->mapping, folio); } if (update_time) { @@ -1592,11 +1592,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) ubifs_release_dirty_inode_budget(c, ui); } - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; sigbus: - unlock_page(page); + folio_unlock(folio); ubifs_release_budget(c, &req); return VM_FAULT_SIGBUS; } From 2ec718435abb42901c83fbf62fc4d638d4078c27 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:50 +0000 Subject: [PATCH 054/496] ubifs: Convert write_begin_slow() to use a folio Update to new APIs, removing several calls to compound_head() and including support for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 3c4a98476c23..cc4d2ec95b70 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -222,16 +222,16 @@ static int write_begin_slow(struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; struct ubifs_budget_req req = { .new_page = 1 }; int err, appending = !!(pos + len > inode->i_size); - struct page *page; + struct folio *folio; dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", inode->i_ino, pos, len, inode->i_size); /* - * At the slow path we have to budget before locking the page, because - * budgeting may force write-back, which would wait on locked pages and - * deadlock if we had the page locked. At this point we do not know - * anything about the page, so assume that this is a new page which is + * At the slow path we have to budget before locking the folio, because + * budgeting may force write-back, which would wait on locked folios and + * deadlock if we had the folio locked. At this point we do not know + * anything about the folio, so assume that this is a new folio which is * written to a hole. This corresponds to largest budget. Later the * budget will be amended if this is not true. 
*/ @@ -243,42 +243,43 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { ubifs_release_budget(c, &req); - return -ENOMEM; + return PTR_ERR(folio); } - if (!PageUptodate(page)) { - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) - SetPageChecked(page); + if (!folio_test_uptodate(folio)) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) + folio_set_checked(folio); else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ubifs_release_budget(c, &req); return err; } } } - if (PagePrivate(page)) + if (folio->private) /* - * The page is dirty, which means it was budgeted twice: + * The folio is dirty, which means it was budgeted twice: * o first time the budget was allocated by the task which - * made the page dirty and set the PG_private flag; + * made the folio dirty and set the private field; * o and then we budgeted for it for the second time at the * very beginning of this function. * - * So what we have to do is to release the page budget we + * So what we have to do is to release the folio budget we * allocated. */ release_new_page_budget(c); - else if (!PageChecked(page)) + else if (!folio_test_checked(folio)) /* - * We are changing a page which already exists on the media. - * This means that changing the page does not make the amount + * We are changing a folio which already exists on the media. + * This means that changing the folio does not make the amount * of indexing information larger, and this part of the budget * which we have already acquired may be released. */ @@ -301,7 +302,7 @@ static int write_begin_slow(struct address_space *mapping, ubifs_release_dirty_inode_budget(c, ui); } - *pagep = page; + *pagep = &folio->page; return 0; } From f60d356e6c5f966af5ab86701c93eb2028a66b31 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:51 +0000 Subject: [PATCH 055/496] ubifs: Convert ubifs_write_begin() to use a folio Save eight calls to compound_head() by using the new folio API. Remove a few assumptions that would break with large folios. 
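The lookup itself changes from grab_cache_page_write_begin() to __filemap_get_folio() with FGP_WRITEBEGIN, which returns an ERR_PTR() rather than NULL on failure. A minimal hedged sketch of that call shape (the wrapper name is illustrative):

#include <linux/pagemap.h>

/* Returns a locked, referenced folio ready for ->write_begin(), or an
 * ERR_PTR() such as ERR_PTR(-ENOMEM). */
static struct folio *get_write_folio(struct address_space *mapping,
				     pgoff_t index)
{
	return __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
				   mapping_gfp_mask(mapping));
}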
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cc4d2ec95b70..13c078bdf156 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -426,7 +426,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; int err, appending = !!(pos + len > inode->i_size); int skipped_read = 0; - struct page *page; + struct folio *folio; ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -435,13 +435,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = grab_cache_page_write_begin(mapping, index); - if (unlikely(!page)) - return -ENOMEM; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) { + if (pos == folio_pos(folio) && len >= folio_size(folio)) { /* * We change whole page so no need to load it. But we * do not know whether this page exists on the media or @@ -451,19 +452,19 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * media. Thus, we are setting the @PG_checked flag * here. */ - SetPageChecked(page); + folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(page); + err = do_readpage(&folio->page); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } } } - err = allocate_budget(c, page, ui, appending); + err = allocate_budget(c, &folio->page, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* @@ -471,7 +472,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * write all of it, then it is not up to date. */ if (skipped_read) - ClearPageChecked(page); + folio_clear_checked(folio); /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the @@ -483,8 +484,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); mutex_unlock(&ui->ui_mutex); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return write_begin_slow(mapping, pos, len, pagep); } @@ -495,9 +496,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * with @ui->ui_mutex locked if we are appending pages, and unlocked * otherwise. This is an optimization (slightly hacky though). */ - *pagep = page; + *pagep = &folio->page; return 0; - } /** From ffdff813d5b1fa738b5433c5698108c69cd4b71f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:52 +0000 Subject: [PATCH 056/496] ubifs: Convert ubifs_write_end() to use a folio Convert the incoming page pointer to a folio and use it throughout, saving several calls to compound_head(). Also remove some PAGE_SIZE assumptions. 
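A condensed view of the short-copy handling after the conversion (a sketch only; the budgeting and appending logic around it is unchanged and omitted here). The test is now phrased directly in terms of folio state instead of assuming PAGE_SIZE-sized writes, and uptodate-ness is derived from the folio's own size:

  if (unlikely(copied < len && !folio_test_uptodate(folio))) {
          /*
           * Less data was copied than promised, into a folio that was
           * never read from the media, so part of it is garbage: read it
           * now and return 0 so that the VFS repeats the whole operation.
           */
          copied = do_readpage(&folio->page);  /* still page-based here */
  } else if (len == folio_size(folio)) {
          folio_mark_uptodate(folio);
  }

(do_readpage() itself is converted to take a folio in the next patch.)
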
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 13c078bdf156..feab95b59b05 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -530,6 +530,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -537,47 +538,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, int appending = !!(end_pos > inode->i_size); dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", - inode->i_ino, pos, page->index, len, copied, inode->i_size); + inode->i_ino, pos, folio->index, len, copied, inode->i_size); - if (unlikely(copied < len && len == PAGE_SIZE)) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * VFS copied less data to the page that it intended and + * VFS copied less data to the folio than it intended and * declared in its '->write_begin()' call via the @len - * argument. If the page was not up-to-date, and @len was - * @PAGE_SIZE, the 'ubifs_write_begin()' function did + * argument. If the folio was not up-to-date, + * the 'ubifs_write_begin()' function did * not load it from the media (for optimization reasons). This - * means that part of the page contains garbage. So read the - * page now. + * means that part of the folio contains garbage. So read the + * folio now. */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, page, ui, appending); - ClearPageChecked(page); + cancel_budget(c, &folio->page, ui, appending); + folio_clear_checked(folio); /* * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(page); + copied = do_readpage(&folio->page); goto out; } - if (len == PAGE_SIZE) - SetPageUptodate(page); + if (len == folio_size(folio)) + folio_mark_uptodate(folio); - if (!PagePrivate(page)) { - attach_page_private(page, (void *)1); + if (!folio->private) { + folio_attach_private(folio, (void *)1); atomic_long_inc(&c->dirty_pg_cnt); - __set_page_dirty_nobuffers(page); + filemap_dirty_folio(mapping, folio); } if (appending) { i_size_write(inode, end_pos); ui->ui_size = end_pos; /* - * Note, we do not set @I_DIRTY_PAGES (which means that the - * inode has dirty pages), this has been done in - * '__set_page_dirty_nobuffers()'. + * We do not set @I_DIRTY_PAGES (which means that + * the inode has dirty pages), this was done in + * filemap_dirty_folio(). */ __mark_inode_dirty(inode, I_DIRTY_DATASYNC); ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); @@ -585,8 +586,8 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, } out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } From b96af1fdb47cf85eaddc3c82413b580b75aae2d2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:53 +0000 Subject: [PATCH 057/496] ubifs: Convert do_readpage() to take a folio All the callers now have a folio, so pass it in, and convert do_readpage() to us folios directly. 
Includes unifying the exit paths from this function and using kmap_local instead of plain kmap. This function should now work with large folios, but this is not tested. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 64 +++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index feab95b59b05..654af636b11d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -96,36 +96,36 @@ dump: return -EINVAL; } -static int do_readpage(struct page *page) +static int do_readpage(struct folio *folio) { void *addr; int err = 0, i; unsigned int block, beyond; - struct ubifs_data_node *dn; - struct inode *inode = page->mapping->host; + struct ubifs_data_node *dn = NULL; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); - ubifs_assert(c, !PageChecked(page)); - ubifs_assert(c, !PagePrivate(page)); + inode->i_ino, folio->index, i_size, folio->flags); + ubifs_assert(c, !folio_test_checked(folio)); + ubifs_assert(c, !folio->private); - addr = kmap(page); + addr = kmap_local_folio(folio, 0); - block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ - SetPageChecked(page); - memset(addr, 0, PAGE_SIZE); + folio_set_checked(folio); + addr = folio_zero_tail(folio, 0, addr); goto out; } dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); if (!dn) { err = -ENOMEM; - goto error; + goto out; } i = 0; @@ -150,39 +150,35 @@ static int do_readpage(struct page *page) memset(addr + ilen, 0, dlen - ilen); } } - if (++i >= UBIFS_BLOCKS_PER_PAGE) + if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; addr += UBIFS_BLOCK_SIZE; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } + if (err) { struct ubifs_info *c = inode->i_sb->s_fs_info; if (err == -ENOENT) { /* Not found, so it must be a hole */ - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); - goto out_free; + err = 0; + } else { + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + folio->index, inode->i_ino, err); } - ubifs_err(c, "cannot read page %lu of inode %lu, error %d", - page->index, inode->i_ino, err); - goto error; } -out_free: - kfree(dn); out: - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); - return 0; - -error: kfree(dn); - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + if (!err) + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); return err; } @@ -254,7 +250,7 @@ static int write_begin_slow(struct address_space *mapping, if (pos == folio_pos(folio) && len >= folio_size(folio)) folio_set_checked(folio); else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); @@ -455,7 +451,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, folio_set_checked(folio); skipped_read = 1; } else { - err = do_readpage(&folio->page); + err = do_readpage(folio); if (err) { folio_unlock(folio); folio_put(folio); 
@@ -559,7 +555,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, * Return 0 to force VFS to repeat the whole operation, or the * error code if 'do_readpage()' fails. */ - copied = do_readpage(&folio->page); + copied = do_readpage(folio); goto out; } @@ -895,7 +891,7 @@ static int ubifs_read_folio(struct file *file, struct folio *folio) if (ubifs_bulk_read(page)) return 0; - do_readpage(page); + do_readpage(folio); folio_unlock(folio); return 0; } From a3c2f196cdfc21404e9678204f55ad7be8e7226e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:54 +0000 Subject: [PATCH 058/496] ubifs: Convert allocate_budget() to work on a folio The one caller has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 654af636b11d..5542d73db717 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -305,7 +305,7 @@ static int write_begin_slow(struct address_space *mapping, /** * allocate_budget - allocate budget for 'ubifs_write_begin()'. * @c: UBIFS file-system description object - * @page: page to allocate budget for + * @folio: folio to allocate budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * @@ -316,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping, * * Returns: %0 in case of success and %-ENOSPC in case of failure. */ -static int allocate_budget(struct ubifs_info *c, struct page *page, +static int allocate_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { struct ubifs_budget_req req = { .fast = 1 }; - if (PagePrivate(page)) { + if (folio->private) { if (!appending) /* - * The page is dirty and we are not appending, which + * The folio is dirty and we are not appending, which * means no budget is needed at all. */ return 0; @@ -348,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page, */ req.dirtied_ino = 1; } else { - if (PageChecked(page)) + if (folio_test_checked(folio)) /* * The page corresponds to a hole and does not * exist on the media. So changing it makes - * make the amount of indexing information + * the amount of indexing information * larger, and we have to budget for a new * page. */ @@ -460,7 +460,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, } } - err = allocate_budget(c, &folio->page, ui, appending); + err = allocate_budget(c, folio, ui, appending); if (unlikely(err)) { ubifs_assert(c, err == -ENOSPC); /* From 45d76698d119fffcd4d39d1cfcbfe30a87a7df77 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:55 +0000 Subject: [PATCH 059/496] ubifs: Convert cancel_budget() to take a folio The one caller already has a folio, so pass it in instead of the page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5542d73db717..9cd2e6d37c5d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -499,14 +499,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, /** * cancel_budget - cancel budget. 
* @c: UBIFS file-system description object - * @page: page to cancel budget for + * @folio: folio to cancel budget for * @ui: UBIFS inode object the page belongs to * @appending: non-zero if the page is appended * * This is a helper function for a page write operation. It unlocks the * @ui->ui_mutex in case of appending. */ -static void cancel_budget(struct ubifs_info *c, struct page *page, +static void cancel_budget(struct ubifs_info *c, struct folio *folio, struct ubifs_inode *ui, int appending) { if (appending) { @@ -514,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page, ubifs_release_dirty_inode_budget(c, ui); mutex_unlock(&ui->ui_mutex); } - if (!PagePrivate(page)) { - if (PageChecked(page)) + if (!folio->private) { + if (folio_test_checked(folio)) release_new_page_budget(c); else release_existing_page_budget(c); @@ -548,7 +548,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping, */ dbg_gen("copied %d instead of %d, read page and repeat", copied, len); - cancel_budget(c, &folio->page, ui, appending); + cancel_budget(c, folio, ui, appending); folio_clear_checked(folio); /* From 7f348f8ce51c963d99a66d00ab73717f35dd27e1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:56 +0000 Subject: [PATCH 060/496] ubifs: Pass a folio into ubifs_bulk_read() and ubifs_do_bulk_read() This saves a single call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 9cd2e6d37c5d..f481fca7f53d 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -703,15 +703,15 @@ out_err: * ubifs_do_bulk_read - do bulk-read. * @c: UBIFS file-system description object * @bu: bulk-read information - * @page1: first page to read + * @folio1: first folio to read * * Returns: %1 if the bulk-read is done, otherwise %0 is returned. */ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, - struct page *page1) + struct folio *folio1) { - pgoff_t offset = page1->index, end_index; - struct address_space *mapping = page1->mapping; + pgoff_t offset = folio1->index, end_index; + struct address_space *mapping = folio1->mapping; struct inode *inode = mapping->host; struct ubifs_inode *ui = ubifs_inode(inode); int err, page_idx, page_cnt, ret = 0, n = 0; @@ -761,11 +761,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, page1, bu, &n); + err = populate_page(c, &folio1->page, bu, &n); if (err) goto out_warn; - unlock_page(page1); + folio_unlock(folio1); ret = 1; isize = i_size_read(inode); @@ -810,7 +810,7 @@ out_bu_off: /** * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. - * @page: page from which to start bulk-read. + * @folio: folio from which to start bulk-read. * * Some flash media are capable of reading sequentially at faster rates. UBIFS * bulk-read facility is designed to take advantage of that, by reading in one @@ -819,12 +819,12 @@ out_bu_off: * * Returns: %1 if a bulk-read is done and %0 otherwise. 
*/ -static int ubifs_bulk_read(struct page *page) +static int ubifs_bulk_read(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_inode *ui = ubifs_inode(inode); - pgoff_t index = page->index, last_page_read = ui->last_page_read; + pgoff_t index = folio->index, last_page_read = ui->last_page_read; struct bu_info *bu; int err = 0, allocated = 0; @@ -872,8 +872,8 @@ static int ubifs_bulk_read(struct page *page) bu->buf_len = c->max_bu_buf_len; data_key_init(c, &bu->key, inode->i_ino, - page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); - err = ubifs_do_bulk_read(c, bu, page); + folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, folio); if (!allocated) mutex_unlock(&c->bu_mutex); @@ -887,9 +887,7 @@ out_unlock: static int ubifs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - - if (ubifs_bulk_read(page)) + if (ubifs_bulk_read(folio)) return 0; do_readpage(folio); folio_unlock(folio); From d06192731c335cb7b62ad0dd211de26e03d56f3f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:57 +0000 Subject: [PATCH 061/496] ubifs: Use a folio in ubifs_do_bulk_read() When looking in the page cache, retrieve a folio instead of a page. This would need some work to make it safe for large folios. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index f481fca7f53d..11bf9f8cca6c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -775,19 +775,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, for (page_idx = 1; page_idx < page_cnt; page_idx++) { pgoff_t page_offset = offset + page_idx; - struct page *page; + struct folio *folio; if (page_offset > end_index) break; - page = pagecache_get_page(mapping, page_offset, + folio = __filemap_get_folio(mapping, page_offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT, ra_gfp_mask); - if (!page) + if (IS_ERR(folio)) break; - if (!PageUptodate(page)) - err = populate_page(c, page, bu, &n); - unlock_page(page); - put_page(page); + if (!folio_test_uptodate(folio)) + err = populate_page(c, &folio->page, bu, &n); + folio_unlock(folio); + folio_put(folio); if (err) break; } From a16bfab367c60f1a66be03676a374451dc022b37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Jan 2024 17:52:58 +0000 Subject: [PATCH 062/496] ubifs: Convert populate_page() to take a folio Both callers now have a folio, so pass it in. This function contains several assumptions that folios are not large. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/file.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 11bf9f8cca6c..a1f46919934c 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -590,35 +590,35 @@ out: /** * populate_page - copy data nodes into a page for bulk-read. * @c: UBIFS file-system description object - * @page: page + * @folio: folio * @bu: bulk-read information * @n: next zbranch slot * * Returns: %0 on success and a negative error code on failure. 
*/ -static int populate_page(struct ubifs_info *c, struct page *page, +static int populate_page(struct ubifs_info *c, struct folio *folio, struct bu_info *bu, int *n) { int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; void *addr, *zaddr; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, page->index, i_size, page->flags); + inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap(page); + addr = zaddr = kmap_local_folio(folio, 0); end_index = (i_size - 1) >> PAGE_SHIFT; - if (!i_size || page->index > end_index) { + if (!i_size || folio->index > end_index) { hole = 1; - memset(addr, 0, PAGE_SIZE); + addr = folio_zero_tail(folio, 0, addr); goto out_hole; } - page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; while (1) { int err, len, out_len, dlen; @@ -667,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, break; addr += UBIFS_BLOCK_SIZE; page_block += 1; + if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { + kunmap_local(addr - UBIFS_BLOCK_SIZE); + addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); + } } - if (end_index == page->index) { + if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) @@ -678,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page, out_hole: if (hole) { - SetPageChecked(page); + folio_set_checked(folio); dbg_gen("hole"); } - SetPageUptodate(page); - ClearPageError(page); - flush_dcache_page(page); - kunmap(page); + folio_mark_uptodate(folio); + flush_dcache_folio(folio); + kunmap_local(addr); *n = nn; return 0; out_err: - ClearPageUptodate(page); - SetPageError(page); - flush_dcache_page(page); - kunmap(page); + flush_dcache_folio(folio); + kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -761,7 +762,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, goto out_warn; } - err = populate_page(c, &folio1->page, bu, &n); + err = populate_page(c, folio1, bu, &n); if (err) goto out_warn; @@ -785,7 +786,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, if (IS_ERR(folio)) break; if (!folio_test_uptodate(folio)) - err = populate_page(c, &folio->page, bu, &n); + err = populate_page(c, folio, bu, &n); folio_unlock(folio); folio_put(folio); if (err) From eb54235315f4577b035c89a10ca3eb48caab0445 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 24 Jan 2024 10:22:46 +0100 Subject: [PATCH 063/496] MAINTAINERS: Add Zhihao Cheng as UBI/UBIFS reviewer Recognizing Zhihao Cheng's valuable contributions, let's officially appoint him as a UBI/UBIFS reviewer. His demonstrated expertise and assistance make him a valuable addition to the MTD community. 
Cc: Zhihao Cheng Acked-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9ed4d3868539..94ff0f30130b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22471,6 +22471,7 @@ F: include/uapi/misc/uacce/ UBI FILE SYSTEM (UBIFS) M: Richard Weinberger +R: Zhihao Cheng L: linux-mtd@lists.infradead.org S: Supported W: http://www.linux-mtd.infradead.org/doc/ubifs.html @@ -22599,6 +22600,7 @@ F: drivers/ufs/host/ufs-renesas.c UNSORTED BLOCK IMAGES (UBI) M: Richard Weinberger +R: Zhihao Cheng L: linux-mtd@lists.infradead.org S: Supported W: http://www.linux-mtd.infradead.org/ From 68a24aba7c593eafa8fd00f2f76407b9b32b47a9 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 24 Jan 2024 07:37:02 +0100 Subject: [PATCH 064/496] ubi: Check for too small LEB size in VTBL code If the LEB size is smaller than a volume table record we cannot have volumes. In this case abort attaching. Cc: Chenyuan Yang Cc: stable@vger.kernel.org Fixes: 801c135ce73d ("UBI: Unsorted Block Images") Reported-by: Chenyuan Yang Closes: https://lore.kernel.org/linux-mtd/1433EB7A-FC89-47D6-8F47-23BE41B263B3@illinois.edu/ Signed-off-by: Richard Weinberger Reviewed-by: Zhihao Cheng --- drivers/mtd/ubi/vtbl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c index f700f0e4f2ec..6e5489e233dd 100644 --- a/drivers/mtd/ubi/vtbl.c +++ b/drivers/mtd/ubi/vtbl.c @@ -791,6 +791,12 @@ int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai) * The number of supported volumes is limited by the eraseblock size * and by the UBI_MAX_VOLUMES constant. */ + + if (ubi->leb_size < UBI_VTBL_RECORD_SIZE) { + ubi_err(ubi, "LEB size too small for a volume record"); + return -EINVAL; + } + ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE; if (ubi->vtbl_slots > UBI_MAX_VOLUMES) ubi->vtbl_slots = UBI_MAX_VOLUMES; From 60f16e912a53ad78306ca7cf0c95913293042411 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:07 +0100 Subject: [PATCH 065/496] ubifs: fix sort function prototype The global sort() function expects a callback pointer to a function with two void* arguments, but ubifs has a function with specific object types, which causes a warning in clang-16 and higher: fs/ubifs/lprops.c:1272:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct ubifs_lp_stats *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 1272 | (ubifs_lpt_scan_callback)scan_check_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change the prototype to the regular one and cast the object pointers locally instead. 
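The rule being enforced here is worth stating once in general form (a generic sketch with a made-up struct, not code from this patch): a comparator passed to sort() must have exactly the int (*)(const void *, const void *) type, because calling a function through an incompatible pointer type is undefined behaviour, which is what -Wcast-function-type-strict (and kCFI at run time) objects to. The element pointers are cast back inside the comparator instead:

  struct item {                  /* hypothetical element type */
          int weight;
  };

  static int cmp_item(const void *a, const void *b)
  {
          const struct item *ia = *(const struct item **)a;
          const struct item *ib = *(const struct item **)b;

          return ia->weight - ib->weight;
  }

  /* used as: sort(arr, cnt, sizeof(struct item *), cmp_item, NULL); */
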
Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 873e6e1c92b5..1cb79b167a4f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -726,11 +726,10 @@ out: return err; } -static int cmp_dirty_idx(const struct ubifs_lprops **a, - const struct ubifs_lprops **b) +static int cmp_dirty_idx(const void *a, const void *b) { - const struct ubifs_lprops *lpa = *a; - const struct ubifs_lprops *lpb = *b; + const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a; + const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b; return lpa->dirty + lpa->free - lpb->dirty - lpb->free; } @@ -754,7 +753,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) sizeof(void *) * c->dirty_idx.cnt); /* Sort it so that the dirtiest is now at the end */ sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *), - (int (*)(const void *, const void *))cmp_dirty_idx, NULL); + cmp_dirty_idx, NULL); dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt); if (c->dirty_idx.cnt) dbg_find("dirtiest index LEB is %d with dirty %d and free %d", From ec724e534dfdd592abc5ac066be77ef15c455ccc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Feb 2024 10:54:08 +0100 Subject: [PATCH 066/496] ubifs: fix function pointer cast warnings ubifs has a number of callback functions for ubifs_lpt_scan_nolock() using two different prototypes, either passing a struct scan_data or a struct ubifs_lp_stats, but the caller expects a void pointer instead. clang-16 now warns about this: fs/ubifs/find.c:170:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 170 | (ubifs_lpt_scan_callback)scan_for_dirty_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ fs/ubifs/find.c:449:9: error: cast from 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, struct scan_data *)' to 'ubifs_lpt_scan_callback' (aka 'int (*)(struct ubifs_info *, const struct ubifs_lprops *, int, void *)') converts to incompatible function type [-Werror,-Wcast-function-type-strict] 449 | (ubifs_lpt_scan_callback)scan_for_free_cb, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Change all of these callback functions to actually take the void * argument that is passed by their caller. 
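The companion pattern for the LPT scan callbacks, again as a sketch rather than code from the patch: each callback keeps the void * parameter that ubifs_lpt_scan_nolock() actually passes and recovers the concrete type in its first statement (the body shown is illustrative only):

  static int scan_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops,
                     int in_tree, void *arg)
  {
          struct scan_data *data = arg;   /* recover the real type locally */

          data->lnum = lprops->lnum;
          return LPT_SCAN_CONTINUE;
  }

  /* caller: ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_cb, &data); */
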
Signed-off-by: Arnd Bergmann Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/find.c | 23 ++++++++++++----------- fs/ubifs/lprops.c | 6 +++--- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1cb79b167a4f..6ebf3c04ac5f 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) */ static int scan_for_dirty_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; data.exclude_index = exclude_index; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_dirty_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb, &data); if (err) return ERR_PTR(err); @@ -349,8 +349,9 @@ out: */ static int scan_for_free_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, data.pick_free = pick_free; data.lnum = -1; err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_free_cb, + scan_for_free_cb, &data); if (err) return ERR_PTR(err); @@ -589,8 +590,9 @@ out: */ static int scan_for_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c) int err; data.lnum = -1; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_for_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb, &data); if (err) return ERR_PTR(err); @@ -781,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c) */ static int scan_dirty_idx_cb(struct ubifs_info *c, const struct ubifs_lprops *lprops, int in_tree, - struct scan_data *data) + void *arg) { + struct scan_data *data = arg; int ret = LPT_SCAN_CONTINUE; /* Exclude LEBs that are currently in use */ @@ -841,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c) if (c->pnodes_have >= c->pnode_cnt) /* All pnodes are in memory, so skip scan */ return -ENOSPC; - err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, - (ubifs_lpt_scan_callback)scan_dirty_idx_cb, + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb, &data); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 6d6cd85c2b4c..a11c3dab7e16 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1014,8 +1014,9 @@ out: */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct ubifs_lp_stats *lst) + void *arg) { + struct ubifs_lp_stats *lst = arg; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; @@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c) memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 
1, - (ubifs_lpt_scan_callback)scan_check_cb, - &lst); + scan_check_cb, &lst); if (err && err != -ENOSPC) goto out; From 788cd161f99638db81b4c9ce836dbdf9a65c7701 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 11 Jan 2024 14:36:56 +0800 Subject: [PATCH 067/496] ubifs: Remove unreachable code in dbg_check_ltab_lnum Because there is no break statement in the dead loop above,it is impossible to execute the 'err=0' statement.Delete the code that will never execute. Fixes: 6fb324a4b0c3 ("UBIFS: allocate ltab checking buffer on demand") Signed-off-by: Kunwu Chan Cc: Kunwu Chan Suggested-by: Richard Weinberger Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- fs/ubifs/lpt_commit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index c4d079328b92..07351fdce722 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) len -= node_len; } - err = 0; out: vfree(buf); return err; From 7f174ae4f39e8475adcc09d26c5a43394689ad6c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 20 Feb 2024 10:49:03 +0800 Subject: [PATCH 068/496] ubi: correct the calculation of fastmap size Now that the calculation of fastmap size in ubi_calc_fm_size() is incorrect since it miss each user volume's ubi_fm_eba structure and the Internal UBI volume info. Let's correct the calculation. Cc: stable@vger.kernel.org Signed-off-by: Zhang Yi Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/fastmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c index 2a728c31e6b8..9a4940874be5 100644 --- a/drivers/mtd/ubi/fastmap.c +++ b/drivers/mtd/ubi/fastmap.c @@ -85,9 +85,10 @@ size_t ubi_calc_fm_size(struct ubi_device *ubi) sizeof(struct ubi_fm_scan_pool) + sizeof(struct ubi_fm_scan_pool) + (ubi->peb_count * sizeof(struct ubi_fm_ec)) + - (sizeof(struct ubi_fm_eba) + - (ubi->peb_count * sizeof(__be32))) + - sizeof(struct ubi_fm_volhdr) * UBI_MAX_VOLUMES; + ((sizeof(struct ubi_fm_eba) + + sizeof(struct ubi_fm_volhdr)) * + (UBI_MAX_VOLUMES + UBI_INT_VOL_COUNT)) + + (ubi->peb_count * sizeof(__be32)); return roundup(size, ubi->leb_size); } From fbed4baed046a2815889810c396e333820b164b6 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Sat, 13 Jan 2024 21:06:00 +0800 Subject: [PATCH 069/496] ubi: fix slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 When using the ioctl interface to resize a UBI volume, `ubi_resize_volume` resizes the EBA table first but does not change `vol->reserved_pebs` in the same atomic context, which may cause concurrent access to the EBA table. For example, when a user shrinks UBI volume A by calling `ubi_resize_volume`, while another thread is writing to volume B and triggering wear-leveling, which may call `ubi_write_fastmap`, under these circumstances, KASAN may report a slab-out-of-bounds error in `ubi_eba_get_ldesc+0xfb/0x130`. This patch fixes race conditions in `ubi_resize_volume` and `ubi_update_fastmap` to avoid out-of-bounds reads of `eba_tbl`. First, it ensures that updates to `eba_tbl` and `reserved_pebs` are protected by `vol->volumes_lock`. Second, it implements a rollback mechanism in case of resize failure. It is also worth mentioning that for volume shrinkage failures, since part of the volume has already been shrunk and unmapped, there is no need to recover `{rsvd/avail}_pebs`. 
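Expressed as a short sketch (simplified from the growth path of this patch, with the PEB accounting left out): fastmap writing walks the EBA table with vol->reserved_pebs as its bound, so the table pointer and that bound must be switched inside the same volumes_lock critical section:

  spin_lock(&ubi->volumes_lock);
  ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs);
  old_eba_tbl = vol->eba_tbl;
  vol->eba_tbl = new_eba_tbl;            /* swap the table ...         */
  vol->reserved_pebs = reserved_pebs;    /* ... and its bound together */
  spin_unlock(&ubi->volumes_lock);

The KASAN report below shows the out-of-bounds read this prevents.
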
================================================================== BUG: KASAN: slab-out-of-bounds in ubi_eba_get_ldesc+0xfb/0x130 [ubi] Read of size 4 at addr ffff88800f43f570 by task kworker/u16:0/7 CPU: 0 PID: 7 Comm: kworker/u16:0 Not tainted 5.16.0-rc7 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call Trace: dump_stack_lvl+0x4d/0x66 print_address_description.constprop.0+0x41/0x60 kasan_report.cold+0x83/0xdf ubi_eba_get_ldesc+0xfb/0x130 [ubi] ubi_update_fastmap.cold+0x60f/0xc7d [ubi] ubi_wl_get_peb+0x25b/0x4f0 [ubi] try_write_vid_and_data+0x9a/0x4d0 [ubi] ubi_eba_write_leb+0x7e4/0x17d0 [ubi] ubi_leb_map+0x1a0/0x2c0 [ubi] ubifs_leb_map+0x139/0x270 [ubifs] ubifs_add_bud_to_log+0xb40/0xf30 [ubifs] make_reservation+0x86e/0xb00 [ubifs] ubifs_jnl_write_data+0x430/0x9d0 [ubifs] do_writepage+0x1d1/0x550 [ubifs] ubifs_writepage+0x37c/0x670 [ubifs] __writepage+0x67/0x170 write_cache_pages+0x259/0xa90 do_writepages+0x277/0x5d0 __writeback_single_inode+0xb8/0x850 writeback_sb_inodes+0x4b3/0xb20 __writeback_inodes_wb+0xc1/0x220 wb_writeback+0x59f/0x740 wb_workfn+0x6d0/0xca0 process_one_work+0x711/0xfc0 worker_thread+0x95/0xd00 kthread+0x3a6/0x490 ret_from_fork+0x1f/0x30 Allocated by task 711: kasan_save_stack+0x1e/0x50 __kasan_kmalloc+0x81/0xa0 ubi_eba_create_table+0x88/0x1a0 [ubi] ubi_resize_volume.cold+0x175/0xae7 [ubi] ubi_cdev_ioctl+0x57f/0x1a60 [ubi] __x64_sys_ioctl+0x13a/0x1c0 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Last potentially related work creation: kasan_save_stack+0x1e/0x50 __kasan_record_aux_stack+0xb7/0xc0 call_rcu+0xd6/0x1000 blk_stat_free_callback+0x28/0x30 blk_release_queue+0x8a/0x2e0 kobject_put+0x186/0x4c0 scsi_device_dev_release_usercontext+0x620/0xbd0 execute_in_process_context+0x2f/0x120 device_release+0xa4/0x240 kobject_put+0x186/0x4c0 put_device+0x20/0x30 __scsi_remove_device+0x1c3/0x300 scsi_probe_and_add_lun+0x2140/0x2eb0 __scsi_scan_target+0x1f2/0xbb0 scsi_scan_channel+0x11b/0x1a0 scsi_scan_host_selected+0x24c/0x310 do_scsi_scan_host+0x1e0/0x250 do_scan_async+0x45/0x490 async_run_entry_fn+0xa2/0x530 process_one_work+0x711/0xfc0 worker_thread+0x95/0xd00 kthread+0x3a6/0x490 ret_from_fork+0x1f/0x30 The buggy address belongs to the object at ffff88800f43f500 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 112 bytes inside of 128-byte region [ffff88800f43f500, ffff88800f43f580) The buggy address belongs to the page: page:ffffea00003d0f00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xf43c head:ffffea00003d0f00 order:2 compound_mapcount:0 compound_pincount:0 flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) raw: 001fffff80010200 ffffea000046ba08 ffffea0000457208 ffff88810004d1c0 raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88800f43f400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88800f43f480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc > ffff88800f43f500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc ^ ffff88800f43f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88800f43f600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc The following steps can used to reproduce: Process 1: write and trigger ubi wear-leveling ubimkvol /dev/ubi0 -s 5000MiB -N v1 ubimkvol /dev/ubi0 -s 2000MiB -N v2 ubimkvol /dev/ubi0 -s 10MiB -N v3 mount -t ubifs /dev/ubi0_0 /mnt/ubifs 
while true; do filename=/mnt/ubifs/$((RANDOM)) dd if=/dev/random of=${filename} bs=1M count=$((RANDOM % 1000)) rm -rf ${filename} sync /mnt/ubifs/ done Process 2: do random resize struct ubi_rsvol_req req; req.vol_id = 1; req.bytes = (rand() % 50) * 512KB; ioctl(fd, UBI_IOCRSVOL, &req); V3: - Fix the commit message error. V2: - Add volumes_lock in ubi_eba_copy_leb() to avoid race caused by updating eba_tbl. V1: - Rebase the patch on the latest mainline. Signed-off-by: Guo Xuenan Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/eba.c | 7 +++++++ drivers/mtd/ubi/vmt.c | 24 +++++++++++++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c index 8d1f0e05892c..e5ac3cd0bbae 100644 --- a/drivers/mtd/ubi/eba.c +++ b/drivers/mtd/ubi/eba.c @@ -1456,7 +1456,14 @@ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, } ubi_assert(vol->eba_tbl->entries[lnum].pnum == from); + + /** + * The volumes_lock lock is needed here to prevent the expired old eba_tbl + * being updated when the eba_tbl is copied in the ubi_resize_volume() process. + */ + spin_lock(&ubi->volumes_lock); vol->eba_tbl->entries[lnum].pnum = to; + spin_unlock(&ubi->volumes_lock); out_unlock_buf: mutex_unlock(&ubi->buf_mutex); diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 2c867d16f89f..97294def01eb 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -408,6 +408,7 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) struct ubi_device *ubi = vol->ubi; struct ubi_vtbl_record vtbl_rec; struct ubi_eba_table *new_eba_tbl = NULL; + struct ubi_eba_table *old_eba_tbl = NULL; int vol_id = vol->vol_id; if (ubi->ro_mode) @@ -453,10 +454,13 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) err = -ENOSPC; goto out_free; } + ubi->avail_pebs -= pebs; ubi->rsvd_pebs += pebs; ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -471,7 +475,9 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) ubi->avail_pebs -= pebs; ubi_update_reserved(ubi); ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs); - ubi_eba_replace_table(vol, new_eba_tbl); + old_eba_tbl = vol->eba_tbl; + vol->eba_tbl = new_eba_tbl; + vol->reserved_pebs = reserved_pebs; spin_unlock(&ubi->volumes_lock); } @@ -493,7 +499,6 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) if (err) goto out_acc; - vol->reserved_pebs = reserved_pebs; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { vol->used_ebs = reserved_pebs; vol->last_eb_bytes = vol->usable_leb_size; @@ -501,19 +506,24 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) (long long)vol->used_ebs * vol->usable_leb_size; } + /* destroy old table */ + ubi_eba_destroy_table(old_eba_tbl); ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED); self_check_volumes(ubi); return err; out_acc: + spin_lock(&ubi->volumes_lock); + vol->reserved_pebs = reserved_pebs - pebs; if (pebs > 0) { - spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= pebs; ubi->avail_pebs += pebs; - spin_unlock(&ubi->volumes_lock); + ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); + } else { + ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); } - return err; - + vol->eba_tbl = old_eba_tbl; + spin_unlock(&ubi->volumes_lock); 
out_free: ubi_eba_destroy_table(new_eba_tbl); return err; From 9277b3a64953c09e88a33adf59fb085e0a87d357 Mon Sep 17 00:00:00 2001 From: ZhaoLong Wang Date: Sat, 13 Jan 2024 21:06:01 +0800 Subject: [PATCH 070/496] ubi: Correct the number of PEBs after a volume resize failure In the error handling path `out_acc` of `ubi_resize_volume()`, when `pebs < 0`, it indicates that the volume table record failed to update when the volume was shrunk. In this case, the number of `ubi->avail_pebs` and `ubi->rsvd_pebs` should be restored to their previous values to prevent the UBI layer from reporting an incorrect number of available PEBs. Signed-off-by: ZhaoLong Wang Reviewed-by: Zhihao Cheng Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/vmt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 97294def01eb..990571287e84 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -515,13 +515,12 @@ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) out_acc: spin_lock(&ubi->volumes_lock); vol->reserved_pebs = reserved_pebs - pebs; - if (pebs > 0) { - ubi->rsvd_pebs -= pebs; - ubi->avail_pebs += pebs; + ubi->rsvd_pebs -= pebs; + ubi->avail_pebs += pebs; + if (pebs > 0) ubi_eba_copy_table(vol, old_eba_tbl, vol->reserved_pebs); - } else { + else ubi_eba_copy_table(vol, old_eba_tbl, reserved_pebs); - } vol->eba_tbl = old_eba_tbl; spin_unlock(&ubi->volumes_lock); out_free: From 31a9d5f3290c5483d41c6b5c8ebc0add130da770 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:04 +0800 Subject: [PATCH 071/496] ubifs: dbg_check_idx_size: Fix kmemleak if loading znode failed If function dbg_check_idx_size() failed by loading znode in mounting process, there are two problems: 1. Allocated znodes won't be freed, which causes kmemleak in kernel: ubifs_mount dbg_check_idx_size dbg_walk_index c->zroot.znode = ubifs_load_znode child = ubifs_load_znode // failed // Loaded znodes won't be freed in error handling path. 2. Global variable ubifs_clean_zn_cnt is not decreased, because ubifs_tnc_close() is not invoked in error handling path, which triggers a warning in ubifs_exit(): WARNING: CPU: 1 PID: 1576 at fs/ubifs/super.c:2486 ubifs_exit Modules linked in: zstd ubifs(-) ubi nandsim CPU: 1 PID: 1576 Comm: rmmod Not tainted 6.7.0-rc6 Call Trace: ubifs_exit+0xca/0xc70 [ubifs] __do_sys_delete_module+0x29a/0x4a0 do_syscall_64+0x6f/0x140 Fix it by adding error handling path in dbg_check_idx_size() to release tnc tree. 
Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Suggested-by: Richard Weinberger Signed-off-by: Richard Weinberger --- fs/ubifs/debug.c | 9 +++++++-- fs/ubifs/tnc.c | 9 +-------- fs/ubifs/tnc_misc.c | 22 ++++++++++++++++++++++ fs/ubifs/ubifs.h | 1 + 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index d013c5b3f1ed..ac77ac1fd73e 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) err = dbg_walk_index(c, NULL, add_size, &calc); if (err) { ubifs_err(c, "error %d while walking the index", err); - return err; + goto out_err; } if (calc != idx_size) { ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", calc, idx_size); dump_stack(); - return -EINVAL; + err = -EINVAL; + goto out_err; } return 0; + +out_err: + ubifs_destroy_tnc_tree(c); + return err; } /** diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f4728e65d1bd..45cacdcd4746 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c) void ubifs_tnc_close(struct ubifs_info *c) { tnc_destroy_cnext(c); - if (c->zroot.znode) { - long n, freed; - - n = atomic_long_read(&c->clean_zn_cnt); - freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); - ubifs_assert(c, freed == n); - atomic_long_sub(n, &ubifs_clean_zn_cnt); - } + ubifs_destroy_tnc_tree(c); kfree(c->gap_lebs); kfree(c->ilebs); destroy_old_idx(c); diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index 4d686e34e64d..d3f8a6aa1f49 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -250,6 +250,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, } } +/** + * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree. + * @c: UBIFS file-system description object + * + * This function destroys the whole TNC tree and updates clean global znode + * count. + */ +void ubifs_destroy_tnc_tree(struct ubifs_info *c) +{ + long n, freed; + + if (!c->zroot.znode) + return; + + n = atomic_long_read(&c->clean_zn_cnt); + freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode); + ubifs_assert(c, freed == n); + atomic_long_sub(n, &ubifs_clean_zn_cnt); + + c->zroot.znode = NULL; +} + /** * read_znode - read an indexing node from flash and fill znode. * @c: UBIFS file-system description object diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3916dc4f30ca..6eba287ae66c 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1903,6 +1903,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c, struct ubifs_znode *znode); long ubifs_destroy_tnc_subtree(const struct ubifs_info *c, struct ubifs_znode *zr); +void ubifs_destroy_tnc_tree(struct ubifs_info *c); struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr, struct ubifs_znode *parent, int iip); From 6379b44cdcd67f5f5d986b73953e99700591edfa Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 8 Jan 2024 10:41:05 +0800 Subject: [PATCH 072/496] ubifs: ubifs_symlink: Fix memleak of inode->i_link in error path For error handling path in ubifs_symlink(), inode will be marked as bad first, then iput() is invoked. If inode->i_link is initialized by fscrypt_encrypt_symlink() in encryption scenario, inode->i_link won't be freed by callchain ubifs_free_inode -> fscrypt_free_inode in error handling path, because make_bad_inode() has changed 'inode->i_mode' as 'S_IFREG'. 
Following kmemleak is easy to be reproduced by injecting error in ubifs_jnl_update() when doing symlink in encryption scenario: unreferenced object 0xffff888103da3d98 (size 8): comm "ln", pid 1692, jiffies 4294914701 (age 12.045s) backtrace: kmemdup+0x32/0x70 __fscrypt_encrypt_symlink+0xed/0x1c0 ubifs_symlink+0x210/0x300 [ubifs] vfs_symlink+0x216/0x360 do_symlinkat+0x11a/0x190 do_syscall_64+0x3b/0xe0 There are two ways fixing it: 1. Remove make_bad_inode() in error handling path. We can do that because ubifs_evict_inode() will do same processes for good symlink inode and bad symlink inode, for inode->i_nlink checking is before is_bad_inode(). 2. Free inode->i_link before marking inode bad. Method 2 is picked, it has less influence, personally, I think. Cc: stable@vger.kernel.org Fixes: 2c58d548f570 ("fscrypt: cache decrypted symlink target in ->i_link") Signed-off-by: Zhihao Cheng Suggested-by: Eric Biggers Reviewed-by: Eric Biggers Signed-off-by: Richard Weinberger --- fs/ubifs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e413a9cf8ee3..6b3db00d9b12 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1134,6 +1134,8 @@ out_cancel: dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); out_inode: + /* Free inode->i_link before inode is marked as bad. */ + fscrypt_free_inode(inode); make_bad_inode(inode); iput(inode); out_fname: From 556c19f563b64dd5463b0030d19c8681f4f7446e Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Mon, 22 Jan 2024 14:31:03 +0800 Subject: [PATCH 073/496] ubifs: Queue up space reservation tasks if retrying many times Recently we catched ENOSPC returned by make_reservation() while doing fsstress on UBIFS, we got following information when it occurred (See details in Link): UBIFS error (ubi0:0 pid 3640152): make_reservation [ubifs]: cannot reserve 112 bytes in jhead 2, error -28 CPU: 2 PID: 3640152 Comm: kworker/u16:2 Tainted: G B W Hardware name: Hisilicon PhosphorHi1230 EMU (DT) Workqueue: writeback wb_workfn (flush-ubifs_0_0) Call trace: dump_stack+0x114/0x198 make_reservation+0x564/0x610 [ubifs] ubifs_jnl_write_data+0x328/0x48c [ubifs] do_writepage+0x2a8/0x3e4 [ubifs] ubifs_writepage+0x16c/0x374 [ubifs] generic_writepages+0xb4/0x114 do_writepages+0xcc/0x11c writeback_sb_inodes+0x2d0/0x564 wb_writeback+0x20c/0x2b4 wb_workfn+0x404/0x510 process_one_work+0x304/0x4ac worker_thread+0x31c/0x4e4 kthread+0x23c/0x290 Budgeting info: data budget sum 17576, total budget sum 17768 budg_data_growth 4144, budg_dd_growth 13432, budg_idx_growth 192 min_idx_lebs 13, old_idx_sz 988640, uncommitted_idx 0 page_budget 4144, inode_budget 160, dent_budget 312 nospace 0, nospace_rp 0 dark_wm 8192, dead_wm 4096, max_idx_node_sz 192 freeable_cnt 0, calc_idx_sz 988640, idx_gc_cnt 0 dirty_pg_cnt 4, dirty_zn_cnt 0, clean_zn_cnt 4811 gc_lnum 21, ihead_lnum 14 jhead 0 (GC) LEB 16 jhead 1 (base) LEB 34 jhead 2 (data) LEB 23 bud LEB 16 bud LEB 23 bud LEB 34 old bud LEB 33 old bud LEB 31 old bud LEB 15 commit state 4 Budgeting predictions: available: 33832, outstanding 17576, free 15356 (pid 3640152) start dumping LEB properties (pid 3640152) Lprops statistics: empty_lebs 3, idx_lebs 11 taken_empty_lebs 1, total_free 1253376, total_dirty 2445736 total_used 3438712, total_dark 65536, total_dead 17248 LEB 15 free 0 dirty 248000 used 5952 (taken) LEB 16 free 110592 dirty 896 used 142464 (taken, jhead 0 (GC)) LEB 21 free 253952 dirty 0 used 0 (taken, GC LEB) LEB 23 free 0 dirty 248104 used 5848 (taken, jhead 2 (data)) LEB 29 free 253952 dirty 0 
used 0 (empty) LEB 33 free 0 dirty 253952 used 0 (taken) LEB 34 free 217088 dirty 36544 used 320 (taken, jhead 1 (base)) LEB 37 free 253952 dirty 0 used 0 (empty) OTHERS: index lebs, zero-available non-index lebs According to the budget algorithm, there are 5 LEBs reserved for budget: three journal heads(16,23,34), 1 GC LEB(21) and 1 deletion LEB(can be used in make_reservation()). There are 2 empty LEBs used for index nodes, which is calculated as min_idx_lebs - idx_lebs = 2. In theory, LEB 15 and 33 should be reclaimed as free state after committing, but it is now in taken state. After looking the realization of reserve_space(), there's a possible situation: LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (bud, taken) LEB 33: free 2000 dirty 251952 used 0 (bud, taken) wb_workfn wb_workfn_2 do_writepage // write 3000 bytes ubifs_jnl_write_data make_reservation reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken nospc_retries++ // 1 ubifs_run_commit do_commit LEB 15: free 2000 dirty 248000 used 3952 (jhead 2) LEB 23: free 2000 dirty 248104 used 3848 (dirty) LEB 33: free 2000 dirty 251952 used 0 (dirty) do_writepage // write 2000 bytes for 3 times ubifs_jnl_write_data // grabs 15\23\33 LEB 15: free 0 dirty 248000 used 5952 (bud, taken) LEB 23: free 0 dirty 248104 used 5848 (jhead 2) LEB 33: free 0 dirty 253952 used 0 (bud, taken) reserve_space ubifs_garbage_collect ubifs_find_dirty_leb // ret ENOSPC, dirty LEBs are taken if (nospc_retries++ < 2) // false ubifs_ro_mode ! Fetch a reproducer in Link. The dirty LEBs could be grabbed by other threads, which fails finding dirty LEBs of GC in current thread, so make_reservation() could try many times to invoke GC&&committing, but current realization limits the times of retrying as 'nospc_retries'(twice). Fix it by adding a wait queue, start queuing up space reservation tasks when someone task has retried gc + commit for many times. Then there is only one task making space reservation at any time, and it can always make success under the premise of correct budgeting. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218164 Fixes: 1e51764a3c2a ("UBIFS: add new flash file system") Signed-off-by: Zhihao Cheng Signed-off-by: Zhang Yi Signed-off-by: Richard Weinberger --- fs/ubifs/journal.c | 171 +++++++++++++++++++++++++++++++++++++++------ fs/ubifs/super.c | 2 + fs/ubifs/ubifs.h | 4 ++ 3 files changed, 155 insertions(+), 22 deletions(-) diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index f0a5538c84b0..74aee92433d7 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -292,6 +292,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, return err; } +/** + * __queue_and_wait - queue a task and wait until the task is waked up. + * @c: UBIFS file-system description object + * + * This function adds current task in queue and waits until the task is waked + * up. This function should be called with @c->reserve_space_wq locked. + */ +static void __queue_and_wait(struct ubifs_info *c) +{ + DEFINE_WAIT(wait); + + __add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&c->reserve_space_wq.lock); + + schedule(); + finish_wait(&c->reserve_space_wq, &wait); +} + +/** + * wait_for_reservation - try queuing current task to wait until waked up. 
+ * @c: UBIFS file-system description object + * + * This function queues current task to wait until waked up, if queuing is + * started(@c->need_wait_space is not %0). Returns %true if current task is + * added in queue, otherwise %false is returned. + */ +static bool wait_for_reservation(struct ubifs_info *c) +{ + if (likely(atomic_read(&c->need_wait_space) == 0)) + /* Quick path to check whether queuing is started. */ + return false; + + spin_lock(&c->reserve_space_wq.lock); + if (atomic_read(&c->need_wait_space) == 0) { + /* Queuing is not started, don't queue current task. */ + spin_unlock(&c->reserve_space_wq.lock); + return false; + } + + __queue_and_wait(c); + return true; +} + +/** + * wake_up_reservation - wake up first task in queue or stop queuing. + * @c: UBIFS file-system description object + * + * This function wakes up the first task in queue if it exists, or stops + * queuing if no tasks in queue. + */ +static void wake_up_reservation(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (waitqueue_active(&c->reserve_space_wq)) + wake_up_locked(&c->reserve_space_wq); + else + /* + * Compared with wait_for_reservation(), set @c->need_wait_space + * under the protection of wait queue lock, which can avoid that + * @c->need_wait_space is set to 0 after new task queued. + */ + atomic_set(&c->need_wait_space, 0); + spin_unlock(&c->reserve_space_wq.lock); +} + +/** + * wake_up_reservation - add current task in queue or start queuing. + * @c: UBIFS file-system description object + * + * This function starts queuing if queuing is not started, otherwise adds + * current task in queue. + */ +static void add_or_start_queue(struct ubifs_info *c) +{ + spin_lock(&c->reserve_space_wq.lock); + if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) { + /* Starts queuing, task can go on directly. */ + spin_unlock(&c->reserve_space_wq.lock); + return; + } + + /* + * There are at least two tasks have retried more than 32 times + * at certain point, first task has started queuing, just queue + * the left tasks. + */ + __queue_and_wait(c); +} + /** * make_reservation - reserve journal space. * @c: UBIFS file-system description object @@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, static int make_reservation(struct ubifs_info *c, int jhead, int len) { int err, cmt_retries = 0, nospc_retries = 0; + bool blocked = wait_for_reservation(c); again: down_read(&c->commit_sem); err = reserve_space(c, jhead, len); - if (!err) + if (!err) { /* c->commit_sem will get released via finish_reservation(). */ - return 0; + goto out_wake_up; + } up_read(&c->commit_sem); if (err == -ENOSPC) { /* * GC could not make any progress. We should try to commit - * once because it could make some dirty space and GC would - * make progress, so make the error -EAGAIN so that the below + * because it could make some dirty space and GC would make + * progress, so make the error -EAGAIN so that the below * will commit and re-try. */ - if (nospc_retries++ < 2) { - dbg_jnl("no space, retry"); - err = -EAGAIN; - } - - /* - * This means that the budgeting is incorrect. We always have - * to be able to write to the media, because all operations are - * budgeted. Deletions are not budgeted, though, but we reserve - * an extra LEB for them. - */ + nospc_retries++; + dbg_jnl("no space, retry"); + err = -EAGAIN; } if (err != -EAGAIN) @@ -349,15 +433,37 @@ again: */ if (cmt_retries > 128) { /* - * This should not happen unless the journal size limitations - * are too tough. 
+ * This should not happen unless: + * 1. The journal size limitations are too tough. + * 2. The budgeting is incorrect. We always have to be able to + * write to the media, because all operations are budgeted. + * Deletions are not budgeted, though, but we reserve an + * extra LEB for them. */ - ubifs_err(c, "stuck in space allocation"); + ubifs_err(c, "stuck in space allocation, nospc_retries %d", + nospc_retries); err = -ENOSPC; goto out; - } else if (cmt_retries > 32) - ubifs_warn(c, "too many space allocation re-tries (%d)", - cmt_retries); + } else if (cmt_retries > 32) { + /* + * It's almost impossible to happen, unless there are many tasks + * making reservation concurrently and someone task has retried + * gc + commit for many times, generated available space during + * this period are grabbed by other tasks. + * But if it happens, start queuing up all tasks that will make + * space reservation, then there is only one task making space + * reservation at any time, and it can always make success under + * the premise of correct budgeting. + */ + ubifs_warn(c, "too many space allocation cmt_retries (%d) " + "nospc_retries (%d), start queuing tasks", + cmt_retries, nospc_retries); + + if (!blocked) { + blocked = true; + add_or_start_queue(c); + } + } dbg_jnl("-EAGAIN, commit and retry (retried %d times)", cmt_retries); @@ -365,7 +471,7 @@ again: err = ubifs_run_commit(c); if (err) - return err; + goto out_wake_up; goto again; out: @@ -380,6 +486,27 @@ out: cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); } +out_wake_up: + if (blocked) { + /* + * Only tasks that have ever started queuing or ever been queued + * can wake up other queued tasks, which can make sure that + * there is only one task waked up to make space reservation. + * For example: + * task A task B task C + * make_reservation make_reservation + * reserve_space // 0 + * wake_up_reservation + * atomic_cmpxchg // 0, start queuing + * reserve_space + * wait_for_reservation + * __queue_and_wait + * add_wait_queue + * if (blocked) // false + * // So that task C won't be waked up to race with task B + */ + wake_up_reservation(c); + } return err; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..571c9dc92b94 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) mutex_init(&c->bu_mutex); mutex_init(&c->write_reserve_mutex); init_waitqueue_head(&c->cmt_wq); + init_waitqueue_head(&c->reserve_space_wq); + atomic_set(&c->need_wait_space, 0); c->buds = RB_ROOT; c->old_idx = RB_ROOT; c->size_tree = RB_ROOT; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 6eba287ae66c..1f3ea879d93a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1047,6 +1047,8 @@ struct ubifs_debug_info; * @bg_bud_bytes: number of bud bytes when background commit is initiated * @old_buds: buds to be released after commit ends * @max_bud_cnt: maximum number of buds + * @need_wait_space: Non %0 means space reservation tasks need to wait in queue + * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0 * * @commit_sem: synchronizes committer with other processes * @cmt_state: commit state @@ -1305,6 +1307,8 @@ struct ubifs_info { long long bg_bud_bytes; struct list_head old_buds; int max_bud_cnt; + atomic_t need_wait_space; + wait_queue_head_t reserve_space_wq; struct rw_semaphore commit_sem; int cmt_state; From e17f38b736691de1cf9e842c748e2960341c9870 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 
2023 02:32:00 +0000 Subject: [PATCH 074/496] dt-bindings: mtd: add basic bindings for UBI Add basic bindings for UBI devices and volumes. Signed-off-by: Daniel Golle Reviewed-by: Rob Herring Signed-off-by: Richard Weinberger --- .../bindings/mtd/partitions/linux,ubi.yaml | 65 +++++++++++++++++++ .../bindings/mtd/partitions/ubi-volume.yaml | 35 ++++++++++ 2 files changed, 100 insertions(+) create mode 100644 Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml create mode 100644 Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml diff --git a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml new file mode 100644 index 000000000000..7084a1945b31 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mtd/partitions/linux,ubi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Unsorted Block Images + +description: | + UBI ("Unsorted Block Images") is a volume management system for raw + flash devices which manages multiple logical volumes on a single + physical flash device and spreads the I/O load (i.e wear-leveling) + across the whole flash chip. + +maintainers: + - Daniel Golle + +allOf: + - $ref: partition.yaml# + +properties: + compatible: + const: linux,ubi + + volumes: + type: object + description: UBI Volumes + + patternProperties: + "^ubi-volume-.*$": + $ref: /schemas/mtd/partitions/ubi-volume.yaml# + + unevaluatedProperties: false + +required: + - compatible + +unevaluatedProperties: false + +examples: + - | + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + reg = <0x0 0x100000>; + label = "bootloader"; + read-only; + }; + + partition@100000 { + reg = <0x100000 0x1ff00000>; + label = "ubi"; + compatible = "linux,ubi"; + + volumes { + ubi-volume-caldata { + volid = <2>; + volname = "rf"; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml new file mode 100644 index 000000000000..1e3f04dedc01 --- /dev/null +++ b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mtd/partitions/ubi-volume.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: UBI volume + +description: | + This binding describes a single UBI volume. Volumes can be matches either + by their ID or their name, or both. + +maintainers: + - Daniel Golle + +properties: + volid: + $ref: /schemas/types.yaml#/definitions/uint32 + description: + Match UBI volume ID + + volname: + $ref: /schemas/types.yaml#/definitions/string + description: + Match UBI volume ID + +anyOf: + - required: + - volid + + - required: + - volname + +# This is a generic file other binding inherit from and extend +additionalProperties: true From a1de28dd203176bb9cf62576a90d761aa57fe924 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:32:11 +0000 Subject: [PATCH 075/496] dt-bindings: mtd: ubi-volume: allow UBI volumes to provide NVMEM UBI volumes may be used to contain NVMEM bits, typically device MAC addresses or wireless radio calibration data. 
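For a board that keeps, say, its Ethernet MAC address in such a volume, the consumer side stays ordinary NVMEM code once the provider added later in this series is in place. The sketch below is only an illustration, not part of any patch here; the cell name "mac-address" and the surrounding driver are assumptions.

/*
 * Illustrative consumer sketch (not part of this series): read a MAC
 * address cell that the board DT places under the UBI volume's
 * nvmem-layout node. The cell name "mac-address" is an assumption.
 */
#include <linux/err.h>
#include <linux/etherdevice.h>
#include <linux/nvmem-consumer.h>
#include <linux/slab.h>

static int example_read_mac(struct device *dev, u8 mac[ETH_ALEN])
{
	struct nvmem_cell *cell;
	size_t len;
	u8 *buf;

	cell = nvmem_cell_get(dev, "mac-address");
	if (IS_ERR(cell))
		return PTR_ERR(cell);	/* may be -EPROBE_DEFER until UBI is attached */

	buf = nvmem_cell_read(cell, &len);
	nvmem_cell_put(cell);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	if (len != ETH_ALEN || !is_valid_ether_addr(buf)) {
		kfree(buf);
		return -EINVAL;
	}

	ether_addr_copy(mac, buf);
	kfree(buf);
	return 0;
}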
Signed-off-by: Daniel Golle Reviewed-by: Rob Herring Signed-off-by: Richard Weinberger --- .../devicetree/bindings/mtd/partitions/linux,ubi.yaml | 10 ++++++++++ .../devicetree/bindings/mtd/partitions/ubi-volume.yaml | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml index 7084a1945b31..27e1ac1f252e 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/linux,ubi.yaml @@ -59,6 +59,16 @@ examples: ubi-volume-caldata { volid = <2>; volname = "rf"; + + nvmem-layout { + compatible = "fixed-layout"; + #address-cells = <1>; + #size-cells = <1>; + + eeprom@0 { + reg = <0x0 0x1000>; + }; + }; }; }; }; diff --git a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml index 1e3f04dedc01..19736b26056b 100644 --- a/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml +++ b/Documentation/devicetree/bindings/mtd/partitions/ubi-volume.yaml @@ -24,6 +24,11 @@ properties: description: Match UBI volume ID + nvmem-layout: + $ref: /schemas/nvmem/layouts/nvmem-layout.yaml# + description: + This container may reference an NVMEM layout parser. + anyOf: - required: - volid From 762d73cd930e3073e927b2ec0811519bde2c8fb4 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:32:35 +0000 Subject: [PATCH 076/496] mtd: ubi: block: use notifier to create ubiblock from parameter Use UBI_VOLUME_ADDED notification to create ubiblock device specified on kernel cmdline or module parameter. This makes thing more simple and has the advantage that ubiblock devices on volumes which are not present at the time the ubi module is probed will still be created. 
Suggested-by: Zhihao Cheng Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/block.c | 136 ++++++++++++++++++++-------------------- drivers/mtd/ubi/kapi.c | 54 +++++++++++----- drivers/mtd/ubi/ubi.h | 1 + 3 files changed, 106 insertions(+), 85 deletions(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 654bd7372cd8..b52ff32f624a 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -65,10 +65,10 @@ struct ubiblock_pdu { }; /* Numbers of elements set in the @ubiblock_param array */ -static int ubiblock_devs __initdata; +static int ubiblock_devs; /* MTD devices specification parameters */ -static struct ubiblock_param ubiblock_param[UBIBLOCK_MAX_DEVICES] __initdata; +static struct ubiblock_param ubiblock_param[UBIBLOCK_MAX_DEVICES]; struct ubiblock { struct ubi_volume_desc *desc; @@ -534,6 +534,70 @@ static int ubiblock_resize(struct ubi_volume_info *vi) return 0; } +static bool +match_volume_desc(struct ubi_volume_info *vi, const char *name, int ubi_num, int vol_id) +{ + int err, len, cur_ubi_num, cur_vol_id; + + if (ubi_num == -1) { + /* No ubi num, name must be a vol device path */ + err = ubi_get_num_by_path(name, &cur_ubi_num, &cur_vol_id); + if (err || vi->ubi_num != cur_ubi_num || vi->vol_id != cur_vol_id) + return false; + + return true; + } + + if (vol_id == -1) { + /* Got ubi_num, but no vol_id, name must be volume name */ + if (vi->ubi_num != ubi_num) + return false; + + len = strnlen(name, UBI_VOL_NAME_MAX + 1); + if (len < 1 || vi->name_len != len) + return false; + + if (strcmp(name, vi->name)) + return false; + + return true; + } + + if (vi->ubi_num != ubi_num) + return false; + + if (vi->vol_id != vol_id) + return false; + + return true; +} + +static void +ubiblock_create_from_param(struct ubi_volume_info *vi) +{ + int i, ret = 0; + struct ubiblock_param *p; + + /* + * Iterate over ubiblock cmdline parameters. If a parameter matches the + * newly added volume create the ubiblock device for it. + */ + for (i = 0; i < ubiblock_devs; i++) { + p = &ubiblock_param[i]; + + if (!match_volume_desc(vi, p->name, p->ubi_num, p->vol_id)) + continue; + + ret = ubiblock_create(vi); + if (ret) { + pr_err( + "UBI: block: can't add '%s' volume on ubi%d_%d, err=%d\n", + vi->name, p->ubi_num, p->vol_id, ret); + } + break; + } +} + static int ubiblock_notify(struct notifier_block *nb, unsigned long notification_type, void *ns_ptr) { @@ -541,10 +605,7 @@ static int ubiblock_notify(struct notifier_block *nb, switch (notification_type) { case UBI_VOLUME_ADDED: - /* - * We want to enforce explicit block device creation for - * volumes, so when a volume is added we do nothing. 
- */ + ubiblock_create_from_param(&nt->vi); break; case UBI_VOLUME_REMOVED: ubiblock_remove(&nt->vi); @@ -570,56 +631,6 @@ static struct notifier_block ubiblock_notifier = { .notifier_call = ubiblock_notify, }; -static struct ubi_volume_desc * __init -open_volume_desc(const char *name, int ubi_num, int vol_id) -{ - if (ubi_num == -1) - /* No ubi num, name must be a vol device path */ - return ubi_open_volume_path(name, UBI_READONLY); - else if (vol_id == -1) - /* No vol_id, must be vol_name */ - return ubi_open_volume_nm(ubi_num, name, UBI_READONLY); - else - return ubi_open_volume(ubi_num, vol_id, UBI_READONLY); -} - -static void __init ubiblock_create_from_param(void) -{ - int i, ret = 0; - struct ubiblock_param *p; - struct ubi_volume_desc *desc; - struct ubi_volume_info vi; - - /* - * If there is an error creating one of the ubiblocks, continue on to - * create the following ubiblocks. This helps in a circumstance where - * the kernel command-line specifies multiple block devices and some - * may be broken, but we still want the working ones to come up. - */ - for (i = 0; i < ubiblock_devs; i++) { - p = &ubiblock_param[i]; - - desc = open_volume_desc(p->name, p->ubi_num, p->vol_id); - if (IS_ERR(desc)) { - pr_err( - "UBI: block: can't open volume on ubi%d_%d, err=%ld\n", - p->ubi_num, p->vol_id, PTR_ERR(desc)); - continue; - } - - ubi_get_volume_info(desc, &vi); - ubi_close_volume(desc); - - ret = ubiblock_create(&vi); - if (ret) { - pr_err( - "UBI: block: can't add '%s' volume on ubi%d_%d, err=%d\n", - vi.name, p->ubi_num, p->vol_id, ret); - continue; - } - } -} - static void ubiblock_remove_all(void) { struct ubiblock *next; @@ -645,18 +656,7 @@ int __init ubiblock_init(void) if (ubiblock_major < 0) return ubiblock_major; - /* - * Attach block devices from 'block=' module param. - * Even if one block device in the param list fails to come up, - * still allow the module to load and leave any others up. - */ - ubiblock_create_from_param(); - - /* - * Block devices are only created upon user requests, so we ignore - * existing volumes. - */ - ret = ubi_register_volume_notifier(&ubiblock_notifier, 1); + ret = ubi_register_volume_notifier(&ubiblock_notifier, 0); if (ret) goto err_unreg; return 0; diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index 5db653eacbd4..fbf3a7fe2af7 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -279,6 +279,41 @@ struct ubi_volume_desc *ubi_open_volume_nm(int ubi_num, const char *name, } EXPORT_SYMBOL_GPL(ubi_open_volume_nm); +/** + * ubi_get_num_by_path - get UBI device and volume number from device path + * @pathname: volume character device node path + * @ubi_num: pointer to UBI device number to be set + * @vol_id: pointer to UBI volume ID to be set + * + * Returns 0 on success and sets ubi_num and vol_id, returns error otherwise. + */ +int ubi_get_num_by_path(const char *pathname, int *ubi_num, int *vol_id) +{ + int error; + struct path path; + struct kstat stat; + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return error; + + error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + path_put(&path); + if (error) + return error; + + if (!S_ISCHR(stat.mode)) + return -EINVAL; + + *ubi_num = ubi_major2num(MAJOR(stat.rdev)); + *vol_id = MINOR(stat.rdev) - 1; + + if (*vol_id < 0 || *ubi_num < 0) + return -ENODEV; + + return 0; +} + /** * ubi_open_volume_path - open UBI volume by its character device node path. 
* @pathname: volume character device node path @@ -290,32 +325,17 @@ EXPORT_SYMBOL_GPL(ubi_open_volume_nm); struct ubi_volume_desc *ubi_open_volume_path(const char *pathname, int mode) { int error, ubi_num, vol_id; - struct path path; - struct kstat stat; dbg_gen("open volume %s, mode %d", pathname, mode); if (!pathname || !*pathname) return ERR_PTR(-EINVAL); - error = kern_path(pathname, LOOKUP_FOLLOW, &path); + error = ubi_get_num_by_path(pathname, &ubi_num, &vol_id); if (error) return ERR_PTR(error); - error = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); - path_put(&path); - if (error) - return ERR_PTR(error); - - if (!S_ISCHR(stat.mode)) - return ERR_PTR(-EINVAL); - - ubi_num = ubi_major2num(MAJOR(stat.rdev)); - vol_id = MINOR(stat.rdev) - 1; - - if (vol_id >= 0 && ubi_num >= 0) - return ubi_open_volume(ubi_num, vol_id, mode); - return ERR_PTR(-ENODEV); + return ubi_open_volume(ubi_num, vol_id, mode); } EXPORT_SYMBOL_GPL(ubi_open_volume_path); diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index 0b42bb45dd84..a588381c50ad 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -955,6 +955,7 @@ void ubi_free_internal_volumes(struct ubi_device *ubi); void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di); void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol, struct ubi_volume_info *vi); +int ubi_get_num_by_path(const char *pathname, int *ubi_num, int *vol_id); /* scan.c */ int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb, int pnum, const struct ubi_vid_hdr *vid_hdr); From 927c145208b04490fb40c5c88a1086b592bfac25 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:14 +0000 Subject: [PATCH 077/496] mtd: ubi: attach from device tree Introduce device tree compatible 'linux,ubi' and attach compatible MTD devices using the MTD add notifier. This is needed for a UBI device to be available early at boot (and not only after late_initcall), so volumes on them can be used eg. as NVMEM providers for other drivers. 
Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 149 +++++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 46 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 7d4ff1193db6..8c3f763e4ddb 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "ubi.h" @@ -1219,44 +1220,44 @@ static struct mtd_info * __init open_mtd_device(const char *mtd_dev) return mtd; } -static int __init ubi_init(void) +static void ubi_notify_add(struct mtd_info *mtd) +{ + struct device_node *np = mtd_get_of_node(mtd); + int err; + + if (!of_device_is_compatible(np, "linux,ubi")) + return; + + /* + * we are already holding &mtd_table_mutex, but still need + * to bump refcount + */ + err = __get_mtd_device(mtd); + if (err) + return; + + /* called while holding mtd_table_mutex */ + mutex_lock_nested(&ubi_devices_mutex, SINGLE_DEPTH_NESTING); + err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, 0, 0, false, false); + mutex_unlock(&ubi_devices_mutex); + if (err < 0) + __put_mtd_device(mtd); +} + +static void ubi_notify_remove(struct mtd_info *mtd) +{ + /* do nothing for now */ +} + +static struct mtd_notifier ubi_mtd_notifier = { + .add = ubi_notify_add, + .remove = ubi_notify_remove, +}; + +static int __init ubi_init_attach(void) { int err, i, k; - /* Ensure that EC and VID headers have correct size */ - BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); - BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); - - if (mtd_devs > UBI_MAX_DEVICES) { - pr_err("UBI error: too many MTD devices, maximum is %d\n", - UBI_MAX_DEVICES); - return -EINVAL; - } - - /* Create base sysfs directory and sysfs files */ - err = class_register(&ubi_class); - if (err < 0) - return err; - - err = misc_register(&ubi_ctrl_cdev); - if (err) { - pr_err("UBI error: cannot register device\n"); - goto out; - } - - ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", - sizeof(struct ubi_wl_entry), - 0, 0, NULL); - if (!ubi_wl_entry_slab) { - err = -ENOMEM; - goto out_dev_unreg; - } - - err = ubi_debugfs_init(); - if (err) - goto out_slab; - - /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; @@ -1304,15 +1305,6 @@ static int __init ubi_init(void) } } - err = ubiblock_init(); - if (err) { - pr_err("UBI error: block: cannot initialize, error %d\n", err); - - /* See comment above re-ubi_is_module(). 
*/ - if (ubi_is_module()) - goto out_detach; - } - return 0; out_detach: @@ -1322,7 +1314,70 @@ out_detach: ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); mutex_unlock(&ubi_devices_mutex); } - ubi_debugfs_exit(); + return err; +} +#ifndef CONFIG_MTD_UBI_MODULE +late_initcall(ubi_init_attach); +#endif + +static int __init ubi_init(void) +{ + int err; + + /* Ensure that EC and VID headers have correct size */ + BUILD_BUG_ON(sizeof(struct ubi_ec_hdr) != 64); + BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); + + if (mtd_devs > UBI_MAX_DEVICES) { + pr_err("UBI error: too many MTD devices, maximum is %d\n", + UBI_MAX_DEVICES); + return -EINVAL; + } + + /* Create base sysfs directory and sysfs files */ + err = class_register(&ubi_class); + if (err < 0) + return err; + + err = misc_register(&ubi_ctrl_cdev); + if (err) { + pr_err("UBI error: cannot register device\n"); + goto out; + } + + ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", + sizeof(struct ubi_wl_entry), + 0, 0, NULL); + if (!ubi_wl_entry_slab) { + err = -ENOMEM; + goto out_dev_unreg; + } + + err = ubi_debugfs_init(); + if (err) + goto out_slab; + + err = ubiblock_init(); + if (err) { + pr_err("UBI error: block: cannot initialize, error %d\n", err); + + /* See comment above re-ubi_is_module(). */ + if (ubi_is_module()) + goto out_slab; + } + + register_mtd_user(&ubi_mtd_notifier); + + if (ubi_is_module()) { + err = ubi_init_attach(); + if (err) + goto out_mtd_notifier; + } + + return 0; + +out_mtd_notifier: + unregister_mtd_user(&ubi_mtd_notifier); out_slab: kmem_cache_destroy(ubi_wl_entry_slab); out_dev_unreg: @@ -1332,13 +1387,15 @@ out: pr_err("UBI error: cannot initialize UBI, error %d\n", err); return err; } -late_initcall(ubi_init); +device_initcall(ubi_init); + static void __exit ubi_exit(void) { int i; ubiblock_exit(); + unregister_mtd_user(&ubi_mtd_notifier); for (i = 0; i < UBI_MAX_DEVICES; i++) if (ubi_devices[i]) { From 7e84c961b2eb062d2f47037dcca52dcd1d3615b5 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:24 +0000 Subject: [PATCH 078/496] mtd: ubi: introduce pre-removal notification for UBI volumes Introduce a new notification type UBI_VOLUME_SHUTDOWN to inform users that a volume is just about to be removed. This is needed because users (such as the NVMEM subsystem) expect that at the time their removal function is called, the parenting device is still available (for removal of sysfs nodes, for example, in case of NVMEM which otherwise WARNs on volume removal). 
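A rough sketch of how a notifier user is expected to consume the new event; the client and its helpers are hypothetical and not taken from this series. Teardown happens at UBI_VOLUME_SHUTDOWN, while the volume's device is still registered, so UBI_VOLUME_REMOVED becomes a no-op for anything already shut down.

/* Hypothetical notifier user; example_create()/example_destroy() are assumed stubs. */
#include <linux/mtd/ubi.h>
#include <linux/notifier.h>

static void example_create(struct ubi_volume_info *vi) { /* assumed: set up per-volume state */ }
static void example_destroy(struct ubi_volume_info *vi) { /* assumed: tear down per-volume state */ }

static int example_vol_notify(struct notifier_block *nb,
			      unsigned long action, void *data)
{
	struct ubi_notification *nt = data;

	switch (action) {
	case UBI_VOLUME_ADDED:
		example_create(&nt->vi);
		break;
	case UBI_VOLUME_SHUTDOWN:
		/* The volume device is still alive here, so sysfs/children can go away cleanly. */
		example_destroy(&nt->vi);
		break;
	case UBI_VOLUME_REMOVED:
		/* Already handled at SHUTDOWN time. */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_nb = {
	.notifier_call = example_vol_notify,
};

/* e.g. from module_init(): ubi_register_volume_notifier(&example_nb, 0); */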
Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 19 ++++++++++++++----- drivers/mtd/ubi/kapi.c | 2 +- drivers/mtd/ubi/ubi.h | 2 ++ drivers/mtd/ubi/vmt.c | 17 +++++++++++++++-- include/linux/mtd/ubi.h | 2 ++ 5 files changed, 34 insertions(+), 8 deletions(-) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 8c3f763e4ddb..a7e3a6246c0e 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -93,7 +93,7 @@ static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; /* Serializes UBI devices creations and removals */ DEFINE_MUTEX(ubi_devices_mutex); -/* Protects @ubi_devices and @ubi->ref_count */ +/* Protects @ubi_devices, @ubi->ref_count and @ubi->is_dead */ static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '//class/ubi/' */ @@ -261,6 +261,9 @@ struct ubi_device *ubi_get_device(int ubi_num) spin_lock(&ubi_devices_lock); ubi = ubi_devices[ubi_num]; + if (ubi && ubi->is_dead) + ubi = NULL; + if (ubi) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; @@ -298,7 +301,7 @@ struct ubi_device *ubi_get_by_major(int major) spin_lock(&ubi_devices_lock); for (i = 0; i < UBI_MAX_DEVICES; i++) { ubi = ubi_devices[i]; - if (ubi && MAJOR(ubi->cdev.dev) == major) { + if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_assert(ubi->ref_count >= 0); ubi->ref_count += 1; get_device(&ubi->dev); @@ -327,7 +330,7 @@ int ubi_major2num(int major) for (i = 0; i < UBI_MAX_DEVICES; i++) { struct ubi_device *ubi = ubi_devices[i]; - if (ubi && MAJOR(ubi->cdev.dev) == major) { + if (ubi && !ubi->is_dead && MAJOR(ubi->cdev.dev) == major) { ubi_num = ubi->ubi_num; break; } @@ -514,7 +517,7 @@ static void ubi_free_volumes_from(struct ubi_device *ubi, int from) int i; for (i = from; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { - if (!ubi->volumes[i]) + if (!ubi->volumes[i] || ubi->volumes[i]->is_dead) continue; ubi_eba_replace_table(ubi->volumes[i], NULL); ubi_fastmap_destroy_checkmap(ubi->volumes[i]); @@ -1099,7 +1102,6 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) return -EINVAL; spin_lock(&ubi_devices_lock); - put_device(&ubi->dev); ubi->ref_count -= 1; if (ubi->ref_count) { if (!anyway) { @@ -1110,6 +1112,13 @@ int ubi_detach_mtd_dev(int ubi_num, int anyway) ubi_err(ubi, "%s reference count %d, destroy anyway", ubi->ubi_name, ubi->ref_count); } + ubi->is_dead = true; + spin_unlock(&ubi_devices_lock); + + ubi_notify_all(ubi, UBI_VOLUME_SHUTDOWN, NULL); + + spin_lock(&ubi_devices_lock); + put_device(&ubi->dev); ubi_devices[ubi_num] = NULL; spin_unlock(&ubi_devices_lock); diff --git a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c index fbf3a7fe2af7..f1ea8677467f 100644 --- a/drivers/mtd/ubi/kapi.c +++ b/drivers/mtd/ubi/kapi.c @@ -152,7 +152,7 @@ struct ubi_volume_desc *ubi_open_volume(int ubi_num, int vol_id, int mode) spin_lock(&ubi->volumes_lock); vol = ubi->volumes[vol_id]; - if (!vol) + if (!vol || vol->is_dead) goto out_unlock; err = -EBUSY; diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index a588381c50ad..32009a24869e 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -337,6 +337,7 @@ struct ubi_volume { int writers; int exclusive; int metaonly; + bool is_dead; int reserved_pebs; int vol_type; @@ -561,6 +562,7 @@ struct ubi_device { spinlock_t volumes_lock; int ref_count; int image_seq; + bool is_dead; int rsvd_pebs; int avail_pebs; diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 990571287e84..eaf8328f6fc3 100644 --- a/drivers/mtd/ubi/vmt.c +++ 
b/drivers/mtd/ubi/vmt.c @@ -59,7 +59,7 @@ static ssize_t vol_attribute_show(struct device *dev, struct ubi_device *ubi = vol->ubi; spin_lock(&ubi->volumes_lock); - if (!ubi->volumes[vol->vol_id]) { + if (!ubi->volumes[vol->vol_id] || ubi->volumes[vol->vol_id]->is_dead) { spin_unlock(&ubi->volumes_lock); return -ENODEV; } @@ -189,7 +189,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) /* Ensure that the name is unique */ for (i = 0; i < ubi->vtbl_slots; i++) - if (ubi->volumes[i] && + if (ubi->volumes[i] && !ubi->volumes[i]->is_dead && ubi->volumes[i]->name_len == req->name_len && !strcmp(ubi->volumes[i]->name, req->name)) { ubi_err(ubi, "volume \"%s\" exists (ID %d)", @@ -352,6 +352,19 @@ int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl) err = -EBUSY; goto out_unlock; } + + /* + * Mark volume as dead at this point to prevent that anyone + * can take a reference to the volume from now on. + * This is necessary as we have to release the spinlock before + * calling ubi_volume_notify. + */ + vol->is_dead = true; + spin_unlock(&ubi->volumes_lock); + + ubi_volume_notify(ubi, vol, UBI_VOLUME_SHUTDOWN); + + spin_lock(&ubi->volumes_lock); ubi->volumes[vol_id] = NULL; spin_unlock(&ubi->volumes_lock); diff --git a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h index a529347fd75b..562f92504f2b 100644 --- a/include/linux/mtd/ubi.h +++ b/include/linux/mtd/ubi.h @@ -192,6 +192,7 @@ struct ubi_device_info { * or a volume was removed) * @UBI_VOLUME_RESIZED: a volume has been re-sized * @UBI_VOLUME_RENAMED: a volume has been re-named + * @UBI_VOLUME_SHUTDOWN: a volume is going to removed, shutdown users * @UBI_VOLUME_UPDATED: data has been written to a volume * * These constants define which type of event has happened when a volume @@ -202,6 +203,7 @@ enum { UBI_VOLUME_REMOVED, UBI_VOLUME_RESIZED, UBI_VOLUME_RENAMED, + UBI_VOLUME_SHUTDOWN, UBI_VOLUME_UPDATED, }; From 51932f9fc4871619e93abf4de6f1282ec23d936c Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:38 +0000 Subject: [PATCH 079/496] mtd: ubi: populate ubi volume fwnode Look for the 'volumes' subnode of an MTD partition attached to a UBI device and attach matching child nodes to UBI volumes. This allows UBI volumes to be referenced in device tree, e.g. for use as NVMEM providers. Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/vmt.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index eaf8328f6fc3..5a3558bbb903 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -124,6 +124,31 @@ static void vol_release(struct device *dev) kfree(vol); } +static struct fwnode_handle *find_volume_fwnode(struct ubi_volume *vol) +{ + struct fwnode_handle *fw_vols, *fw_vol; + const char *volname; + u32 volid; + + fw_vols = device_get_named_child_node(vol->dev.parent->parent, "volumes"); + if (!fw_vols) + return NULL; + + fwnode_for_each_child_node(fw_vols, fw_vol) { + if (!fwnode_property_read_string(fw_vol, "volname", &volname) && + strncmp(volname, vol->name, vol->name_len)) + continue; + + if (!fwnode_property_read_u32(fw_vol, "volid", &volid) && + vol->vol_id != volid) + continue; + + return fw_vol; + } + + return NULL; +} + /** * ubi_create_volume - create volume. 
* @ubi: UBI device description object @@ -223,6 +248,7 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) vol->name_len = req->name_len; memcpy(vol->name, req->name, vol->name_len); vol->ubi = ubi; + device_set_node(&vol->dev, find_volume_fwnode(vol)); /* * Finish all pending erases because there may be some LEBs belonging @@ -614,6 +640,7 @@ int ubi_add_volume(struct ubi_device *ubi, struct ubi_volume *vol) vol->dev.class = &ubi_class; vol->dev.groups = volume_dev_groups; dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id); + device_set_node(&vol->dev, find_volume_fwnode(vol)); err = device_register(&vol->dev); if (err) { cdev_del(&vol->cdev); From 3ce485803da1b79b2692b6d0c2792829292ad838 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 19 Dec 2023 02:33:48 +0000 Subject: [PATCH 080/496] mtd: ubi: provide NVMEM layer over UBI volumes In an ideal world we would like UBI to be used where ever possible on a NAND chip. And with UBI support in ARM Trusted Firmware and U-Boot it is possible to achieve an (almost-)all-UBI flash layout. Hence the need for a way to also use UBI volumes to store board-level constants, such as MAC addresses and calibration data of wireless interfaces. Add UBI volume NVMEM driver module exposing UBI volumes as NVMEM providers. Allow UBI devices to have a "volumes" firmware subnode with volumes which may be compatible with "nvmem-cells". Access to UBI volumes via the NVMEM interface at this point is read-only, and it is slow, opening and closing the UBI volume for each access due to limitations of the NVMEM provider API. Signed-off-by: Daniel Golle Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/Kconfig | 13 +++ drivers/mtd/ubi/Makefile | 1 + drivers/mtd/ubi/nvmem.c | 188 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+) create mode 100644 drivers/mtd/ubi/nvmem.c diff --git a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig index 7499a540121e..e28a3af83c0e 100644 --- a/drivers/mtd/ubi/Kconfig +++ b/drivers/mtd/ubi/Kconfig @@ -113,4 +113,17 @@ config MTD_UBI_FAULT_INJECTION testing purposes. If in doubt, say "N". + +config MTD_UBI_NVMEM + tristate "UBI virtual NVMEM" + default n + depends on NVMEM + help + This option enabled an additional driver exposing UBI volumes as NVMEM + providers, intended for platforms where UBI is part of the firmware + specification and used to store also e.g. MAC addresses or board- + specific Wi-Fi calibration data. + + If in doubt, say "N". 
+ endif # MTD_UBI diff --git a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile index 543673605ca7..4b51aaf00d1a 100644 --- a/drivers/mtd/ubi/Makefile +++ b/drivers/mtd/ubi/Makefile @@ -7,3 +7,4 @@ ubi-$(CONFIG_MTD_UBI_FASTMAP) += fastmap.o ubi-$(CONFIG_MTD_UBI_BLOCK) += block.o obj-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o +obj-$(CONFIG_MTD_UBI_NVMEM) += nvmem.o diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c new file mode 100644 index 000000000000..b7a93c495d17 --- /dev/null +++ b/drivers/mtd/ubi/nvmem.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023 Daniel Golle + */ + +/* UBI NVMEM provider */ +#include "ubi.h" +#include +#include + +/* List of all NVMEM devices */ +static LIST_HEAD(nvmem_devices); +static DEFINE_MUTEX(devices_mutex); + +struct ubi_nvmem { + struct nvmem_device *nvmem; + int ubi_num; + int vol_id; + int usable_leb_size; + struct list_head list; +}; + +static int ubi_nvmem_reg_read(void *priv, unsigned int from, + void *val, size_t bytes) +{ + int err = 0, lnum = from, offs, bytes_left = bytes, to_read; + struct ubi_nvmem *unv = priv; + struct ubi_volume_desc *desc; + + desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + offs = do_div(lnum, unv->usable_leb_size); + while (bytes_left) { + to_read = unv->usable_leb_size - offs; + + if (to_read > bytes_left) + to_read = bytes_left; + + err = ubi_read(desc, lnum, val, offs, to_read); + if (err) + break; + + lnum += 1; + offs = 0; + bytes_left -= to_read; + val += to_read; + } + ubi_close_volume(desc); + + if (err) + return err; + + return bytes_left == 0 ? 0 : -EIO; +} + +static int ubi_nvmem_add(struct ubi_volume_info *vi) +{ + struct device_node *np = dev_of_node(vi->dev); + struct nvmem_config config = {}; + struct ubi_nvmem *unv; + int ret; + + if (!np) + return 0; + + if (!of_get_child_by_name(np, "nvmem-layout")) + return 0; + + if (WARN_ON_ONCE(vi->usable_leb_size <= 0) || + WARN_ON_ONCE(vi->size <= 0)) + return -EINVAL; + + unv = kzalloc(sizeof(struct ubi_nvmem), GFP_KERNEL); + if (!unv) + return -ENOMEM; + + config.id = NVMEM_DEVID_NONE; + config.dev = vi->dev; + config.name = dev_name(vi->dev); + config.owner = THIS_MODULE; + config.priv = unv; + config.reg_read = ubi_nvmem_reg_read; + config.size = vi->usable_leb_size * vi->size; + config.word_size = 1; + config.stride = 1; + config.read_only = true; + config.root_only = true; + config.ignore_wp = true; + config.of_node = np; + + unv->ubi_num = vi->ubi_num; + unv->vol_id = vi->vol_id; + unv->usable_leb_size = vi->usable_leb_size; + unv->nvmem = nvmem_register(&config); + if (IS_ERR(unv->nvmem)) { + ret = dev_err_probe(vi->dev, PTR_ERR(unv->nvmem), + "Failed to register NVMEM device\n"); + kfree(unv); + return ret; + } + + mutex_lock(&devices_mutex); + list_add_tail(&unv->list, &nvmem_devices); + mutex_unlock(&devices_mutex); + + return 0; +} + +static void ubi_nvmem_remove(struct ubi_volume_info *vi) +{ + struct ubi_nvmem *unv_c, *unv = NULL; + + mutex_lock(&devices_mutex); + list_for_each_entry(unv_c, &nvmem_devices, list) + if (unv_c->ubi_num == vi->ubi_num && unv_c->vol_id == vi->vol_id) { + unv = unv_c; + break; + } + + if (!unv) { + mutex_unlock(&devices_mutex); + return; + } + + list_del(&unv->list); + mutex_unlock(&devices_mutex); + nvmem_unregister(unv->nvmem); + kfree(unv); +} + +/** + * nvmem_notify - UBI notification handler. 
+ * @nb: registered notifier block + * @l: notification type + * @ns_ptr: pointer to the &struct ubi_notification object + */ +static int nvmem_notify(struct notifier_block *nb, unsigned long l, + void *ns_ptr) +{ + struct ubi_notification *nt = ns_ptr; + + switch (l) { + case UBI_VOLUME_RESIZED: + ubi_nvmem_remove(&nt->vi); + fallthrough; + case UBI_VOLUME_ADDED: + ubi_nvmem_add(&nt->vi); + break; + case UBI_VOLUME_SHUTDOWN: + ubi_nvmem_remove(&nt->vi); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block nvmem_notifier = { + .notifier_call = nvmem_notify, +}; + +static int __init ubi_nvmem_init(void) +{ + return ubi_register_volume_notifier(&nvmem_notifier, 0); +} + +static void __exit ubi_nvmem_exit(void) +{ + struct ubi_nvmem *unv, *tmp; + + mutex_lock(&devices_mutex); + list_for_each_entry_safe(unv, tmp, &nvmem_devices, list) { + nvmem_unregister(unv->nvmem); + list_del(&unv->list); + kfree(unv); + } + mutex_unlock(&devices_mutex); + + ubi_unregister_volume_notifier(&nvmem_notifier); +} + +module_init(ubi_nvmem_init); +module_exit(ubi_nvmem_exit); +MODULE_DESCRIPTION("NVMEM layer over UBI volumes"); +MODULE_AUTHOR("Daniel Golle"); +MODULE_LICENSE("GPL"); From f31c204850f9d93906b5ac8c203b2066524ff245 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 25 Feb 2024 16:13:34 +0100 Subject: [PATCH 081/496] clocksource/drivers/arm_global_timer: Make gt_target_rate unsigned long Change the data type of gt_target_rate to unsigned long as this is what we get back from clk_get_rate(). Signed-off-by: Martin Blumenstingl Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240225151336.2728533-2-martin.blumenstingl@googlemail.com --- drivers/clocksource/arm_global_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index d749dee1d41d..fd39cfabe526 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -52,7 +52,8 @@ */ static void __iomem *gt_base; static struct notifier_block gt_clk_rate_change_nb; -static u32 gt_psv_new, gt_psv_bck, gt_target_rate; +static u32 gt_psv_new, gt_psv_bck; +static unsigned long gt_target_rate; static int gt_ppi; static struct clock_event_device __percpu *gt_evt; From e651f2fae33634175fae956d896277cf916f5d09 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 25 Feb 2024 16:13:35 +0100 Subject: [PATCH 082/496] clocksource/drivers/arm_global_timer: Guard against division by zero The result of the division of new_rate by gt_target_rate can be zero (if new_rate is smaller than gt_target_rate). Using that result as divisor without checking can result in a division by zero error. Guard against this by checking for a zero value earlier. While here, also change the psv variable to an unsigned long to make sure we don't overflow the datatype as all other types involved are also unsiged long. 
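To make the failure mode concrete with made-up numbers: if gt_target_rate is 100 MHz and the parent clock is reprogrammed to 20 MHz, DIV_ROUND_CLOSEST(20000000, 100000000) evaluates to 0, and the old error check would then divide new_rate by that zero prescaler value. A small standalone sketch of the arithmetic (plain userspace C, rates invented for illustration):

/* Standalone illustration of the zero-prescaler case; the rates are made up. */
#include <stdio.h>

/* Same rounding DIV_ROUND_CLOSEST() performs for unsigned operands. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	unsigned long gt_target_rate = 100000000UL;	/* 100 MHz target */
	unsigned long new_rate = 20000000UL;		/* parent drops to 20 MHz */
	unsigned long psv = div_round_closest(new_rate, gt_target_rate);

	printf("psv = %lu\n", psv);	/* 0: the old code would divide by this */

	if (!psv) {
		printf("reject the rate change instead of dividing by zero\n");
		return 0;
	}

	printf("frequency error = %ld Hz\n",
	       (long)gt_target_rate - (long)(new_rate / psv));
	return 0;
}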
Signed-off-by: Martin Blumenstingl Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240225151336.2728533-3-martin.blumenstingl@googlemail.com --- drivers/clocksource/arm_global_timer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index fd39cfabe526..4726a15b0afc 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -291,18 +291,17 @@ static int gt_clk_rate_change_cb(struct notifier_block *nb, switch (event) { case PRE_RATE_CHANGE: { - int psv; + unsigned long psv; - psv = DIV_ROUND_CLOSEST(ndata->new_rate, - gt_target_rate); - - if (abs(gt_target_rate - (ndata->new_rate / psv)) > MAX_F_ERR) + psv = DIV_ROUND_CLOSEST(ndata->new_rate, gt_target_rate); + if (!psv || + abs(gt_target_rate - (ndata->new_rate / psv)) > MAX_F_ERR) return NOTIFY_BAD; psv--; /* prescaler within legal range? */ - if (psv < 0 || psv > GT_CONTROL_PRESCALER_MAX) + if (psv > GT_CONTROL_PRESCALER_MAX) return NOTIFY_BAD; /* From 755350bcfb4ac8cbbb62bd7ee6be8271d4b2a88a Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sun, 25 Feb 2024 16:13:36 +0100 Subject: [PATCH 083/496] clocksource/drivers/arm_global_timer: Simplify prescaler register access Use GENMASK() to define the prescaler mask and make the whole driver use the mask (together with helpers such as FIELD_{GET,PREP,FIT}) instead of needing an additional shift and max value constant. Signed-off-by: Martin Blumenstingl Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240225151336.2728533-4-martin.blumenstingl@googlemail.com --- drivers/clocksource/arm_global_timer.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 4726a15b0afc..ab1c8c2b66b8 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -31,10 +32,7 @@ #define GT_CONTROL_COMP_ENABLE BIT(1) /* banked */ #define GT_CONTROL_IRQ_ENABLE BIT(2) /* banked */ #define GT_CONTROL_AUTO_INC BIT(3) /* banked */ -#define GT_CONTROL_PRESCALER_SHIFT 8 -#define GT_CONTROL_PRESCALER_MAX 0xFF -#define GT_CONTROL_PRESCALER_MASK (GT_CONTROL_PRESCALER_MAX << \ - GT_CONTROL_PRESCALER_SHIFT) +#define GT_CONTROL_PRESCALER_MASK GENMASK(15, 8) #define GT_INT_STATUS 0x0c #define GT_INT_STATUS_EVENT_FLAG BIT(0) @@ -248,7 +246,7 @@ static void gt_write_presc(u32 psv) reg = readl(gt_base + GT_CONTROL); reg &= ~GT_CONTROL_PRESCALER_MASK; - reg |= psv << GT_CONTROL_PRESCALER_SHIFT; + reg |= FIELD_PREP(GT_CONTROL_PRESCALER_MASK, psv); writel(reg, gt_base + GT_CONTROL); } @@ -257,8 +255,7 @@ static u32 gt_read_presc(void) u32 reg; reg = readl(gt_base + GT_CONTROL); - reg &= GT_CONTROL_PRESCALER_MASK; - return reg >> GT_CONTROL_PRESCALER_SHIFT; + return FIELD_GET(GT_CONTROL_PRESCALER_MASK, reg); } static void __init gt_delay_timer_init(void) @@ -273,9 +270,9 @@ static int __init gt_clocksource_init(void) writel(0, gt_base + GT_COUNTER0); writel(0, gt_base + GT_COUNTER1); /* set prescaler and enable timer on all the cores */ - writel(((CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) << - GT_CONTROL_PRESCALER_SHIFT) - | GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); + writel(FIELD_PREP(GT_CONTROL_PRESCALER_MASK, + CONFIG_ARM_GT_INITIAL_PRESCALER_VAL - 1) | + GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef 
CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK sched_clock_register(gt_sched_clock_read, 64, gt_target_rate); @@ -301,7 +298,7 @@ static int gt_clk_rate_change_cb(struct notifier_block *nb, psv--; /* prescaler within legal range? */ - if (psv > GT_CONTROL_PRESCALER_MAX) + if (!FIELD_FIT(GT_CONTROL_PRESCALER_MASK, psv)) return NOTIFY_BAD; /* From b9920fdd5a751df129808e7fa512e9928223ee05 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Feb 2024 08:03:24 +0100 Subject: [PATCH 084/496] ARM: 9352/1: iwmmxt: Remove support for PJ4/PJ4B cores PJ4 is a v7 core that incorporates a iWMMXt coprocessor. However, GCC does not support this combination (its iWMMXt configuration always implies v5te), and so there is no v6/v7 user space that actually makes use of this, beyond generic support for things like setjmp() that preserve/restore the iWMMXt register file using generic LDC/STC instructions emitted in assembler. As [0] appears to imply, this logic is triggered for the init process at boot, and so most user threads will have a iWMMXt register context associated with it, even though it is never used. At this point, it is highly unlikely that such GCC support will ever materialize (and Clang does not implement support for iWMMXt to begin with). This means that advertising iWMMXt support on these cores results in context switch overhead without any associated benefit, and so it is better to simply ignore the iWMMXt unit on these systems. So rip out the support. Doing so also fixes the issue reported in [0] related to UNDEF handling of co-processor #0/#1 instructions issued from user space running in Thumb2 mode. The PJ4 cores are used in four platforms: Armada 370/xp, Dove (Cubox, d2plug), MMP2 (xo-1.75) and Berlin (Google TV). Out of these, only the first is still widely used, but that one actually doesn't have iWMMXt but instead has only VFPV3-D16, and so it is not impacted by this change. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218427 [0] Fixes: 8bcba70cb5c22 ("ARM: entry: Disregard Thumb undef exception ...") Acked-by: Linus Walleij Acked-by: Arnd Bergmann Acked-by: Nicolas Pitre Reviewed-by: Jisheng Zhang Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 4 +- arch/arm/kernel/Makefile | 2 - arch/arm/kernel/iwmmxt.S | 51 ++++---------- arch/arm/kernel/pj4-cp0.c | 135 -------------------------------------- 4 files changed, 15 insertions(+), 177 deletions(-) delete mode 100644 arch/arm/kernel/pj4-cp0.c diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0af6709570d1..0d4e316a389e 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -503,8 +503,8 @@ source "arch/arm/mm/Kconfig" config IWMMXT bool "Enable iWMMXt support" - depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4 || CPU_PJ4B - default y if PXA27x || PXA3xx || ARCH_MMP || CPU_PJ4 || CPU_PJ4B + depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK + default y if PXA27x || PXA3xx || ARCH_MMP help Enable support for iWMMXt context switching at run time if running on a CPU that supports it. 
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 771264d4726a..ae2f2b2b4e5a 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -75,8 +75,6 @@ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_CPU_XSCALE) += xscale-cp0.o obj-$(CONFIG_CPU_XSC3) += xscale-cp0.o obj-$(CONFIG_CPU_MOHAWK) += xscale-cp0.o -obj-$(CONFIG_CPU_PJ4) += pj4-cp0.o -obj-$(CONFIG_CPU_PJ4B) += pj4-cp0.o obj-$(CONFIG_IWMMXT) += iwmmxt.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o perf_callchain.o obj-$(CONFIG_HW_PERF_EVENTS) += perf_event_xscale.o perf_event_v6.o \ diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S index a0218c4867b9..4a335d3c5969 100644 --- a/arch/arm/kernel/iwmmxt.S +++ b/arch/arm/kernel/iwmmxt.S @@ -18,18 +18,6 @@ #include #include "iwmmxt.h" -#if defined(CONFIG_CPU_PJ4) || defined(CONFIG_CPU_PJ4B) -#define PJ4(code...) code -#define XSC(code...) -#elif defined(CONFIG_CPU_MOHAWK) || \ - defined(CONFIG_CPU_XSC3) || \ - defined(CONFIG_CPU_XSCALE) -#define PJ4(code...) -#define XSC(code...) code -#else -#error "Unsupported iWMMXt architecture" -#endif - #define MMX_WR0 (0x00) #define MMX_WR1 (0x08) #define MMX_WR2 (0x10) @@ -81,17 +69,13 @@ ENDPROC(iwmmxt_undef_handler) ENTRY(iwmmxt_task_enable) inc_preempt_count r10, r3 - XSC(mrc p15, 0, r2, c15, c1, 0) - PJ4(mrc p15, 0, r2, c1, c0, 2) + mrc p15, 0, r2, c15, c1, 0 @ CP0 and CP1 accessible? - XSC(tst r2, #0x3) - PJ4(tst r2, #0xf) + tst r2, #0x3 bne 4f @ if so no business here @ enable access to CP0 and CP1 - XSC(orr r2, r2, #0x3) - XSC(mcr p15, 0, r2, c15, c1, 0) - PJ4(orr r2, r2, #0xf) - PJ4(mcr p15, 0, r2, c1, c0, 2) + orr r2, r2, #0x3 + mcr p15, 0, r2, c15, c1, 0 ldr r3, =concan_owner ldr r2, [r0, #S_PC] @ current task pc value @@ -218,12 +202,9 @@ ENTRY(iwmmxt_task_disable) bne 1f @ no: quit @ enable access to CP0 and CP1 - XSC(mrc p15, 0, r4, c15, c1, 0) - XSC(orr r4, r4, #0x3) - XSC(mcr p15, 0, r4, c15, c1, 0) - PJ4(mrc p15, 0, r4, c1, c0, 2) - PJ4(orr r4, r4, #0xf) - PJ4(mcr p15, 0, r4, c1, c0, 2) + mrc p15, 0, r4, c15, c1, 0 + orr r4, r4, #0x3 + mcr p15, 0, r4, c15, c1, 0 mov r0, #0 @ nothing to load str r0, [r3] @ no more current owner @@ -232,10 +213,8 @@ ENTRY(iwmmxt_task_disable) bl concan_save @ disable access to CP0 and CP1 - XSC(bic r4, r4, #0x3) - XSC(mcr p15, 0, r4, c15, c1, 0) - PJ4(bic r4, r4, #0xf) - PJ4(mcr p15, 0, r4, c1, c0, 2) + bic r4, r4, #0x3 + mcr p15, 0, r4, c15, c1, 0 mrc p15, 0, r2, c2, c0, 0 mov r2, r2 @ cpwait @@ -330,11 +309,9 @@ ENDPROC(iwmmxt_task_restore) */ ENTRY(iwmmxt_task_switch) - XSC(mrc p15, 0, r1, c15, c1, 0) - PJ4(mrc p15, 0, r1, c1, c0, 2) + mrc p15, 0, r1, c15, c1, 0 @ CP0 and CP1 accessible? - XSC(tst r1, #0x3) - PJ4(tst r1, #0xf) + tst r1, #0x3 bne 1f @ yes: block them for next task ldr r2, =concan_owner @@ -344,10 +321,8 @@ ENTRY(iwmmxt_task_switch) retne lr @ no: leave Concan disabled 1: @ flip Concan access - XSC(eor r1, r1, #0x3) - XSC(mcr p15, 0, r1, c15, c1, 0) - PJ4(eor r1, r1, #0xf) - PJ4(mcr p15, 0, r1, c1, c0, 2) + eor r1, r1, #0x3 + mcr p15, 0, r1, c15, c1, 0 mrc p15, 0, r1, c2, c0, 0 sub pc, lr, r1, lsr #32 @ cpwait and return diff --git a/arch/arm/kernel/pj4-cp0.c b/arch/arm/kernel/pj4-cp0.c deleted file mode 100644 index 4bca8098c4ff..000000000000 --- a/arch/arm/kernel/pj4-cp0.c +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/arch/arm/kernel/pj4-cp0.c - * - * PJ4 iWMMXt coprocessor context switching and handling - * - * Copyright (c) 2010 Marvell International Inc. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static int iwmmxt_do(struct notifier_block *self, unsigned long cmd, void *t) -{ - struct thread_info *thread = t; - - switch (cmd) { - case THREAD_NOTIFY_FLUSH: - /* - * flush_thread() zeroes thread->fpstate, so no need - * to do anything here. - * - * FALLTHROUGH: Ensure we don't try to overwrite our newly - * initialised state information on the first fault. - */ - - case THREAD_NOTIFY_EXIT: - iwmmxt_task_release(thread); - break; - - case THREAD_NOTIFY_SWITCH: - iwmmxt_task_switch(thread); - break; - } - - return NOTIFY_DONE; -} - -static struct notifier_block __maybe_unused iwmmxt_notifier_block = { - .notifier_call = iwmmxt_do, -}; - - -static u32 __init pj4_cp_access_read(void) -{ - u32 value; - - __asm__ __volatile__ ( - "mrc p15, 0, %0, c1, c0, 2\n\t" - : "=r" (value)); - return value; -} - -static void __init pj4_cp_access_write(u32 value) -{ - u32 temp; - - __asm__ __volatile__ ( - "mcr p15, 0, %1, c1, c0, 2\n\t" -#ifdef CONFIG_THUMB2_KERNEL - "isb\n\t" -#else - "mrc p15, 0, %0, c1, c0, 2\n\t" - "mov %0, %0\n\t" - "sub pc, pc, #4\n\t" -#endif - : "=r" (temp) : "r" (value)); -} - -static int __init pj4_get_iwmmxt_version(void) -{ - u32 cp_access, wcid; - - cp_access = pj4_cp_access_read(); - pj4_cp_access_write(cp_access | 0xf); - - /* check if coprocessor 0 and 1 are available */ - if ((pj4_cp_access_read() & 0xf) != 0xf) { - pj4_cp_access_write(cp_access); - return -ENODEV; - } - - /* read iWMMXt coprocessor id register p1, c0 */ - __asm__ __volatile__ ("mrc p1, 0, %0, c0, c0, 0\n" : "=r" (wcid)); - - pj4_cp_access_write(cp_access); - - /* iWMMXt v1 */ - if ((wcid & 0xffffff00) == 0x56051000) - return 1; - /* iWMMXt v2 */ - if ((wcid & 0xffffff00) == 0x56052000) - return 2; - - return -EINVAL; -} - -/* - * Disable CP0/CP1 on boot, and let call_fpe() and the iWMMXt lazy - * switch code handle iWMMXt context switching. - */ -static int __init pj4_cp0_init(void) -{ - u32 __maybe_unused cp_access; - int vers; - - if (!cpu_is_pj4()) - return 0; - - vers = pj4_get_iwmmxt_version(); - if (vers < 0) - return 0; - -#ifndef CONFIG_IWMMXT - pr_info("PJ4 iWMMXt coprocessor detected, but kernel support is missing.\n"); -#else - cp_access = pj4_cp_access_read() & ~0xf; - pj4_cp_access_write(cp_access); - - pr_info("PJ4 iWMMXt v%d coprocessor enabled.\n", vers); - elf_hwcap |= HWCAP_IWMMXT; - thread_register_notifier(&iwmmxt_notifier_block); - register_iwmmxt_undef_handler(); -#endif - - return 0; -} - -late_initcall(pj4_cp0_init); From c819dbd078321f948101ef7a19f1e171164bb3cf Mon Sep 17 00:00:00 2001 From: Mubin Sayyed Date: Mon, 26 Feb 2024 15:03:33 +0530 Subject: [PATCH 085/496] dt-bindings: timer: Add support for cadence TTC PWM Cadence TTC can act as PWM device, it will be supported through separate PWM framework based driver. Decision to configure specific TTC device as PWM or clocksource/clockevent would be done based on presence of "#pwm-cells" property. Also, interrupt property is not required for TTC PWM driver. Update bindings to support TTC PWM configuration. 
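With #pwm-cells = <3>, a board points a client at the TTC through the usual three PWM cells (channel, period in nanoseconds, flags), and the client driver then goes through the generic PWM API. A rough consumer-side sketch follows; the fan device, connection and rates are invented for illustration and are not part of the binding:

/* Hypothetical consumer of a TTC-backed PWM; names and values are illustrative. */
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/pwm.h>

static int example_fan_probe(struct platform_device *pdev)
{
	struct pwm_device *pwm;
	int ret;

	/* Resolved from a DT property such as: pwms = <&pwm 0 40000 0>; */
	pwm = devm_pwm_get(&pdev->dev, NULL);
	if (IS_ERR(pwm))
		return dev_err_probe(&pdev->dev, PTR_ERR(pwm),
				     "failed to get PWM\n");

	/* 50% duty cycle at the period taken from the pwms cells. */
	ret = pwm_config(pwm, pwm_get_period(pwm) / 2, pwm_get_period(pwm));
	if (ret)
		return ret;

	return pwm_enable(pwm);
}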
Signed-off-by: Mubin Sayyed Reviewed-by: Krzysztof Kozlowski Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240226093333.2581092-1-mubin.sayyed@amd.com --- .../devicetree/bindings/timer/cdns,ttc.yaml | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/timer/cdns,ttc.yaml b/Documentation/devicetree/bindings/timer/cdns,ttc.yaml index dbba780c9b02..da342464d32e 100644 --- a/Documentation/devicetree/bindings/timer/cdns,ttc.yaml +++ b/Documentation/devicetree/bindings/timer/cdns,ttc.yaml @@ -32,12 +32,23 @@ properties: description: | Bit width of the timer, necessary if not 16. + "#pwm-cells": + const: 3 + required: - compatible - reg - - interrupts - clocks +allOf: + - if: + not: + required: + - "#pwm-cells" + then: + required: + - interrupts + additionalProperties: false examples: @@ -50,3 +61,12 @@ examples: clocks = <&cpu_clk 3>; timer-width = <32>; }; + + - | + pwm: pwm@f8002000 { + compatible = "cdns,ttc"; + reg = <0xf8002000 0x1000>; + clocks = <&cpu_clk 3>; + timer-width = <32>; + #pwm-cells = <3>; + }; From 96b171d6dba6a66c63312f35e3ac6465b2c2ca94 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:35 -0800 Subject: [PATCH 086/496] scsi: core: Query the Block Limits Extension VPD page Parse the Reduced Stream Control Supported (RSCS) bit from the block limits extension VPD page. The RSCS bit is defined in SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf). Reviewed-by: Avri Altman Reviewed-by: Daejun Park Cc: Martin K. Petersen Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-10-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 2 ++ drivers/scsi/scsi_sysfs.c | 10 ++++++++++ drivers/scsi/sd.c | 13 +++++++++++++ drivers/scsi/sd.h | 1 + include/scsi/scsi_device.h | 1 + 5 files changed, 27 insertions(+) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 76d369343c7a..74cc3369dd8d 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -499,6 +499,8 @@ void scsi_attach_vpd(struct scsi_device *sdev) scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); if (vpd_buf->data[i] == 0xb2) scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); + if (vpd_buf->data[i] == 0xb7) + scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7); } kfree(vpd_buf); } diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 24f6eefb6803..93652a786a46 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -449,6 +449,7 @@ static void scsi_device_dev_release(struct device *dev) struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; struct scsi_vpd *vpd_pg0 = NULL, *vpd_pg89 = NULL; struct scsi_vpd *vpd_pgb0 = NULL, *vpd_pgb1 = NULL, *vpd_pgb2 = NULL; + struct scsi_vpd *vpd_pgb7 = NULL; unsigned long flags; might_sleep(); @@ -494,6 +495,8 @@ static void scsi_device_dev_release(struct device *dev) lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb2 = rcu_replace_pointer(sdev->vpd_pgb2, vpd_pgb2, lockdep_is_held(&sdev->inquiry_mutex)); + vpd_pgb7 = rcu_replace_pointer(sdev->vpd_pgb7, vpd_pgb7, + lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_pg0) @@ -510,6 +513,8 @@ static void scsi_device_dev_release(struct device *dev) kfree_rcu(vpd_pgb1, rcu); if (vpd_pgb2) kfree_rcu(vpd_pgb2, rcu); + if (vpd_pgb7) + kfree_rcu(vpd_pgb7, rcu); kfree(sdev->inquiry); kfree(sdev); @@ -921,6 +926,7 @@ sdev_vpd_pg_attr(pg89); sdev_vpd_pg_attr(pgb0); sdev_vpd_pg_attr(pgb1); 
sdev_vpd_pg_attr(pgb2); +sdev_vpd_pg_attr(pgb7); sdev_vpd_pg_attr(pg0); static ssize_t show_inquiry(struct file *filep, struct kobject *kobj, @@ -1295,6 +1301,9 @@ static umode_t scsi_sdev_bin_attr_is_visible(struct kobject *kobj, if (attr == &dev_attr_vpd_pgb2 && !sdev->vpd_pgb2) return 0; + if (attr == &dev_attr_vpd_pgb7 && !sdev->vpd_pgb7) + return 0; + return S_IRUGO; } @@ -1347,6 +1356,7 @@ static struct bin_attribute *scsi_sdev_bin_attrs[] = { &dev_attr_vpd_pgb0, &dev_attr_vpd_pgb1, &dev_attr_vpd_pgb2, + &dev_attr_vpd_pgb7, &dev_attr_inquiry, NULL }; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 0833b3e6aa6e..86b819fa04d9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3108,6 +3108,18 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) rcu_read_unlock(); } +/* Parse the Block Limits Extension VPD page (0xb7) */ +static void sd_read_block_limits_ext(struct scsi_disk *sdkp) +{ + struct scsi_vpd *vpd; + + rcu_read_lock(); + vpd = rcu_dereference(sdkp->device->vpd_pgb7); + if (vpd && vpd->len >= 2) + sdkp->rscs = vpd->data[5] & 1; + rcu_read_unlock(); +} + /** * sd_read_block_characteristics - Query block dev. characteristics * @sdkp: disk to query @@ -3459,6 +3471,7 @@ static int sd_revalidate_disk(struct gendisk *disk) if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp); + sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp); sd_zbc_read_zones(sdkp, buffer); sd_read_cpr(sdkp); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 409dda5350d1..e4539122f2a2 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -151,6 +151,7 @@ struct scsi_disk { unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; + bool rscs : 1; /* reduced stream control support */ }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 5ec1e71a09de..f670b55d803a 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -153,6 +153,7 @@ struct scsi_device { struct scsi_vpd __rcu *vpd_pgb0; struct scsi_vpd __rcu *vpd_pgb1; struct scsi_vpd __rcu *vpd_pgb2; + struct scsi_vpd __rcu *vpd_pgb7; struct scsi_target *sdev_target; From 4977c0f4523e1d6c87df4eedceb33d38f055c5fb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:36 -0800 Subject: [PATCH 087/496] scsi: scsi_proto: Add structures and constants related to I/O groups and streams Prepare for adding code that will query the I/O advice hints group descriptors and for adding code that will retrieve the stream status. Cc: Martin K. Petersen Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-11-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/Kconfig | 5 +++ drivers/scsi/Makefile | 2 + drivers/scsi/scsi_proto_test.c | 56 ++++++++++++++++++++++++ include/scsi/scsi_proto.h | 78 ++++++++++++++++++++++++++++++++++ 4 files changed, 141 insertions(+) create mode 100644 drivers/scsi/scsi_proto_test.c diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index addac7fbe37b..83b542abfc29 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -232,6 +232,11 @@ config SCSI_SCAN_ASYNC Note that this setting also affects whether resuming from system suspend will be performed asynchronously. 
+config SCSI_PROTO_TEST + tristate "scsi_proto.h unit tests" if !KUNIT_ALL_TESTS + depends on SCSI && KUNIT + default KUNIT_ALL_TESTS + menu "SCSI Transports" depends on SCSI diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile index f055bfd54a68..1313ddf2fd1a 100644 --- a/drivers/scsi/Makefile +++ b/drivers/scsi/Makefile @@ -24,6 +24,8 @@ obj-$(CONFIG_SCSI_COMMON) += scsi_common.o obj-$(CONFIG_RAID_ATTRS) += raid_class.o +obj-$(CONFIG_SCSI_PROTO_TEST) += scsi_proto_test.o + # --- NOTE ORDERING HERE --- # For kernel non-modular link, transport attributes need to # be initialised before drivers diff --git a/drivers/scsi/scsi_proto_test.c b/drivers/scsi/scsi_proto_test.c new file mode 100644 index 000000000000..7fa0a78a2ad1 --- /dev/null +++ b/drivers/scsi/scsi_proto_test.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Google LLC + */ +#include +#include +#include + +static void test_scsi_proto(struct kunit *test) +{ + static const union { + struct scsi_io_group_descriptor desc; + u8 arr[sizeof(struct scsi_io_group_descriptor)]; + } d = { .arr = { 0x45, 0, 0, 0, 0xb0, 0xe4, 0xe3 } }; + KUNIT_EXPECT_EQ(test, d.desc.io_advice_hints_mode + 0, 1); + KUNIT_EXPECT_EQ(test, d.desc.st_enble + 0, 1); + KUNIT_EXPECT_EQ(test, d.desc.cs_enble + 0, 0); + KUNIT_EXPECT_EQ(test, d.desc.ic_enable + 0, 1); + KUNIT_EXPECT_EQ(test, d.desc.acdlu + 0, 1); + KUNIT_EXPECT_EQ(test, d.desc.rlbsr + 0, 3); + KUNIT_EXPECT_EQ(test, d.desc.lbm_descriptor_type + 0, 0); + KUNIT_EXPECT_EQ(test, d.desc.params[0] + 0, 0xe4); + KUNIT_EXPECT_EQ(test, d.desc.params[1] + 0, 0xe3); + + static const union { + struct scsi_stream_status s; + u8 arr[sizeof(struct scsi_stream_status)]; + } ss = { .arr = { 0x80, 0, 0x12, 0x34, 0x3f } }; + KUNIT_EXPECT_EQ(test, ss.s.perm + 0, 1); + KUNIT_EXPECT_EQ(test, get_unaligned_be16(&ss.s.stream_identifier), + 0x1234); + KUNIT_EXPECT_EQ(test, ss.s.rel_lifetime + 0, 0x3f); + + static const union { + struct scsi_stream_status_header h; + u8 arr[sizeof(struct scsi_stream_status_header)]; + } sh = { .arr = { 1, 2, 3, 4, 0, 0, 5, 6 } }; + KUNIT_EXPECT_EQ(test, get_unaligned_be32(&sh.h.len), 0x1020304); + KUNIT_EXPECT_EQ(test, get_unaligned_be16(&sh.h.number_of_open_streams), + 0x506); +} + +static struct kunit_case scsi_proto_test_cases[] = { + KUNIT_CASE(test_scsi_proto), + {} +}; + +static struct kunit_suite scsi_proto_test_suite = { + .name = "scsi_proto", + .test_cases = scsi_proto_test_cases, +}; +kunit_test_suite(scsi_proto_test_suite); + +MODULE_DESCRIPTION(" unit tests"); +MODULE_AUTHOR("Bart Van Assche"); +MODULE_LICENSE("GPL"); diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index 07d65c1f59db..843106e1109f 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -10,6 +10,7 @@ #ifndef _SCSI_PROTO_H_ #define _SCSI_PROTO_H_ +#include #include /* @@ -126,6 +127,7 @@ #define SAI_READ_CAPACITY_16 0x10 #define SAI_GET_LBA_STATUS 0x12 #define SAI_REPORT_REFERRALS 0x13 +#define SAI_GET_STREAM_STATUS 0x16 /* values for maintenance in */ #define MI_REPORT_IDENTIFYING_INFORMATION 0x05 #define MI_REPORT_TARGET_PGS 0x0a @@ -275,6 +277,82 @@ struct scsi_lun { __u8 scsi_lun[8]; }; +/* SBC-5 IO advice hints group descriptor */ +struct scsi_io_group_descriptor { +#if defined(__BIG_ENDIAN) + u8 io_advice_hints_mode: 2; + u8 reserved1: 3; + u8 st_enble: 1; + u8 cs_enble: 1; + u8 ic_enable: 1; +#elif defined(__LITTLE_ENDIAN) + u8 ic_enable: 1; + u8 cs_enble: 1; + u8 st_enble: 1; + u8 reserved1: 3; + u8 io_advice_hints_mode: 2; +#else 
+#error +#endif + u8 reserved2[3]; + /* Logical block markup descriptor */ +#if defined(__BIG_ENDIAN) + u8 acdlu: 1; + u8 reserved3: 1; + u8 rlbsr: 2; + u8 lbm_descriptor_type: 4; +#elif defined(__LITTLE_ENDIAN) + u8 lbm_descriptor_type: 4; + u8 rlbsr: 2; + u8 reserved3: 1; + u8 acdlu: 1; +#else +#error +#endif + u8 params[2]; + u8 reserved4; + u8 reserved5[8]; +}; + +static_assert(sizeof(struct scsi_io_group_descriptor) == 16); + +/* SCSI stream status descriptor */ +struct scsi_stream_status { +#if defined(__BIG_ENDIAN) + u8 perm: 1; + u8 reserved1: 7; +#elif defined(__LITTLE_ENDIAN) + u8 reserved1: 7; + u8 perm: 1; +#else +#error +#endif + u8 reserved2; + __be16 stream_identifier; +#if defined(__BIG_ENDIAN) + u8 reserved3: 2; + u8 rel_lifetime: 6; +#elif defined(__LITTLE_ENDIAN) + u8 rel_lifetime: 6; + u8 reserved3: 2; +#else +#error +#endif + u8 reserved4[3]; +}; + +static_assert(sizeof(struct scsi_stream_status) == 8); + +/* GET STREAM STATUS parameter data */ +struct scsi_stream_status_header { + __be32 len; /* length in bytes of stream_status[] array. */ + u16 reserved; + __be16 number_of_open_streams; + DECLARE_FLEX_ARRAY(struct scsi_stream_status, stream_status); +}; + +static_assert(sizeof(struct scsi_stream_status_header) == 8); + /* SPC asymmetric access states */ #define SCSI_ACCESS_STATE_OPTIMAL 0x00 #define SCSI_ACCESS_STATE_ACTIVE 0x01 From 4f53138fffc2b18396859aa4ff3e7ef2b0839c2b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:37 -0800 Subject: [PATCH 088/496] scsi: sd: Translate data lifetime information Recently T10 standardized SBC constrained streams. This mechanism allows to pass data lifetime information to SCSI devices in the group number field. Add support for translating write hint information into a permanent stream number in the sd driver. Use WRITE(10) instead of WRITE(6) if data lifetime information is present because the WRITE(6) command does not have a GROUP NUMBER field. Cc: Martin K. Petersen Cc: Christoph Hellwig Cc: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-12-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 98 +++++++++++++++++++++++++++++++++++++++++++++-- drivers/scsi/sd.h | 4 +- 2 files changed, 98 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 86b819fa04d9..bce5b13b020b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1080,12 +1081,38 @@ static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) return BLK_STS_OK; } +/** + * sd_group_number() - Compute the GROUP NUMBER field + * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER + * field. + * + * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf): + * 0: no relative lifetime. + * 1: shortest relative lifetime. + * 2: second shortest relative lifetime. + * 3 - 0x3d: intermediate relative lifetimes. + * 0x3e: second longest relative lifetime. + * 0x3f: longest relative lifetime. 
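+ *
+ * The function below derives this value from the request's write hint,
+ * clamped to the device's permanent stream count and to the 0x3f maximum;
+ * it returns zero when the device does not report RSCS support.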
+ */ +static u8 sd_group_number(struct scsi_cmnd *cmd) +{ + const struct request *rq = scsi_cmd_to_rq(cmd); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); + + if (!sdkp->rscs) + return 0; + + return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count, + 0x3fu); +} + static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; + cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; @@ -1104,7 +1131,7 @@ static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); - cmd->cmnd[14] = (dld & 0x03) << 6; + cmd->cmnd[14] = ((dld & 0x03) << 6) | sd_group_number(cmd); cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); @@ -1119,7 +1146,7 @@ static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; - cmd->cmnd[6] = 0; + cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); @@ -1256,7 +1283,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || - sdp->use_10_for_rw || protect) { + sdp->use_10_for_rw || protect || rq->write_hint) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { @@ -3001,6 +3028,70 @@ defaults: sdkp->DPOFUA = 0; } +static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id) +{ + u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS }; + struct { + struct scsi_stream_status_header h; + struct scsi_stream_status s; + } buf; + struct scsi_device *sdev = sdkp->device; + struct scsi_sense_hdr sshdr; + const struct scsi_exec_args exec_args = { + .sshdr = &sshdr, + }; + int res; + + put_unaligned_be16(stream_id, &cdb[4]); + put_unaligned_be32(sizeof(buf), &cdb[10]); + + res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf), + SD_TIMEOUT, sdkp->max_retries, &exec_args); + if (res < 0) + return false; + if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) + sd_print_sense_hdr(sdkp, &sshdr); + if (res) + return false; + if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status)) + return false; + return buf.h.stream_status[0].perm; +} + +static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) +{ + struct scsi_device *sdp = sdkp->device; + const struct scsi_io_group_descriptor *desc, *start, *end; + struct scsi_sense_hdr sshdr; + struct scsi_mode_data data; + int res; + + res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, + /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, + sdkp->max_retries, &data, &sshdr); + if (res < 0) + return; + start = (void *)buffer + data.header_length + 16; + end = (void *)buffer + ALIGN_DOWN(data.header_length + data.length, + sizeof(*end)); + /* + * From "SBC-5 Constrained Streams with Data Lifetimes": Device severs + * should assign the lowest numbered stream identifiers to permanent + * streams. 
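+ * The loop below relies on that ordering: it walks the IO advice hints
+ * group descriptors in order and stops at the first descriptor that is not
+ * stream-enabled or that GET STREAM STATUS does not report as permanent.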
+ */ + for (desc = start; desc < end; desc++) + if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start)) + break; + sdkp->permanent_stream_count = desc - start; + if (sdkp->rscs && sdkp->permanent_stream_count < 2) + sd_printk(KERN_INFO, sdkp, + "Unexpected: RSCS has been set and the permanent stream count is %u\n", + sdkp->permanent_stream_count); + else if (sdkp->permanent_stream_count) + sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n", + sdkp->permanent_stream_count); +} + /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. @@ -3481,6 +3572,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); + sd_read_io_hints(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index e4539122f2a2..5c4285a582b2 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -125,6 +125,8 @@ struct scsi_disk { unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; + /* number of permanent streams */ + u16 permanent_stream_count; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ @@ -151,7 +153,7 @@ struct scsi_disk { unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; - bool rscs : 1; /* reduced stream control support */ + unsigned rscs : 1; /* reduced stream control support */ }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) From a5fe98eb8f630f3ad3d1d5c16374621e8c0cd702 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:38 -0800 Subject: [PATCH 089/496] scsi: scsi_debug: Reduce code duplication All VPD pages have the page code in byte one. Reduce code duplication by storing the VPD page code once. Reviewed-by: Avri Altman Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-13-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index d03d66f11493..994a2b829b94 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1903,7 +1903,8 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) u32 len; char lu_id_str[6]; int host_no = devip->sdbg_host->shost->host_no; - + + arr[1] = cmd[2]; port_group_id = (((host_no + 1) & 0x7f) << 8) + (devip->channel & 0x7f); if (sdebug_vpd_use_hostno == 0) @@ -1914,7 +1915,6 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) (devip->target * 1000) - 3; len = scnprintf(lu_id_str, 6, "%d", lu_id_num); if (0 == cmd[2]) { /* supported vital product data pages */ - arr[1] = cmd[2]; /*sanity */ n = 4; arr[n++] = 0x0; /* this page */ arr[n++] = 0x80; /* unit serial number */ @@ -1935,23 +1935,18 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } arr[3] = n - 4; /* number of supported VPD pages */ } else if (0x80 == cmd[2]) { /* unit serial number */ - arr[1] = cmd[2]; /*sanity */ arr[3] = len; memcpy(&arr[4], lu_id_str, len); } else if (0x83 == cmd[2]) { /* device identification */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_83(&arr[4], port_group_id, target_dev_id, lu_id_num, lu_id_str, len, &devip->lu_name); } else if (0x84 == cmd[2]) { /* Software interface ident. */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_84(&arr[4]); } else if (0x85 == cmd[2]) { /* Management network addresses */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_85(&arr[4]); } else if (0x86 == cmd[2]) { /* extended inquiry */ - arr[1] = cmd[2]; /*sanity */ arr[3] = 0x3c; /* number of following entries */ if (sdebug_dif == T10_PI_TYPE3_PROTECTION) arr[4] = 0x4; /* SPT: GRD_CHK:1 */ @@ -1961,30 +1956,23 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) arr[4] = 0x0; /* no protection stuff */ arr[5] = 0x7; /* head of q, ordered + simple q's */ } else if (0x87 == cmd[2]) { /* mode page policy */ - arr[1] = cmd[2]; /*sanity */ arr[3] = 0x8; /* number of following entries */ arr[4] = 0x2; /* disconnect-reconnect mp */ arr[6] = 0x80; /* mlus, shared */ arr[8] = 0x18; /* protocol specific lu */ arr[10] = 0x82; /* mlus, per initiator port */ } else if (0x88 == cmd[2]) { /* SCSI Ports */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_88(&arr[4], target_dev_id); } else if (is_disk_zbc && 0x89 == cmd[2]) { /* ATA info */ - arr[1] = cmd[2]; /*sanity */ n = inquiry_vpd_89(&arr[4]); put_unaligned_be16(n, arr + 2); } else if (is_disk_zbc && 0xb0 == cmd[2]) { /* Block limits */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_b0(&arr[4]); } else if (is_disk_zbc && 0xb1 == cmd[2]) { /* Block char. */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_b1(devip, &arr[4]); } else if (is_disk && 0xb2 == cmd[2]) { /* LB Prov. */ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_b2(&arr[4]); } else if (is_zbc && cmd[2] == 0xb6) { /* ZB dev. charact. 
*/ - arr[1] = cmd[2]; /*sanity */ arr[3] = inquiry_vpd_b6(devip, &arr[4]); } else { mk_sense_invalid_fld(scp, SDEB_IN_CDB, 2, -1); From b1e5c0b34db8e7dac04af618e53c64e70c86aac8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:39 -0800 Subject: [PATCH 090/496] scsi: scsi_debug: Support the block limits extension VPD page >From SBC-5 r05: "Reduced stream control: a) reduces the maximum number of streams that the device server supports; and b) increases the number of write commands that are able to specify a stream to be written in any write command that contains the GROUP NUMBER field in its CDB. If the RSCS bit (see 6.6.5) is set to one, then the device server shall: a) support per group stream identifier usage as described in 4.32.2; b) support the IO Advice Hints Grouping mode page (see 6.5.7); and c) set the MAXIMUM NUMBER OF STREAMS field (see 6.6.5) to a value that is less than 64. Device servers that set the RSCS bit to one may support other features (e.g., permanent streams (see 4.32.4)). 4.32.4 Permanent streams A permanent stream is a stream for which the device server does not allow closing or otherwise modifying the configuration of that stream. The PERM bit (see 5.9.2.3) indicates whether a stream is a permanent stream. If a STREAM CONTROL command (see 5.32) specifies the closing of a permanent stream, the device server terminates that command with CHECK CONDITION status instead of closing the specified stream. A permanent stream is always an open stream. Device severs should assign the lowest numbered stream identifiers to permanent streams." Report that reduced stream control is supported. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-14-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 994a2b829b94..8dba838ef983 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1867,6 +1867,19 @@ static int inquiry_vpd_b6(struct sdebug_dev_info *devip, unsigned char *arr) return 0x3c; } +#define SDEBUG_BLE_LEN_AFTER_B4 28 /* thus vpage 32 bytes long */ + +enum { MAXIMUM_NUMBER_OF_STREAMS = 6, PERMANENT_STREAM_COUNT = 5 }; + +/* Block limits extension VPD page (SBC-4) */ +static int inquiry_vpd_b7(unsigned char *arrb4) +{ + memset(arrb4, 0, SDEBUG_BLE_LEN_AFTER_B4); + arrb4[1] = 1; /* Reduced stream control support (RSCS) */ + put_unaligned_be16(MAXIMUM_NUMBER_OF_STREAMS, &arrb4[2]); + return SDEBUG_BLE_LEN_AFTER_B4; +} + #define SDEBUG_LONG_INQ_SZ 96 #define SDEBUG_MAX_INQ_ARR_SZ 584 @@ -1932,6 +1945,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) arr[n++] = 0xb2; /* LB Provisioning */ if (is_zbc) arr[n++] = 0xb6; /* ZB dev. char. */ + arr[n++] = 0xb7; /* Block limits extension */ } arr[3] = n - 4; /* number of supported VPD pages */ } else if (0x80 == cmd[2]) { /* unit serial number */ @@ -1974,6 +1988,8 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) arr[3] = inquiry_vpd_b2(&arr[4]); } else if (is_zbc && cmd[2] == 0xb6) { /* ZB dev. charact. 
*/ arr[3] = inquiry_vpd_b6(devip, &arr[4]); + } else if (cmd[2] == 0xb7) { /* block limits extension page */ + arr[3] = inquiry_vpd_b7(&arr[4]); } else { mk_sense_invalid_fld(scp, SDEB_IN_CDB, 2, -1); kfree(arr); From b2f860903fe9774f755a917edc674ba6e879fa55 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:40 -0800 Subject: [PATCH 091/496] scsi: scsi_debug: Rework page code error handling Instead of tracking whether or not the page code is valid in a boolean variable, jump to error handling code if an unsupported page code is encountered. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-15-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 8dba838ef983..ba0a29e6a86d 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2644,7 +2644,7 @@ static int resp_mode_sense(struct scsi_cmnd *scp, unsigned char *ap; unsigned char arr[SDEBUG_MAX_MSENSE_SZ]; unsigned char *cmd = scp->cmnd; - bool dbd, llbaa, msense_6, is_disk, is_zbc, bad_pcode; + bool dbd, llbaa, msense_6, is_disk, is_zbc; dbd = !!(cmd[1] & 0x8); /* disable block descriptors */ pcontrol = (cmd[2] & 0xc0) >> 6; @@ -2708,7 +2708,6 @@ static int resp_mode_sense(struct scsi_cmnd *scp, mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); return check_condition_result; } - bad_pcode = false; switch (pcode) { case 0x1: /* Read-Write error recovery page, direct access */ @@ -2723,15 +2722,17 @@ static int resp_mode_sense(struct scsi_cmnd *scp, if (is_disk) { len = resp_format_pg(ap, pcontrol, target); offset += len; - } else - bad_pcode = true; + } else { + goto bad_pcode; + } break; case 0x8: /* Caching page, direct access */ if (is_disk || is_zbc) { len = resp_caching_pg(ap, pcontrol, target); offset += len; - } else - bad_pcode = true; + } else { + goto bad_pcode; + } break; case 0xa: /* Control Mode page, all devices */ len = resp_ctrl_m_pg(ap, pcontrol, target); @@ -2784,18 +2785,17 @@ static int resp_mode_sense(struct scsi_cmnd *scp, } break; default: - bad_pcode = true; - break; - } - if (bad_pcode) { - mk_sense_invalid_fld(scp, SDEB_IN_CDB, 2, 5); - return check_condition_result; + goto bad_pcode; } if (msense_6) arr[0] = offset - 1; else put_unaligned_be16((offset - 2), arr + 0); return fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, offset)); + +bad_pcode: + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 2, 5); + return check_condition_result; } #define SDEBUG_MAX_MSELECT_SZ 512 From f19c3e4fe2542d7b145d294386666958c9fabe17 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:41 -0800 Subject: [PATCH 092/496] scsi: scsi_debug: Rework subpage code error handling Move the subpage code checks into the switch statement to make it easier to add support for new page code / subpage code combinations. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-16-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 70 ++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index ba0a29e6a86d..67a8e6243e5e 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2703,22 +2703,22 @@ static int resp_mode_sense(struct scsi_cmnd *scp, ap = arr + offset; } - if ((subpcode > 0x0) && (subpcode < 0xff) && (0x19 != pcode)) { - /* TODO: Control Extension page */ - mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); - return check_condition_result; - } - switch (pcode) { case 0x1: /* Read-Write error recovery page, direct access */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; len = resp_err_recov_pg(ap, pcontrol, target); offset += len; break; case 0x2: /* Disconnect-Reconnect page, all devices */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; len = resp_disconnect_pg(ap, pcontrol, target); offset += len; break; case 0x3: /* Format device page, direct access */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; if (is_disk) { len = resp_format_pg(ap, pcontrol, target); offset += len; @@ -2727,6 +2727,8 @@ static int resp_mode_sense(struct scsi_cmnd *scp, } break; case 0x8: /* Caching page, direct access */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; if (is_disk || is_zbc) { len = resp_caching_pg(ap, pcontrol, target); offset += len; @@ -2735,14 +2737,14 @@ static int resp_mode_sense(struct scsi_cmnd *scp, } break; case 0xa: /* Control Mode page, all devices */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; len = resp_ctrl_m_pg(ap, pcontrol, target); offset += len; break; case 0x19: /* if spc==1 then sas phy, control+discover */ - if ((subpcode > 0x2) && (subpcode < 0xff)) { - mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); - return check_condition_result; - } + if (subpcode > 0x2 && subpcode < 0xff) + goto bad_subpcode; len = 0; if ((0x0 == subpcode) || (0xff == subpcode)) len += resp_sas_sf_m_pg(ap + len, pcontrol, target); @@ -2754,35 +2756,31 @@ static int resp_mode_sense(struct scsi_cmnd *scp, offset += len; break; case 0x1c: /* Informational Exceptions Mode page, all devices */ + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; len = resp_iec_m_pg(ap, pcontrol, target); offset += len; break; case 0x3f: /* Read all Mode pages */ - if ((0 == subpcode) || (0xff == subpcode)) { - len = resp_err_recov_pg(ap, pcontrol, target); - len += resp_disconnect_pg(ap + len, pcontrol, target); - if (is_disk) { - len += resp_format_pg(ap + len, pcontrol, - target); - len += resp_caching_pg(ap + len, pcontrol, - target); - } else if (is_zbc) { - len += resp_caching_pg(ap + len, pcontrol, - target); - } - len += resp_ctrl_m_pg(ap + len, pcontrol, target); - len += resp_sas_sf_m_pg(ap + len, pcontrol, target); - if (0xff == subpcode) { - len += resp_sas_pcd_m_spg(ap + len, pcontrol, - target, target_dev_id); - len += resp_sas_sha_m_spg(ap + len, pcontrol); - } - len += resp_iec_m_pg(ap + len, pcontrol, target); - offset += len; - } else { - mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); - return check_condition_result; + if (subpcode > 0x0 && subpcode < 0xff) + goto bad_subpcode; + len = resp_err_recov_pg(ap, pcontrol, target); + len += resp_disconnect_pg(ap + len, pcontrol, target); + if (is_disk) { + len += resp_format_pg(ap + len, pcontrol, target); + len += resp_caching_pg(ap + len, pcontrol, target); + } else if (is_zbc) { + len += resp_caching_pg(ap + len, pcontrol, target); } + len 
+= resp_ctrl_m_pg(ap + len, pcontrol, target); + len += resp_sas_sf_m_pg(ap + len, pcontrol, target); + if (0xff == subpcode) { + len += resp_sas_pcd_m_spg(ap + len, pcontrol, target, + target_dev_id); + len += resp_sas_sha_m_spg(ap + len, pcontrol); + } + len += resp_iec_m_pg(ap + len, pcontrol, target); + offset += len; break; default: goto bad_pcode; @@ -2796,6 +2794,10 @@ static int resp_mode_sense(struct scsi_cmnd *scp, bad_pcode: mk_sense_invalid_fld(scp, SDEB_IN_CDB, 2, 5); return check_condition_result; + +bad_subpcode: + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); + return check_condition_result; } #define SDEBUG_MAX_MSELECT_SZ 512 From b952eb270df38bc0d930a1ef965666ecf54a2097 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:42 -0800 Subject: [PATCH 093/496] scsi: scsi_debug: Allocate the MODE SENSE response from the heap Make the MODE SENSE response buffer larger and allocate it from the heap. This patch prepares for adding support for the IO Advice Hints Grouping mode page. Suggested-by: Douglas Gilbert Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-17-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 67a8e6243e5e..b544498324f6 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -43,6 +43,7 @@ #include #include #include +#include #include @@ -2631,7 +2632,8 @@ static int resp_sas_sha_m_spg(unsigned char *p, int pcontrol) return sizeof(sas_sha_m_pg); } -#define SDEBUG_MAX_MSENSE_SZ 256 +/* PAGE_SIZE is more than necessary but provides room for future expansion. */ +#define SDEBUG_MAX_MSENSE_SZ PAGE_SIZE static int resp_mode_sense(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) @@ -2642,10 +2644,13 @@ static int resp_mode_sense(struct scsi_cmnd *scp, int target_dev_id; int target = scp->device->id; unsigned char *ap; - unsigned char arr[SDEBUG_MAX_MSENSE_SZ]; + unsigned char *arr __free(kfree); unsigned char *cmd = scp->cmnd; bool dbd, llbaa, msense_6, is_disk, is_zbc; + arr = kzalloc(SDEBUG_MAX_MSENSE_SZ, GFP_ATOMIC); + if (!arr) + return -ENOMEM; dbd = !!(cmd[1] & 0x8); /* disable block descriptors */ pcontrol = (cmd[2] & 0xc0) >> 6; pcode = cmd[2] & 0x3f; From f8ab2710177a762ce0f9b8426f9fc292394949df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:43 -0800 Subject: [PATCH 094/496] scsi: scsi_debug: Implement the IO Advice Hints Grouping mode page Implement an IO Advice Hints Grouping mode page with three permanent streams. A permanent stream is a stream for which the device server does not allow closing or otherwise modifying the configuration of that stream. The stream identifier enable (ST_ENBLE) bit specifies whether the stream identifier may be used in the GROUP NUMBER field of SCSI WRITE commands. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-18-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 61 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index b544498324f6..ccf59b3e7602 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1969,7 +1969,11 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) arr[4] = 0x5; /* SPT: GRD_CHK:1, REF_CHK:1 */ else arr[4] = 0x0; /* no protection stuff */ - arr[5] = 0x7; /* head of q, ordered + simple q's */ + /* + * GROUP_SUP=1; HEADSUP=1 (HEAD OF QUEUE); ORDSUP=1 + * (ORDERED queuing); SIMPSUP=1 (SIMPLE queuing). + */ + arr[5] = 0x17; } else if (0x87 == cmd[2]) { /* mode page policy */ arr[3] = 0x8; /* number of following entries */ arr[4] = 0x2; /* disconnect-reconnect mp */ @@ -2559,6 +2563,40 @@ static int resp_ctrl_m_pg(unsigned char *p, int pcontrol, int target) return sizeof(ctrl_m_pg); } +/* IO Advice Hints Grouping mode page */ +static int resp_grouping_m_pg(unsigned char *p, int pcontrol, int target) +{ + /* IO Advice Hints Grouping mode page */ + struct grouping_m_pg { + u8 page_code; /* OR 0x40 when subpage_code > 0 */ + u8 subpage_code; + __be16 page_length; + u8 reserved[12]; + struct scsi_io_group_descriptor descr[MAXIMUM_NUMBER_OF_STREAMS]; + }; + static const struct grouping_m_pg gr_m_pg = { + .page_code = 0xa | 0x40, + .subpage_code = 5, + .page_length = cpu_to_be16(sizeof(gr_m_pg) - 4), + .descr = { + { .st_enble = 1 }, + { .st_enble = 1 }, + { .st_enble = 1 }, + { .st_enble = 1 }, + { .st_enble = 1 }, + { .st_enble = 0 }, + } + }; + + BUILD_BUG_ON(sizeof(struct grouping_m_pg) != + 16 + MAXIMUM_NUMBER_OF_STREAMS * 16); + memcpy(p, &gr_m_pg, sizeof(gr_m_pg)); + if (1 == pcontrol) { + /* There are no changeable values so clear from byte 4 on. */ + memset(p + 4, 0, sizeof(gr_m_pg) - 4); + } + return sizeof(gr_m_pg); +} static int resp_iec_m_pg(unsigned char *p, int pcontrol, int target) { /* Informational Exceptions control mode page for mode_sense */ @@ -2708,6 +2746,10 @@ static int resp_mode_sense(struct scsi_cmnd *scp, ap = arr + offset; } + /* + * N.B. 
If len>0 before resp_*_pg() call, then form of that call should be: + * len += resp_*_pg(ap + len, pcontrol, target); + */ switch (pcode) { case 0x1: /* Read-Write error recovery page, direct access */ if (subpcode > 0x0 && subpcode < 0xff) @@ -2742,9 +2784,20 @@ static int resp_mode_sense(struct scsi_cmnd *scp, } break; case 0xa: /* Control Mode page, all devices */ - if (subpcode > 0x0 && subpcode < 0xff) + switch (subpcode) { + case 0: + len = resp_ctrl_m_pg(ap, pcontrol, target); + break; + case 0x05: + len = resp_grouping_m_pg(ap, pcontrol, target); + break; + case 0xff: + len = resp_ctrl_m_pg(ap, pcontrol, target); + len += resp_grouping_m_pg(ap + len, pcontrol, target); + break; + default: goto bad_subpcode; - len = resp_ctrl_m_pg(ap, pcontrol, target); + } offset += len; break; case 0x19: /* if spc==1 then sas phy, control+discover */ @@ -2778,6 +2831,8 @@ static int resp_mode_sense(struct scsi_cmnd *scp, len += resp_caching_pg(ap + len, pcontrol, target); } len += resp_ctrl_m_pg(ap + len, pcontrol, target); + if (0xff == subpcode) + len += resp_grouping_m_pg(ap + len, pcontrol, target); len += resp_sas_sf_m_pg(ap + len, pcontrol, target); if (0xff == subpcode) { len += resp_sas_pcd_m_spg(ap + len, pcontrol, target, From ad620becda436fc02e50e5f6fe01de1d1f3794c9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:44 -0800 Subject: [PATCH 095/496] scsi: scsi_debug: Implement GET STREAM STATUS Implement the GET STREAM STATUS SCSI command. Report that the first five stream indexes correspond to permanent streams. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-19-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index ccf59b3e7602..497045e54300 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -533,6 +533,8 @@ static int resp_write_scat(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_start_stop(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_readcap16(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_get_lba_status(struct scsi_cmnd *, struct sdebug_dev_info *); +static int resp_get_stream_status(struct scsi_cmnd *scp, + struct sdebug_dev_info *devip); static int resp_report_tgtpgs(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_unmap(struct scsi_cmnd *, struct sdebug_dev_info *); static int resp_rsup_opcodes(struct scsi_cmnd *, struct sdebug_dev_info *); @@ -607,6 +609,9 @@ static const struct opcode_info_t sa_in_16_iarr[] = { {0, 0x9e, 0x12, F_SA_LOW | F_D_IN, resp_get_lba_status, NULL, {16, 0x12, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0xc7} }, /* GET LBA STATUS(16) */ + {0, 0x9e, 0x16, F_SA_LOW | F_D_IN, resp_get_stream_status, NULL, + {16, 0x16, 0, 0, 0xff, 0xff, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, + 0, 0} }, /* GET STREAM STATUS */ }; static const struct opcode_info_t vl_iarr[] = { /* VARIABLE LENGTH */ @@ -4573,6 +4578,51 @@ static int resp_get_lba_status(struct scsi_cmnd *scp, return fill_from_dev_buffer(scp, arr, SDEBUG_GET_LBA_STATUS_LEN); } +static int resp_get_stream_status(struct scsi_cmnd *scp, + struct sdebug_dev_info *devip) +{ + u16 starting_stream_id, stream_id; + const u8 *cmd = scp->cmnd; + u32 alloc_len, offset; + u8 arr[256] = {}; + struct 
scsi_stream_status_header *h = (void *)arr; + + starting_stream_id = get_unaligned_be16(cmd + 4); + alloc_len = get_unaligned_be32(cmd + 10); + + if (alloc_len < 8) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 10, -1); + return check_condition_result; + } + + if (starting_stream_id >= MAXIMUM_NUMBER_OF_STREAMS) { + mk_sense_invalid_fld(scp, SDEB_IN_CDB, 4, -1); + return check_condition_result; + } + + /* + * The GET STREAM STATUS command only reports status information + * about open streams. Treat the non-permanent stream as open. + */ + put_unaligned_be16(MAXIMUM_NUMBER_OF_STREAMS, + &h->number_of_open_streams); + + for (offset = 8, stream_id = starting_stream_id; + offset + 8 <= min_t(u32, alloc_len, sizeof(arr)) && + stream_id < MAXIMUM_NUMBER_OF_STREAMS; + offset += 8, stream_id++) { + struct scsi_stream_status *stream_status = (void *)arr + offset; + + stream_status->perm = stream_id < PERMANENT_STREAM_COUNT; + put_unaligned_be16(stream_id, + &stream_status->stream_identifier); + stream_status->rel_lifetime = stream_id + 1; + } + put_unaligned_be32(offset - 8, &h->len); /* PARAMETER DATA LENGTH */ + + return fill_from_dev_buffer(scp, arr, min(offset, alloc_len)); +} + static int resp_sync_cache(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { From af180c0880f9df14be31807f0bb0fa6f0d34a943 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 30 Jan 2024 13:48:45 -0800 Subject: [PATCH 096/496] scsi: scsi_debug: Maintain write statistics per group number Track per GROUP NUMBER how many write commands have been processed. Make this information available in sysfs. Reset these statistics if any data is written into the sysfs attribute. Note: SCSI devices should only interpret the information in the GROUP NUMBER field as a stream identifier if the ST_ENBLE bit has been set to one. This patch follows a simpler approach: count the number of writes per GROUP NUMBER whether or not the group number represents a stream identifier. Cc: Martin K. Petersen Cc: Douglas Gilbert Tested-by: Douglas Gilbert Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240130214911.1863909-20-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 49 +++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 497045e54300..ce52d580d1aa 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -902,6 +902,8 @@ static int sdeb_zbc_nr_conv = DEF_ZBC_NR_CONV_ZONES; static int submit_queues = DEF_SUBMIT_QUEUES; /* > 1 for multi-queue (mq) */ static int poll_queues; /* iouring iopoll interface.*/ +static atomic_long_t writes_by_group_number[64]; + static char sdebug_proc_name[] = MY_NAME; static const char *my_name = MY_NAME; @@ -3377,7 +3379,8 @@ static inline struct sdeb_store_info *devip2sip(struct sdebug_dev_info *devip, /* Returns number of bytes copied or -1 if error. 
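 * The group_number argument added to this function below is used only for
 * the per-group write statistics (writes_by_group_number); it does not
 * change how the data itself is copied.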
*/ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, - u32 sg_skip, u64 lba, u32 num, bool do_write) + u32 sg_skip, u64 lba, u32 num, bool do_write, + u8 group_number) { int ret; u64 block, rest = 0; @@ -3396,6 +3399,10 @@ static int do_device_access(struct sdeb_store_info *sip, struct scsi_cmnd *scp, return 0; if (scp->sc_data_direction != dir) return -1; + + if (do_write && group_number < ARRAY_SIZE(writes_by_group_number)) + atomic_long_inc(&writes_by_group_number[group_number]); + fsp = sip->storep; block = do_div(lba, sdebug_store_sectors); @@ -3769,7 +3776,7 @@ static int resp_read_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, false); + ret = do_device_access(sip, scp, 0, lba, num, false, 0); sdeb_read_unlock(sip); if (unlikely(ret == -1)) return DID_ERROR << 16; @@ -3954,6 +3961,7 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { bool check_prot; u32 num; + u8 group = 0; u32 ei_lba; int ret; u64 lba; @@ -3965,11 +3973,13 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) ei_lba = 0; lba = get_unaligned_be64(cmd + 2); num = get_unaligned_be32(cmd + 10); + group = cmd[14] & 0x3f; check_prot = true; break; case WRITE_10: ei_lba = 0; lba = get_unaligned_be32(cmd + 2); + group = cmd[6] & 0x3f; num = get_unaligned_be16(cmd + 7); check_prot = true; break; @@ -3984,15 +3994,18 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) ei_lba = 0; lba = get_unaligned_be32(cmd + 2); num = get_unaligned_be32(cmd + 6); + group = cmd[6] & 0x3f; check_prot = true; break; case 0x53: /* XDWRITEREAD(10) */ ei_lba = 0; lba = get_unaligned_be32(cmd + 2); + group = cmd[6] & 0x1f; num = get_unaligned_be16(cmd + 7); check_prot = false; break; default: /* assume WRITE(32) */ + group = cmd[6] & 0x3f; lba = get_unaligned_be64(cmd + 12); ei_lba = get_unaligned_be32(cmd + 20); num = get_unaligned_be32(cmd + 28); @@ -4047,7 +4060,7 @@ static int resp_write_dt0(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } } - ret = do_device_access(sip, scp, 0, lba, num, true); + ret = do_device_access(sip, scp, 0, lba, num, true, group); if (unlikely(scsi_debug_lbp())) map_region(sip, lba, num); /* If ZBC zone then bump its write pointer */ @@ -4099,12 +4112,14 @@ static int resp_write_scat(struct scsi_cmnd *scp, u32 lb_size = sdebug_sector_size; u32 ei_lba; u64 lba; + u8 group; int ret, res; bool is_16; static const u32 lrd_size = 32; /* + parameter list header size */ if (cmd[0] == VARIABLE_LENGTH_CMD) { is_16 = false; + group = cmd[6] & 0x3f; wrprotect = (cmd[10] >> 5) & 0x7; lbdof = get_unaligned_be16(cmd + 12); num_lrd = get_unaligned_be16(cmd + 16); @@ -4115,6 +4130,7 @@ static int resp_write_scat(struct scsi_cmnd *scp, lbdof = get_unaligned_be16(cmd + 4); num_lrd = get_unaligned_be16(cmd + 8); bt_len = get_unaligned_be32(cmd + 10); + group = cmd[14] & 0x3f; if (unlikely(have_dif_prot)) { if (sdebug_dif == T10_PI_TYPE2_PROTECTION && wrprotect) { @@ -4203,7 +4219,7 @@ static int resp_write_scat(struct scsi_cmnd *scp, } } - ret = do_device_access(sip, scp, sg_off, lba, num, true); + ret = do_device_access(sip, scp, sg_off, lba, num, true, group); /* If ZBC zone then bump its write pointer */ if (sdebug_dev_is_zoned(devip)) zbc_inc_wp(devip, lba, num); @@ -7298,6 +7314,30 @@ static ssize_t tur_ms_to_ready_show(struct device_driver *ddp, char *buf) } static DRIVER_ATTR_RO(tur_ms_to_ready); +static ssize_t 
group_number_stats_show(struct device_driver *ddp, char *buf) +{ + char *p = buf, *end = buf + PAGE_SIZE; + int i; + + for (i = 0; i < ARRAY_SIZE(writes_by_group_number); i++) + p += scnprintf(p, end - p, "%d %ld\n", i, + atomic_long_read(&writes_by_group_number[i])); + + return p - buf; +} + +static ssize_t group_number_stats_store(struct device_driver *ddp, + const char *buf, size_t count) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(writes_by_group_number); i++) + atomic_long_set(&writes_by_group_number[i], 0); + + return count; +} +static DRIVER_ATTR_RW(group_number_stats); + /* Note: The following array creates attribute files in the /sys/bus/pseudo/drivers/scsi_debug directory. The advantage of these files (over those found in the /sys/module/scsi_debug/parameters @@ -7344,6 +7384,7 @@ static struct attribute *sdebug_drv_attrs[] = { &driver_attr_cdb_len.attr, &driver_attr_tur_ms_to_ready.attr, &driver_attr_zbc.attr, + &driver_attr_group_number_stats.attr, NULL, }; ATTRIBUTE_GROUPS(sdebug_drv); From 886516fae2b73a1578600e95631436785f3e44d6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 27 Jan 2024 01:00:54 -0800 Subject: [PATCH 097/496] RISC-V: fix check for zvkb with tip-of-tree clang LLVM commit 8e01042da9d3 ("[RISCV] Add missing dependency check for Zvkb (#79467)") broke the check used by the TOOLCHAIN_HAS_VECTOR_CRYPTO kconfig symbol because it made zvkb start depending on v or zve*. Fix this by specifying both v and zvkb when checking for support for zvkb. Signed-off-by: Eric Biggers Reviewed-by: Nathan Chancellor Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240127090055.124336-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 27b1f44caa18..0bfcfec67ed5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -591,7 +591,7 @@ config TOOLCHAIN_HAS_ZBB # extensions, including Zvk*, Zvbb, and Zvbc. LLVM added all of these at once. # binutils added all except Zvkb, then added Zvkb. So we just check for Zvkb. config TOOLCHAIN_HAS_VECTOR_CRYPTO - def_bool $(as-instr, .option arch$(comma) +zvkb) + def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb) depends on AS_HAS_OPTION_ARCH config RISCV_ISA_ZBB From f4282a4303dc7eada2543ea9b87e2b9bdec9344b Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:16 +0100 Subject: [PATCH 098/496] rtc: ds1511: drop useless checks The RTC core will always pass a valid rtc_tm, it is unnecessary to check the validity of its members, especially with an open coded version of rtc_valid_tm(). 
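For reference, the core-side validation that makes the open-coded checks redundant amounts to range checks of the following kind. This is a simplified sketch rather than the verbatim kernel code; the helper name is made up for illustration, the real implementation is rtc_valid_tm() and may differ in detail, and the sketch assumes the declarations from <linux/rtc.h> (struct rtc_time, rtc_month_days()).

#include <linux/rtc.h>

/* Roughly the range checks the RTC core applies to a struct rtc_time. */
static int rtc_tm_in_range_sketch(const struct rtc_time *tm)
{
	/* tm_year counts years since 1900, tm_mon is zero-based */
	if (tm->tm_year < 70 ||
	    (unsigned int)tm->tm_mon >= 12 ||
	    tm->tm_mday < 1 ||
	    tm->tm_mday > rtc_month_days(tm->tm_mon, tm->tm_year + 1900) ||
	    (unsigned int)tm->tm_hour >= 24 ||
	    (unsigned int)tm->tm_min >= 60 ||
	    (unsigned int)tm->tm_sec >= 60)
		return -EINVAL;

	return 0;
}

Because the core performs this kind of validation before invoking the driver's set_time operation, the checks removed in the diff below can never trigger.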
Link: https://lore.kernel.org/r/20240227230431.1837717-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 1109cad83838..87c52d20d31a 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -182,9 +182,6 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) if (rtc_tm->tm_year < 1900) rtc_tm->tm_year += 1900; - if (rtc_tm->tm_year < 1970) - return -EINVAL; - yrs = rtc_tm->tm_year % 100; cen = rtc_tm->tm_year / 100; mon = rtc_tm->tm_mon + 1; /* tm_mon starts at zero */ @@ -194,15 +191,6 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) min = rtc_tm->tm_min; sec = rtc_tm->tm_sec; - if ((mon > 12) || (day == 0)) - return -EINVAL; - - if (day > rtc_month_days(rtc_tm->tm_mon, rtc_tm->tm_year)) - return -EINVAL; - - if ((hrs >= 24) || (min >= 60) || (sec >= 60)) - return -EINVAL; - /* * each register is a different number of valid bits */ From 4e7a9e2ea2f1fc145a56dff998bc8215525b0a1e Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:17 +0100 Subject: [PATCH 099/496] rtc: ds1511: drop useless computation All the callers of ds1511_rtc_set_time will use the same epoch for tm_year which is defined as the number of years minus 1900 since POSIX.1-2001. Link: https://lore.kernel.org/r/20240227230431.1837717-2-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 87c52d20d31a..a646bcf9cd56 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -166,24 +166,13 @@ ds1511_wdog_disable(void) } #endif -/* - * set the rtc chip's idea of the time. - * stupidly, some callers call with year unmolested; - * and some call with year = year - 1900. thanks. - */ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) { u8 mon, day, dow, hrs, min, sec, yrs, cen; unsigned long flags; - /* - * won't have to change this for a while - */ - if (rtc_tm->tm_year < 1900) - rtc_tm->tm_year += 1900; - yrs = rtc_tm->tm_year % 100; - cen = rtc_tm->tm_year / 100; + cen = 19 + rtc_tm->tm_year / 100; mon = rtc_tm->tm_mon + 1; /* tm_mon starts at zero */ day = rtc_tm->tm_mday; dow = rtc_tm->tm_wday & 0x7; /* automatic BCD */ From 3f31f1729d56cb31c0d45f88e7ea0f6c749c0a92 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:18 +0100 Subject: [PATCH 100/496] rtc: ds1511: drop dead code The watchdog part of the code is not reachable and should be reimplemented properly as a watchdog driver. Link: https://lore.kernel.org/r/20240227230431.1837717-3-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 44 ---------------------------------------- 1 file changed, 44 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index a646bcf9cd56..fe8dbad51c88 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -122,50 +122,6 @@ rtc_enable_update(void) rtc_write((rtc_read(RTC_CMD) | RTC_TE), RTC_CMD); } -/* - * #define DS1511_WDOG_RESET_SUPPORT - * - * Uncomment this if you want to use these routines in - * some platform code. 
- */ -#ifdef DS1511_WDOG_RESET_SUPPORT -/* - * just enough code to set the watchdog timer so that it - * will reboot the system - */ -void -ds1511_wdog_set(unsigned long deciseconds) -{ - /* - * the wdog timer can take 99.99 seconds - */ - deciseconds %= 10000; - /* - * set the wdog values in the wdog registers - */ - rtc_write(bin2bcd(deciseconds % 100), DS1511_WD_MSEC); - rtc_write(bin2bcd(deciseconds / 100), DS1511_WD_SEC); - /* - * set wdog enable and wdog 'steering' bit to issue a reset - */ - rtc_write(rtc_read(RTC_CMD) | DS1511_WDE | DS1511_WDS, RTC_CMD); -} - -void -ds1511_wdog_disable(void) -{ - /* - * clear wdog enable and wdog 'steering' bits - */ - rtc_write(rtc_read(RTC_CMD) & ~(DS1511_WDE | DS1511_WDS), RTC_CMD); - /* - * clear the wdog counter - */ - rtc_write(0, DS1511_WD_MSEC); - rtc_write(0, DS1511_WD_SEC); -} -#endif - static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) { u8 mon, day, dow, hrs, min, sec, yrs, cen; From 8f973799c3526545bdc9ecdfce675e6bb5b4c51a Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:19 +0100 Subject: [PATCH 101/496] rtc: ds1511: drop useless enum Use regular defines for register offsets instead of the enum that doesn't bring any benefit. Link: https://lore.kernel.org/r/20240227230431.1837717-4-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 120 ++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 71 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index fe8dbad51c88..8e57c1395cf3 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -22,26 +22,24 @@ #include #include -enum ds1511reg { - DS1511_SEC = 0x0, - DS1511_MIN = 0x1, - DS1511_HOUR = 0x2, - DS1511_DOW = 0x3, - DS1511_DOM = 0x4, - DS1511_MONTH = 0x5, - DS1511_YEAR = 0x6, - DS1511_CENTURY = 0x7, - DS1511_AM1_SEC = 0x8, - DS1511_AM2_MIN = 0x9, - DS1511_AM3_HOUR = 0xa, - DS1511_AM4_DATE = 0xb, - DS1511_WD_MSEC = 0xc, - DS1511_WD_SEC = 0xd, - DS1511_CONTROL_A = 0xe, - DS1511_CONTROL_B = 0xf, - DS1511_RAMADDR_LSB = 0x10, - DS1511_RAMDATA = 0x13 -}; +#define DS1511_SEC 0x0 +#define DS1511_MIN 0x1 +#define DS1511_HOUR 0x2 +#define DS1511_DOW 0x3 +#define DS1511_DOM 0x4 +#define DS1511_MONTH 0x5 +#define DS1511_YEAR 0x6 +#define DS1511_CENTURY 0x7 +#define DS1511_AM1_SEC 0x8 +#define DS1511_AM2_MIN 0x9 +#define DS1511_AM3_HOUR 0xa +#define DS1511_AM4_DATE 0xb +#define DS1511_WD_MSEC 0xc +#define DS1511_WD_SEC 0xd +#define DS1511_CONTROL_A 0xe +#define DS1511_CONTROL_B 0xf +#define DS1511_RAMADDR_LSB 0x10 +#define DS1511_RAMDATA 0x13 #define DS1511_BLF1 0x80 #define DS1511_BLF2 0x40 @@ -61,26 +59,6 @@ enum ds1511reg { #define DS1511_WDS 0x01 #define DS1511_RAM_MAX 0x100 -#define RTC_CMD DS1511_CONTROL_B -#define RTC_CMD1 DS1511_CONTROL_A - -#define RTC_ALARM_SEC DS1511_AM1_SEC -#define RTC_ALARM_MIN DS1511_AM2_MIN -#define RTC_ALARM_HOUR DS1511_AM3_HOUR -#define RTC_ALARM_DATE DS1511_AM4_DATE - -#define RTC_SEC DS1511_SEC -#define RTC_MIN DS1511_MIN -#define RTC_HOUR DS1511_HOUR -#define RTC_DOW DS1511_DOW -#define RTC_DOM DS1511_DOM -#define RTC_MON DS1511_MONTH -#define RTC_YEAR DS1511_YEAR -#define RTC_CENTURY DS1511_CENTURY - -#define RTC_TIE DS1511_TIE -#define RTC_TE DS1511_TE - struct rtc_plat_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ @@ -105,7 +83,7 @@ rtc_write(uint8_t val, uint32_t reg) } static noinline uint8_t -rtc_read(enum ds1511reg reg) +rtc_read(uint32_t reg) { return 
readb(ds1511_base + (reg * reg_spacing)); } @@ -113,13 +91,13 @@ rtc_read(enum ds1511reg reg) static inline void rtc_disable_update(void) { - rtc_write((rtc_read(RTC_CMD) & ~RTC_TE), RTC_CMD); + rtc_write((rtc_read(DS1511_CONTROL_B) & ~DS1511_TE), DS1511_CONTROL_B); } static void rtc_enable_update(void) { - rtc_write((rtc_read(RTC_CMD) | RTC_TE), RTC_CMD); + rtc_write((rtc_read(DS1511_CONTROL_B) | DS1511_TE), DS1511_CONTROL_B); } static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) @@ -149,14 +127,14 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm) spin_lock_irqsave(&ds1511_lock, flags); rtc_disable_update(); - rtc_write(cen, RTC_CENTURY); - rtc_write(yrs, RTC_YEAR); - rtc_write((rtc_read(RTC_MON) & 0xe0) | mon, RTC_MON); - rtc_write(day, RTC_DOM); - rtc_write(hrs, RTC_HOUR); - rtc_write(min, RTC_MIN); - rtc_write(sec, RTC_SEC); - rtc_write(dow, RTC_DOW); + rtc_write(cen, DS1511_CENTURY); + rtc_write(yrs, DS1511_YEAR); + rtc_write((rtc_read(DS1511_MONTH) & 0xe0) | mon, DS1511_MONTH); + rtc_write(day, DS1511_DOM); + rtc_write(hrs, DS1511_HOUR); + rtc_write(min, DS1511_MIN); + rtc_write(sec, DS1511_SEC); + rtc_write(dow, DS1511_DOW); rtc_enable_update(); spin_unlock_irqrestore(&ds1511_lock, flags); @@ -171,14 +149,14 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) spin_lock_irqsave(&ds1511_lock, flags); rtc_disable_update(); - rtc_tm->tm_sec = rtc_read(RTC_SEC) & 0x7f; - rtc_tm->tm_min = rtc_read(RTC_MIN) & 0x7f; - rtc_tm->tm_hour = rtc_read(RTC_HOUR) & 0x3f; - rtc_tm->tm_mday = rtc_read(RTC_DOM) & 0x3f; - rtc_tm->tm_wday = rtc_read(RTC_DOW) & 0x7; - rtc_tm->tm_mon = rtc_read(RTC_MON) & 0x1f; - rtc_tm->tm_year = rtc_read(RTC_YEAR) & 0x7f; - century = rtc_read(RTC_CENTURY); + rtc_tm->tm_sec = rtc_read(DS1511_SEC) & 0x7f; + rtc_tm->tm_min = rtc_read(DS1511_MIN) & 0x7f; + rtc_tm->tm_hour = rtc_read(DS1511_HOUR) & 0x3f; + rtc_tm->tm_mday = rtc_read(DS1511_DOM) & 0x3f; + rtc_tm->tm_wday = rtc_read(DS1511_DOW) & 0x7; + rtc_tm->tm_mon = rtc_read(DS1511_MONTH) & 0x1f; + rtc_tm->tm_year = rtc_read(DS1511_YEAR) & 0x7f; + century = rtc_read(DS1511_CENTURY); rtc_enable_update(); spin_unlock_irqrestore(&ds1511_lock, flags); @@ -220,18 +198,18 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) spin_lock_irqsave(&pdata->lock, flags); rtc_write(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ? 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, - RTC_ALARM_DATE); + DS1511_AM4_DATE); rtc_write(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ? 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, - RTC_ALARM_HOUR); + DS1511_AM3_HOUR); rtc_write(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ? 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, - RTC_ALARM_MIN); + DS1511_AM2_MIN); rtc_write(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ? 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, - RTC_ALARM_SEC); - rtc_write(rtc_read(RTC_CMD) | (pdata->irqen ? RTC_TIE : 0), RTC_CMD); - rtc_read(RTC_CMD1); /* clear interrupts */ + DS1511_AM1_SEC); + rtc_write(rtc_read(DS1511_CONTROL_B) | (pdata->irqen ? 
DS1511_TIE : 0), DS1511_CONTROL_B); + rtc_read(DS1511_CONTROL_A); /* clear interrupts */ spin_unlock_irqrestore(&pdata->lock, flags); } @@ -281,9 +259,9 @@ ds1511_interrupt(int irq, void *dev_id) /* * read and clear interrupt */ - if (rtc_read(RTC_CMD1) & DS1511_IRQF) { + if (rtc_read(DS1511_CONTROL_A) & DS1511_IRQF) { events = RTC_IRQF; - if (rtc_read(RTC_ALARM_SEC) & 0x80) + if (rtc_read(DS1511_AM1_SEC) & 0x80) events |= RTC_UF; else events |= RTC_AF; @@ -366,8 +344,8 @@ static int ds1511_rtc_probe(struct platform_device *pdev) /* * turn on the clock and the crystal, etc. */ - rtc_write(DS1511_BME, RTC_CMD); - rtc_write(0, RTC_CMD1); + rtc_write(DS1511_BME, DS1511_CONTROL_B); + rtc_write(0, DS1511_CONTROL_A); /* * clear the wdog counter */ @@ -381,7 +359,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) /* * check for a dying bat-tree */ - if (rtc_read(RTC_CMD1) & DS1511_BLF1) + if (rtc_read(DS1511_CONTROL_A) & DS1511_BLF1) dev_warn(&pdev->dev, "voltage-low detected.\n"); spin_lock_init(&pdata->lock); @@ -404,7 +382,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) * then by all means, set it */ if (pdata->irq > 0) { - rtc_read(RTC_CMD1); + rtc_read(DS1511_CONTROL_A); if (devm_request_irq(&pdev->dev, pdata->irq, ds1511_interrupt, IRQF_SHARED, pdev->name, pdev) < 0) { From 22e1b2c7a4e81b33a88e6739787d823a630506bf Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:20 +0100 Subject: [PATCH 102/496] rtc: ds1511: fix function definition Use proper style for function definition. Link: https://lore.kernel.org/r/20240227230431.1837717-5-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 8e57c1395cf3..1765f76dda58 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -76,26 +76,22 @@ static DEFINE_SPINLOCK(ds1511_lock); static __iomem char *ds1511_base; static u32 reg_spacing = 1; -static noinline void -rtc_write(uint8_t val, uint32_t reg) +static noinline void rtc_write(uint8_t val, uint32_t reg) { writeb(val, ds1511_base + (reg * reg_spacing)); } -static noinline uint8_t -rtc_read(uint32_t reg) +static noinline uint8_t rtc_read(uint32_t reg) { return readb(ds1511_base + (reg * reg_spacing)); } -static inline void -rtc_disable_update(void) +static inline void rtc_disable_update(void) { rtc_write((rtc_read(DS1511_CONTROL_B) & ~DS1511_TE), DS1511_CONTROL_B); } -static void -rtc_enable_update(void) +static void rtc_enable_update(void) { rtc_write((rtc_read(DS1511_CONTROL_B) | DS1511_TE), DS1511_CONTROL_B); } @@ -190,8 +186,7 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) * date/hours/mins/secs matches. the ds1511 has many more * permutations, but the kernel doesn't. 
*/ -static void -ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) +static void ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) { unsigned long flags; @@ -213,8 +208,7 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) spin_unlock_irqrestore(&pdata->lock, flags); } -static int -ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_plat_data *pdata = dev_get_drvdata(dev); @@ -232,8 +226,7 @@ ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -static int -ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +static int ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_plat_data *pdata = dev_get_drvdata(dev); @@ -248,8 +241,7 @@ ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) return 0; } -static irqreturn_t -ds1511_interrupt(int irq, void *dev_id) +static irqreturn_t ds1511_interrupt(int irq, void *dev_id) { struct platform_device *pdev = dev_id; struct rtc_plat_data *pdata = platform_get_drvdata(pdev); From 6529ab38c8a57debc58dea483711b4bfec0c7b98 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:21 +0100 Subject: [PATCH 103/496] rtc: ds1511: remove incomplete UIE support There is no way to enable UIE in the driver, drop RTC_UF support. Link: https://lore.kernel.org/r/20240227230431.1837717-6-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 1765f76dda58..4ac8988d4124 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -191,17 +191,13 @@ static void ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) unsigned long flags; spin_lock_irqsave(&pdata->lock, flags); - rtc_write(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, + rtc_write(pdata->alrm_mday < 0 ? 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, DS1511_AM4_DATE); - rtc_write(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, + rtc_write(pdata->alrm_hour < 0 ? 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, DS1511_AM3_HOUR); - rtc_write(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, + rtc_write(pdata->alrm_min < 0 ? 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, DS1511_AM2_MIN); - rtc_write(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ? - 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, + rtc_write(pdata->alrm_sec < 0 ? 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, DS1511_AM1_SEC); rtc_write(rtc_read(DS1511_CONTROL_B) | (pdata->irqen ? 
DS1511_TIE : 0), DS1511_CONTROL_B); rtc_read(DS1511_CONTROL_A); /* clear interrupts */ @@ -252,11 +248,7 @@ static irqreturn_t ds1511_interrupt(int irq, void *dev_id) * read and clear interrupt */ if (rtc_read(DS1511_CONTROL_A) & DS1511_IRQF) { - events = RTC_IRQF; - if (rtc_read(DS1511_AM1_SEC) & 0x80) - events |= RTC_UF; - else - events |= RTC_AF; + events = RTC_IRQF | RTC_AF; rtc_update_irq(pdata->rtc, 1, events); } spin_unlock(&pdata->lock); From 434c9d03ea0db6bb8b1aebb6950781a0496028dd Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:22 +0100 Subject: [PATCH 104/496] rtc: ds1511: remove ds1511_rtc_update_alarm ds1511_rtc_update_alarm is called twice but one of the call is overkill as it only has to enable or disable the alarm instead of updating all the alarm registers. Merge it in its main call site and introduce a new finction to enable or disable the alarm. Link: https://lore.kernel.org/r/20240227230431.1837717-7-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 52 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 4ac8988d4124..b0dfdda2c8fc 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -178,35 +178,15 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm) return 0; } -/* - * write the alarm register settings - * - * we only have the use to interrupt every second, otherwise - * known as the update interrupt, or the interrupt if the whole - * date/hours/mins/secs matches. the ds1511 has many more - * permutations, but the kernel doesn't. - */ -static void ds1511_rtc_update_alarm(struct rtc_plat_data *pdata) +static void ds1511_rtc_alarm_enable(unsigned int enabled) { - unsigned long flags; - - spin_lock_irqsave(&pdata->lock, flags); - rtc_write(pdata->alrm_mday < 0 ? 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, - DS1511_AM4_DATE); - rtc_write(pdata->alrm_hour < 0 ? 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, - DS1511_AM3_HOUR); - rtc_write(pdata->alrm_min < 0 ? 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, - DS1511_AM2_MIN); - rtc_write(pdata->alrm_sec < 0 ? 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, - DS1511_AM1_SEC); - rtc_write(rtc_read(DS1511_CONTROL_B) | (pdata->irqen ? DS1511_TIE : 0), DS1511_CONTROL_B); - rtc_read(DS1511_CONTROL_A); /* clear interrupts */ - spin_unlock_irqrestore(&pdata->lock, flags); + rtc_write(rtc_read(DS1511_CONTROL_B) | (enabled ? DS1511_TIE : 0), DS1511_CONTROL_B); } static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_plat_data *pdata = dev_get_drvdata(dev); + unsigned long flags; if (pdata->irq <= 0) return -EINVAL; @@ -218,7 +198,20 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) if (alrm->enabled) pdata->irqen |= RTC_AF; - ds1511_rtc_update_alarm(pdata); + spin_lock_irqsave(&pdata->lock, flags); + rtc_write(pdata->alrm_mday < 0 ? 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, + DS1511_AM4_DATE); + rtc_write(pdata->alrm_hour < 0 ? 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, + DS1511_AM3_HOUR); + rtc_write(pdata->alrm_min < 0 ? 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, + DS1511_AM2_MIN); + rtc_write(pdata->alrm_sec < 0 ? 
0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, + DS1511_AM1_SEC); + ds1511_rtc_alarm_enable(alrm->enabled); + + rtc_read(DS1511_CONTROL_A); /* clear interrupts */ + spin_unlock_irqrestore(&pdata->lock, flags); + return 0; } @@ -258,14 +251,15 @@ static irqreturn_t ds1511_interrupt(int irq, void *dev_id) static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct rtc_plat_data *pdata = dev_get_drvdata(dev); + unsigned long flags; if (pdata->irq <= 0) return -EINVAL; - if (enabled) - pdata->irqen |= RTC_AF; - else - pdata->irqen &= ~RTC_AF; - ds1511_rtc_update_alarm(pdata); + + spin_lock_irqsave(&pdata->lock, flags); + ds1511_rtc_alarm_enable(enabled); + spin_unlock_irqrestore(&pdata->lock, flags); + return 0; } From f891570be594de6d5404354ab2c2ecbdc508cbd7 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:23 +0100 Subject: [PATCH 105/496] rtc: ds1511: let the core know when alarm are not supported Instead of failing function calls, let the core know alarms are not supported so it can fail early and avoid unnecessary calls. Link: https://lore.kernel.org/r/20240227230431.1837717-8-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index b0dfdda2c8fc..c81f464e6a50 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -188,9 +188,6 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct rtc_plat_data *pdata = dev_get_drvdata(dev); unsigned long flags; - if (pdata->irq <= 0) - return -EINVAL; - pdata->alrm_mday = alrm->time.tm_mday; pdata->alrm_hour = alrm->time.tm_hour; pdata->alrm_min = alrm->time.tm_min; @@ -219,9 +216,6 @@ static int ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_plat_data *pdata = dev_get_drvdata(dev); - if (pdata->irq <= 0) - return -EINVAL; - alrm->time.tm_mday = pdata->alrm_mday < 0 ? 0 : pdata->alrm_mday; alrm->time.tm_hour = pdata->alrm_hour < 0 ? 0 : pdata->alrm_hour; alrm->time.tm_min = pdata->alrm_min < 0 ? 0 : pdata->alrm_min; @@ -253,9 +247,6 @@ static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) struct rtc_plat_data *pdata = dev_get_drvdata(dev); unsigned long flags; - if (pdata->irq <= 0) - return -EINVAL; - spin_lock_irqsave(&pdata->lock, flags); ds1511_rtc_alarm_enable(enabled); spin_unlock_irqrestore(&pdata->lock, flags); @@ -349,12 +340,6 @@ static int ds1511_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1511_rtc_ops; - ret = devm_rtc_register_device(pdata->rtc); - if (ret) - return ret; - - devm_rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); - /* * if the platform has an interrupt in mind for this device, * then by all means, set it @@ -369,6 +354,15 @@ static int ds1511_rtc_probe(struct platform_device *pdev) } } + if (pdata->irq == 0) + clear_bit(RTC_FEATURE_ALARM, pdata->rtc->features); + + ret = devm_rtc_register_device(pdata->rtc); + if (ret) + return ret; + + devm_rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); + return 0; } From d949f040a0dc54ad76aa1f84391a62077dea096c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:24 +0100 Subject: [PATCH 106/496] rtc: ds1511: remove partial alarm support The RTC core will always provide an alarm with all its members set, it is not necessary to support partial alarms. 
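As an aside on the register writes this leaves behind: below is a small, stand-alone sketch of the BCD encoding and masking applied to each alarm field, with bin2bcd() re-implemented the way the kernel helper in <linux/bcd.h> behaves. The "& 0x3f" / "& 0x7f" masks keep bit 7 clear; the dropped partial-alarm code wrote 0x80 into that bit to mark a field as unused. This is only an illustration, not driver code.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's bin2bcd() helper from <linux/bcd.h>. */
static uint8_t bin2bcd(uint8_t val)
{
	return ((val / 10) << 4) | (val % 10);
}

int main(void)
{
	/* Example alarm: day-of-month 28, 23:59:07. */
	uint8_t mday = 28, hour = 23, min = 59, sec = 7;

	printf("AM4_DATE=0x%02x AM3_HOUR=0x%02x AM2_MIN=0x%02x AM1_SEC=0x%02x\n",
	       bin2bcd(mday) & 0x3f,	/* date and hour: 6 BCD bits used */
	       bin2bcd(hour) & 0x3f,
	       bin2bcd(min) & 0x7f,	/* minutes and seconds: 7 BCD bits */
	       bin2bcd(sec) & 0x7f);
	return 0;
}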
Link: https://lore.kernel.org/r/20240227230431.1837717-9-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index c81f464e6a50..d5d59a948c59 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -196,14 +196,10 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) pdata->irqen |= RTC_AF; spin_lock_irqsave(&pdata->lock, flags); - rtc_write(pdata->alrm_mday < 0 ? 0x80 : bin2bcd(pdata->alrm_mday) & 0x3f, - DS1511_AM4_DATE); - rtc_write(pdata->alrm_hour < 0 ? 0x80 : bin2bcd(pdata->alrm_hour) & 0x3f, - DS1511_AM3_HOUR); - rtc_write(pdata->alrm_min < 0 ? 0x80 : bin2bcd(pdata->alrm_min) & 0x7f, - DS1511_AM2_MIN); - rtc_write(pdata->alrm_sec < 0 ? 0x80 : bin2bcd(pdata->alrm_sec) & 0x7f, - DS1511_AM1_SEC); + rtc_write(bin2bcd(pdata->alrm_mday) & 0x3f, DS1511_AM4_DATE); + rtc_write(bin2bcd(pdata->alrm_hour) & 0x3f, DS1511_AM3_HOUR); + rtc_write(bin2bcd(pdata->alrm_min) & 0x7f, DS1511_AM2_MIN); + rtc_write(bin2bcd(pdata->alrm_sec) & 0x7f, DS1511_AM1_SEC); ds1511_rtc_alarm_enable(alrm->enabled); rtc_read(DS1511_CONTROL_A); /* clear interrupts */ From 418501fd53f178eaa3e737804fc1e88e5f04343c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:25 +0100 Subject: [PATCH 107/496] rtc: ds1511: implement ds1511_rtc_read_alarm properly ds1511_rtc_read_alarm was useless as it is only called at boot time so the alarm members of pdata have not yet been set. Read the actual registers so there is a chance to get a meaningful value. Then, drop the alarm related members of pdata as they are not used anymore. Link: https://lore.kernel.org/r/20240227230431.1837717-10-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index d5d59a948c59..c3b1376b731f 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -63,11 +63,6 @@ struct rtc_plat_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ int irq; - unsigned int irqen; - int alrm_sec; - int alrm_min; - int alrm_hour; - int alrm_mday; spinlock_t lock; }; @@ -188,18 +183,11 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct rtc_plat_data *pdata = dev_get_drvdata(dev); unsigned long flags; - pdata->alrm_mday = alrm->time.tm_mday; - pdata->alrm_hour = alrm->time.tm_hour; - pdata->alrm_min = alrm->time.tm_min; - pdata->alrm_sec = alrm->time.tm_sec; - if (alrm->enabled) - pdata->irqen |= RTC_AF; - spin_lock_irqsave(&pdata->lock, flags); - rtc_write(bin2bcd(pdata->alrm_mday) & 0x3f, DS1511_AM4_DATE); - rtc_write(bin2bcd(pdata->alrm_hour) & 0x3f, DS1511_AM3_HOUR); - rtc_write(bin2bcd(pdata->alrm_min) & 0x7f, DS1511_AM2_MIN); - rtc_write(bin2bcd(pdata->alrm_sec) & 0x7f, DS1511_AM1_SEC); + rtc_write(bin2bcd(alrm->time.tm_mday) & 0x3f, DS1511_AM4_DATE); + rtc_write(bin2bcd(alrm->time.tm_hour) & 0x3f, DS1511_AM3_HOUR); + rtc_write(bin2bcd(alrm->time.tm_min) & 0x7f, DS1511_AM2_MIN); + rtc_write(bin2bcd(alrm->time.tm_sec) & 0x7f, DS1511_AM1_SEC); ds1511_rtc_alarm_enable(alrm->enabled); rtc_read(DS1511_CONTROL_A); /* clear interrupts */ @@ -210,13 +198,12 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) static int ds1511_rtc_read_alarm(struct device 
*dev, struct rtc_wkalrm *alrm) { - struct rtc_plat_data *pdata = dev_get_drvdata(dev); + alrm->time.tm_mday = bcd2bin(rtc_read(DS1511_AM4_DATE) & 0x3f); + alrm->time.tm_hour = bcd2bin(rtc_read(DS1511_AM3_HOUR) & 0x3f); + alrm->time.tm_min = bcd2bin(rtc_read(DS1511_AM2_MIN) & 0x7f); + alrm->time.tm_sec = bcd2bin(rtc_read(DS1511_AM1_SEC) & 0x7f); + alrm->enabled = !!(rtc_read(DS1511_CONTROL_B) & DS1511_TIE); - alrm->time.tm_mday = pdata->alrm_mday < 0 ? 0 : pdata->alrm_mday; - alrm->time.tm_hour = pdata->alrm_hour < 0 ? 0 : pdata->alrm_hour; - alrm->time.tm_min = pdata->alrm_min < 0 ? 0 : pdata->alrm_min; - alrm->time.tm_sec = pdata->alrm_sec < 0 ? 0 : pdata->alrm_sec; - alrm->enabled = (pdata->irqen & RTC_AF) ? 1 : 0; return 0; } From 19922e879997858eec0cb5ce275b48834dcbc209 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:26 +0100 Subject: [PATCH 108/496] rtc: ds1511: rename pdata pdata is not actually about patform_data, rename it to something local to the driver. Link: https://lore.kernel.org/r/20240227230431.1837717-11-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 58 ++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index c3b1376b731f..39efd432e8ea 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -59,7 +59,7 @@ #define DS1511_WDS 0x01 #define DS1511_RAM_MAX 0x100 -struct rtc_plat_data { +struct ds1511_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ int irq; @@ -180,10 +180,10 @@ static void ds1511_rtc_alarm_enable(unsigned int enabled) static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { - struct rtc_plat_data *pdata = dev_get_drvdata(dev); + struct ds1511_data *ds1511 = dev_get_drvdata(dev); unsigned long flags; - spin_lock_irqsave(&pdata->lock, flags); + spin_lock_irqsave(&ds1511->lock, flags); rtc_write(bin2bcd(alrm->time.tm_mday) & 0x3f, DS1511_AM4_DATE); rtc_write(bin2bcd(alrm->time.tm_hour) & 0x3f, DS1511_AM3_HOUR); rtc_write(bin2bcd(alrm->time.tm_min) & 0x7f, DS1511_AM2_MIN); @@ -191,7 +191,7 @@ static int ds1511_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) ds1511_rtc_alarm_enable(alrm->enabled); rtc_read(DS1511_CONTROL_A); /* clear interrupts */ - spin_unlock_irqrestore(&pdata->lock, flags); + spin_unlock_irqrestore(&ds1511->lock, flags); return 0; } @@ -210,29 +210,29 @@ static int ds1511_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) static irqreturn_t ds1511_interrupt(int irq, void *dev_id) { struct platform_device *pdev = dev_id; - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); + struct ds1511_data *ds1511 = platform_get_drvdata(pdev); unsigned long events = 0; - spin_lock(&pdata->lock); + spin_lock(&ds1511->lock); /* * read and clear interrupt */ if (rtc_read(DS1511_CONTROL_A) & DS1511_IRQF) { events = RTC_IRQF | RTC_AF; - rtc_update_irq(pdata->rtc, 1, events); + rtc_update_irq(ds1511->rtc, 1, events); } - spin_unlock(&pdata->lock); + spin_unlock(&ds1511->lock); return events ? 
IRQ_HANDLED : IRQ_NONE; } static int ds1511_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { - struct rtc_plat_data *pdata = dev_get_drvdata(dev); + struct ds1511_data *ds1511 = dev_get_drvdata(dev); unsigned long flags; - spin_lock_irqsave(&pdata->lock, flags); + spin_lock_irqsave(&ds1511->lock, flags); ds1511_rtc_alarm_enable(enabled); - spin_unlock_irqrestore(&pdata->lock, flags); + spin_unlock_irqrestore(&ds1511->lock, flags); return 0; } @@ -271,7 +271,7 @@ static int ds1511_nvram_write(void *priv, unsigned int pos, void *buf, static int ds1511_rtc_probe(struct platform_device *pdev) { - struct rtc_plat_data *pdata; + struct ds1511_data *ds1511; int ret = 0; struct nvmem_config ds1511_nvmem_cfg = { .name = "ds1511_nvram", @@ -283,15 +283,15 @@ static int ds1511_rtc_probe(struct platform_device *pdev) .priv = &pdev->dev, }; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) + ds1511 = devm_kzalloc(&pdev->dev, sizeof(*ds1511), GFP_KERNEL); + if (!ds1511) return -ENOMEM; ds1511_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(ds1511_base)) return PTR_ERR(ds1511_base); - pdata->ioaddr = ds1511_base; - pdata->irq = platform_get_irq(pdev, 0); + ds1511->ioaddr = ds1511_base; + ds1511->irq = platform_get_irq(pdev, 0); /* * turn on the clock and the crystal, etc. @@ -314,37 +314,37 @@ static int ds1511_rtc_probe(struct platform_device *pdev) if (rtc_read(DS1511_CONTROL_A) & DS1511_BLF1) dev_warn(&pdev->dev, "voltage-low detected.\n"); - spin_lock_init(&pdata->lock); - platform_set_drvdata(pdev, pdata); + spin_lock_init(&ds1511->lock); + platform_set_drvdata(pdev, ds1511); - pdata->rtc = devm_rtc_allocate_device(&pdev->dev); - if (IS_ERR(pdata->rtc)) - return PTR_ERR(pdata->rtc); + ds1511->rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(ds1511->rtc)) + return PTR_ERR(ds1511->rtc); - pdata->rtc->ops = &ds1511_rtc_ops; + ds1511->rtc->ops = &ds1511_rtc_ops; /* * if the platform has an interrupt in mind for this device, * then by all means, set it */ - if (pdata->irq > 0) { + if (ds1511->irq > 0) { rtc_read(DS1511_CONTROL_A); - if (devm_request_irq(&pdev->dev, pdata->irq, ds1511_interrupt, + if (devm_request_irq(&pdev->dev, ds1511->irq, ds1511_interrupt, IRQF_SHARED, pdev->name, pdev) < 0) { dev_warn(&pdev->dev, "interrupt not available.\n"); - pdata->irq = 0; + ds1511->irq = 0; } } - if (pdata->irq == 0) - clear_bit(RTC_FEATURE_ALARM, pdata->rtc->features); + if (ds1511->irq == 0) + clear_bit(RTC_FEATURE_ALARM, ds1511->rtc->features); - ret = devm_rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(ds1511->rtc); if (ret) return ret; - devm_rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); + devm_rtc_nvmem_register(ds1511->rtc, &ds1511_nvmem_cfg); return 0; } From 29c411f242ea3873c65efa95cfd3780b4b16f278 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:04:27 +0100 Subject: [PATCH 109/496] rtc: ds1511: drop inline/noinline hints There is no reason to not let the compiler optimise those functions as it wants. 
Link: https://lore.kernel.org/r/20240227230431.1837717-12-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 39efd432e8ea..edb8d90812c5 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -71,17 +71,17 @@ static DEFINE_SPINLOCK(ds1511_lock); static __iomem char *ds1511_base; static u32 reg_spacing = 1; -static noinline void rtc_write(uint8_t val, uint32_t reg) +static void rtc_write(uint8_t val, uint32_t reg) { writeb(val, ds1511_base + (reg * reg_spacing)); } -static noinline uint8_t rtc_read(uint32_t reg) +static uint8_t rtc_read(uint32_t reg) { return readb(ds1511_base + (reg * reg_spacing)); } -static inline void rtc_disable_update(void) +static void rtc_disable_update(void) { rtc_write((rtc_read(DS1511_CONTROL_B) & ~DS1511_TE), DS1511_CONTROL_B); } From e40512a4f5cbfbbe034efc2d556283a470f97bf5 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:13:54 +0100 Subject: [PATCH 110/496] rtc: ds1511: set range The ds1511 leap year calculation fails in 2100. Link: https://lore.kernel.org/r/20240227231356.1840523-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index edb8d90812c5..6869d28d34cc 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -322,6 +322,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) return PTR_ERR(ds1511->rtc); ds1511->rtc->ops = &ds1511_rtc_ops; + ds1511->rtc->range_max = RTC_TIMESTAMP_END_2099; /* * if the platform has an interrupt in mind for this device, From 50891bd19f1ec23fa8cefbabc1f0e55d94925933 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 28 Feb 2024 00:13:55 +0100 Subject: [PATCH 111/496] rtc: ds1511: set alarm offset limit The ds1511 can only support alarms up to a month in the future (which we currently limit to 28 days). Link: https://lore.kernel.org/r/20240227231356.1840523-2-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-ds1511.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 6869d28d34cc..8b087d9556be 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -323,6 +323,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) ds1511->rtc->ops = &ds1511_rtc_ops; ds1511->rtc->range_max = RTC_TIMESTAMP_END_2099; + ds1511->rtc->alarm_offset_max = 28 * 24 * 60 * 60 - 1; /* * if the platform has an interrupt in mind for this device, From 787bcc982cd6f4fe81d86f31435de31db8502bd2 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 27 Feb 2024 22:18:32 +0100 Subject: [PATCH 112/496] rtc: pcf8523: add suspend handlers for alarm IRQ Ensure the RTC is able to wake up the system from suspend. 
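For exercising this path, a quick user-space check (not part of the patch) can arm a wake alarm through the standard RTC character-device ioctls and then suspend. The sketch assumes the PCF8523 is registered as /dev/rtc0 and that its interrupt is wired up as a wakeup source.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

int main(void)
{
	struct rtc_time now;
	struct rtc_wkalrm alm;
	int fd = open("/dev/rtc0", O_RDONLY);

	if (fd < 0 || ioctl(fd, RTC_RD_TIME, &now) < 0) {
		perror("rtc0");
		return 1;
	}

	memset(&alm, 0, sizeof(alm));
	alm.time = now;
	alm.time.tm_min = (now.tm_min + 1) % 60;	/* fire roughly one minute ahead */
	if (now.tm_min + 1 >= 60)			/* crude hour carry, day rollover ignored */
		alm.time.tm_hour = (now.tm_hour + 1) % 24;
	alm.enabled = 1;

	if (ioctl(fd, RTC_WKALM_SET, &alm) < 0) {
		perror("RTC_WKALM_SET");
		return 1;
	}
	close(fd);

	/* Then e.g. "echo mem > /sys/power/state"; with device_may_wakeup()
	 * true for the RTC, the alarm interrupt should resume the system. */
	return 0;
}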
Link: https://lore.kernel.org/r/20240227211833.1820800-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-pcf8523.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index d1efde3e7a80..98b77f790b0c 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -370,6 +370,30 @@ static int pcf8523_rtc_set_offset(struct device *dev, long offset) return regmap_write(pcf8523->regmap, PCF8523_REG_OFFSET, value); } +#ifdef CONFIG_PM_SLEEP +static int pcf8523_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + if (client->irq > 0 && device_may_wakeup(dev)) + enable_irq_wake(client->irq); + + return 0; +} + +static int pcf8523_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + + if (client->irq > 0 && device_may_wakeup(dev)) + disable_irq_wake(client->irq); + + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(pcf8523_pm, pcf8523_suspend, pcf8523_resume); + static const struct rtc_class_ops pcf8523_rtc_ops = { .read_time = pcf8523_rtc_read_time, .set_time = pcf8523_rtc_set_time, @@ -487,6 +511,7 @@ static struct i2c_driver pcf8523_driver = { .driver = { .name = "rtc-pcf8523", .of_match_table = pcf8523_of_match, + .pm = &pcf8523_pm, }, .probe = pcf8523_probe, .id_table = pcf8523_id, From e8c0498505b0495f2b67df22c2c28970e69a54d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 22 Jan 2024 13:49:49 +0100 Subject: [PATCH 113/496] dt-bindings: rtc: convert MT2717 RTC to the json-schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps validating DTS files. Introduced changes: 1. Reworded title 2. Dropper redundant properties descriptions 3. 
Added required #include and adjusted "reg" in example Signed-off-by: Rafał Miłecki Reviewed-by: Matthias Brugger Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240122124949.29577-1-zajec5@gmail.com Signed-off-by: Alexandre Belloni --- .../bindings/rtc/mediatek,mt2712-rtc.yaml | 39 +++++++++++++++++++ .../devicetree/bindings/rtc/rtc-mt2712.txt | 14 ------- 2 files changed, 39 insertions(+), 14 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/mediatek,mt2712-rtc.yaml delete mode 100644 Documentation/devicetree/bindings/rtc/rtc-mt2712.txt diff --git a/Documentation/devicetree/bindings/rtc/mediatek,mt2712-rtc.yaml b/Documentation/devicetree/bindings/rtc/mediatek,mt2712-rtc.yaml new file mode 100644 index 000000000000..75624ddf6d4d --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/mediatek,mt2712-rtc.yaml @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/mediatek,mt2712-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT2712 on-SoC RTC + +allOf: + - $ref: rtc.yaml# + +maintainers: + - Ran Bi + +properties: + compatible: + const: mediatek,mt2712-rtc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - reg + - interrupts + +unevaluatedProperties: false + +examples: + - | + #include + + rtc@10011000 { + compatible = "mediatek,mt2712-rtc"; + reg = <0x10011000 0x1000>; + interrupts = ; + }; diff --git a/Documentation/devicetree/bindings/rtc/rtc-mt2712.txt b/Documentation/devicetree/bindings/rtc/rtc-mt2712.txt deleted file mode 100644 index c33d87e5e753..000000000000 --- a/Documentation/devicetree/bindings/rtc/rtc-mt2712.txt +++ /dev/null @@ -1,14 +0,0 @@ -Device-Tree bindings for MediaTek SoC based RTC - -Required properties: -- compatible : Should be "mediatek,mt2712-rtc" : for MT2712 SoC -- reg : Specifies base physical address and size of the registers; -- interrupts : Should contain the interrupt for RTC alarm; - -Example: - -rtc: rtc@10011000 { - compatible = "mediatek,mt2712-rtc"; - reg = <0 0x10011000 0 0x1000>; - interrupts = ; -}; From aef3952ec13fd2f882225ea36fc7c369f681d682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 23 Jan 2024 13:50:43 +0100 Subject: [PATCH 114/496] dt-bindings: rtc: convert MT7622 RTC to the json-schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps validating DTS files. Introduced changes: 1. Reworded title 2. Dropper redundant properties descriptions 3. 
Added required #include-s and adjusted "reg" in example Signed-off-by: Rafał Miłecki Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20240123125043.27192-1-zajec5@gmail.com Signed-off-by: Alexandre Belloni --- .../bindings/rtc/mediatek,mt7622-rtc.yaml | 52 +++++++++++++++++++ .../devicetree/bindings/rtc/rtc-mt7622.txt | 21 -------- 2 files changed, 52 insertions(+), 21 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/mediatek,mt7622-rtc.yaml delete mode 100644 Documentation/devicetree/bindings/rtc/rtc-mt7622.txt diff --git a/Documentation/devicetree/bindings/rtc/mediatek,mt7622-rtc.yaml b/Documentation/devicetree/bindings/rtc/mediatek,mt7622-rtc.yaml new file mode 100644 index 000000000000..e74dfc161cfc --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/mediatek,mt7622-rtc.yaml @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/mediatek,mt7622-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek MT7622 on-SoC RTC + +allOf: + - $ref: rtc.yaml# + +maintainers: + - Sean Wang + +properties: + compatible: + items: + - const: mediatek,mt7622-rtc + - const: mediatek,soc-rtc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + const: rtc + +required: + - reg + - interrupts + - clocks + - clock-names + +unevaluatedProperties: false + +examples: + - | + #include + #include + + rtc@10212800 { + compatible = "mediatek,mt7622-rtc", "mediatek,soc-rtc"; + reg = <0x10212800 0x200>; + interrupts = ; + clocks = <&topckgen CLK_TOP_RTC>; + clock-names = "rtc"; + }; diff --git a/Documentation/devicetree/bindings/rtc/rtc-mt7622.txt b/Documentation/devicetree/bindings/rtc/rtc-mt7622.txt deleted file mode 100644 index 09fe8f51476f..000000000000 --- a/Documentation/devicetree/bindings/rtc/rtc-mt7622.txt +++ /dev/null @@ -1,21 +0,0 @@ -Device-Tree bindings for MediaTek SoC based RTC - -Required properties: -- compatible : Should be - "mediatek,mt7622-rtc", "mediatek,soc-rtc" : for MT7622 SoC -- reg : Specifies base physical address and size of the registers; -- interrupts : Should contain the interrupt for RTC alarm; -- clocks : Specifies list of clock specifiers, corresponding to - entries in clock-names property; -- clock-names : Should contain "rtc" entries - -Example: - -rtc: rtc@10212800 { - compatible = "mediatek,mt7622-rtc", - "mediatek,soc-rtc"; - reg = <0 0x10212800 0 0x200>; - interrupts = ; - clocks = <&topckgen CLK_TOP_RTC>; - clock-names = "rtc"; -}; From 16816e6a36939d2a3f70b80ac5a0ba0a8a155523 Mon Sep 17 00:00:00 2001 From: Varshini Rajendran Date: Fri, 23 Feb 2024 22:55:52 +0530 Subject: [PATCH 115/496] dt-bindings: at91rm9260-rtt: add sam9x7 compatible Add compatible for SAM9X7 RTT. 
Signed-off-by: Varshini Rajendran Acked-by: Conor Dooley Link: https://lore.kernel.org/r/20240223172552.672094-1-varshini.rajendran@microchip.com Signed-off-by: Alexandre Belloni --- .../devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml b/Documentation/devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml index b80b85c394ac..a7f6c1d1a08a 100644 --- a/Documentation/devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml +++ b/Documentation/devicetree/bindings/rtc/atmel,at91sam9260-rtt.yaml @@ -19,7 +19,9 @@ properties: - items: - const: atmel,at91sam9260-rtt - items: - - const: microchip,sam9x60-rtt + - enum: + - microchip,sam9x60-rtt + - microchip,sam9x7-rtt - const: atmel,at91sam9260-rtt - items: - const: microchip,sama7g5-rtt From 3100fd1aa8e4b7fdb3f352614e3b55bd44c07efe Mon Sep 17 00:00:00 2001 From: Curtis Klein Date: Wed, 21 Feb 2024 17:11:29 -0800 Subject: [PATCH 116/496] rtc: m41t80: Use the unified property API get the wakeup-source property This allows both ACPI and Device Tree systems to specify the m41t80 as a wakeup-source. Signed-off-by: Curtis Klein Link: https://lore.kernel.org/r/20240222011129.79241-1-curtis.klein@hpe.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-m41t80.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 866489ad56d6..0013bff0447d 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -909,10 +909,7 @@ static int m41t80_probe(struct i2c_client *client) if (IS_ERR(m41t80_data->rtc)) return PTR_ERR(m41t80_data->rtc); -#ifdef CONFIG_OF - wakeup_source = of_property_read_bool(client->dev.of_node, - "wakeup-source"); -#endif + wakeup_source = device_property_read_bool(&client->dev, "wakeup-source"); if (client->irq > 0) { unsigned long irqflags = IRQF_TRIGGER_LOW; From 626e2b54645a4e9b58bcb479a565b4a06ff76a26 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Mon, 19 Feb 2024 15:29:45 +0100 Subject: [PATCH 117/496] dt-bindings: rtc: abx80x: convert to yaml Convert the abracon abx80x rtc text bindings to dt-schema format. In addition to the text description reference generic interrupts properties and add an example. Signed-off-by: Josua Mayer Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240219-rtc-abracon-convert-bindings-v7-1-aca4fc3b8cec@solid-run.com Signed-off-by: Alexandre Belloni --- .../bindings/rtc/abracon,abx80x.txt | 31 -------- .../bindings/rtc/abracon,abx80x.yaml | 79 +++++++++++++++++++ 2 files changed, 79 insertions(+), 31 deletions(-) delete mode 100644 Documentation/devicetree/bindings/rtc/abracon,abx80x.txt create mode 100644 Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml diff --git a/Documentation/devicetree/bindings/rtc/abracon,abx80x.txt b/Documentation/devicetree/bindings/rtc/abracon,abx80x.txt deleted file mode 100644 index 2405e35a1bc0..000000000000 --- a/Documentation/devicetree/bindings/rtc/abracon,abx80x.txt +++ /dev/null @@ -1,31 +0,0 @@ -Abracon ABX80X I2C ultra low power RTC/Alarm chip - -The Abracon ABX80X family consist of the ab0801, ab0803, ab0804, ab0805, ab1801, -ab1803, ab1804 and ab1805. The ab0805 is the superset of ab080x and the ab1805 -is the superset of ab180x. 
- -Required properties: - - - "compatible": should one of: - "abracon,abx80x" - "abracon,ab0801" - "abracon,ab0803" - "abracon,ab0804" - "abracon,ab0805" - "abracon,ab1801" - "abracon,ab1803" - "abracon,ab1804" - "abracon,ab1805" - "microcrystal,rv1805" - Using "abracon,abx80x" will enable chip autodetection. - - "reg": I2C bus address of the device - -Optional properties: - -The abx804 and abx805 have a trickle charger that is able to charge the -connected battery or supercap. Both the following properties have to be defined -and valid to enable charging: - - - "abracon,tc-diode": should be "standard" (0.6V) or "schottky" (0.3V) - - "abracon,tc-resistor": should be <0>, <3>, <6> or <11>. 0 disables the output - resistor, the other values are in kOhm. diff --git a/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml b/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml new file mode 100644 index 000000000000..58dbbca27deb --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/abracon,abx80x.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Abracon ABX80X I2C ultra low power RTC/Alarm chip + +maintainers: + - linux-rtc@vger.kernel.org + +allOf: + - $ref: rtc.yaml# + +properties: + compatible: + description: + The wildcard 'abracon,abx80x' may be used to support a mix + of different abracon rtc`s. In this case the driver + must perform auto-detection from ID register. + enum: + - abracon,abx80x + - abracon,ab0801 + - abracon,ab0803 + - abracon,ab0804 + - abracon,ab0805 + - abracon,ab1801 + - abracon,ab1803 + - abracon,ab1804 + - abracon,ab1805 + - microcrystal,rv1805 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + abracon,tc-diode: + description: + Trickle-charge diode type. + Required to enable charging backup battery. + + Supported are 'standard' diodes with a 0.6V drop + and 'schottky' diodes with a 0.3V drop. + $ref: /schemas/types.yaml#/definitions/string + enum: + - standard + - schottky + + abracon,tc-resistor: + description: + Trickle-charge resistor value in kOhm. + Required to enable charging backup battery. + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 3, 6, 11] + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + rtc@69 { + compatible = "abracon,abx80x"; + reg = <0x69>; + abracon,tc-diode = "schottky"; + abracon,tc-resistor = <3>; + interrupts = <44 IRQ_TYPE_EDGE_FALLING>; + }; + }; From 544c42f798e1651dcb04fb0395219bf0f1c2607e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 Feb 2024 21:02:58 -0800 Subject: [PATCH 118/496] rtc: mt6397: select IRQ_DOMAIN instead of depending on it IRQ_DOMAIN is a hidden (not user visible) symbol. Users cannot set it directly thru "make *config", so drivers should select it instead of depending on it if they need it. Relying on it being set for a dependency is risky. Consistently using "select" or "depends on" can also help reduce Kconfig circular dependency issues. Therefore, change the use of "depends on" for IRQ_DOMAIN to "select" for RTC_DRV_MT6397. 
Fixes: 04d3ba70a3c9 ("rtc: mt6397: add IRQ domain dependency") Cc: Arnd Bergmann Cc: Eddie Huang Cc: Sean Wang Cc: Matthias Brugger Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mediatek@lists.infradead.org Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: linux-rtc@vger.kernel.org Cc: Marc Zyngier Cc: Philipp Zabel Cc: Peter Rosin Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20240213050258.6167-1-rdunlap@infradead.org Signed-off-by: Alexandre Belloni --- drivers/rtc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index e37a4341f442..c63e32d012f2 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1858,7 +1858,8 @@ config RTC_DRV_MT2712 config RTC_DRV_MT6397 tristate "MediaTek PMIC based RTC" - depends on MFD_MT6397 || (COMPILE_TEST && IRQ_DOMAIN) + depends on MFD_MT6397 || COMPILE_TEST + select IRQ_DOMAIN help This selects the MediaTek(R) RTC driver. RTC is part of MediaTek MT6397 PMIC. You should enable MT6397 PMIC MFD before select From c12e67e076cbcb86fd9c3cb003a344ec684138a6 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 19 Feb 2024 11:16:15 +0200 Subject: [PATCH 119/496] rtc: max31335: fix interrupt status reg Fix the register value comparison in the `max31335_volatile_reg` function for the interrupt status register. MAX31335_STATUS1 macro definition corresponds to the actual interrupt status register. Fixes: dedaf03b99d6 ("rtc: max31335: add driver support") Signed-off-by: Antoniu Miclaus Reviewed-by: Nuno Sa Link: https://lore.kernel.org/r/20240219091616.24480-1-antoniu.miclaus@analog.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-max31335.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-max31335.c b/drivers/rtc/rtc-max31335.c index 402fda8fd548..a2441e5c2c74 100644 --- a/drivers/rtc/rtc-max31335.c +++ b/drivers/rtc/rtc-max31335.c @@ -204,7 +204,7 @@ static bool max31335_volatile_reg(struct device *dev, unsigned int reg) return true; /* interrupt status register */ - if (reg == MAX31335_INT_EN1_A1IE) + if (reg == MAX31335_STATUS1) return true; /* temperature registers */ From babfeb9cbe7ebc657bd5b3e4f9fde79f560b6acc Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 29 Feb 2024 23:21:27 +0100 Subject: [PATCH 120/496] rtc: nct3018y: fix possible NULL dereference alarm_enable and alarm_flag are allowed to be NULL but will be dereferenced later by the dev_dbg call. 
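The underlying hazard is the usual optional-out-parameter pattern. Below is a simplified stand-alone sketch mirroring the shape of nct3018y_get_alarm_mode(), with printf standing in for dev_dbg, showing why the combined debug print has to move inside the NULL-guarded branches:

#include <stdio.h>

/* Either out-pointer may legitimately be NULL. */
static int get_alarm_mode(int *alarm_enable, int *alarm_flag)
{
	if (alarm_enable) {
		*alarm_enable = 1;
		/* Safe: dereferenced only inside its own NULL check. */
		printf("alarm_enable:%x\n", *alarm_enable);
	}

	if (alarm_flag) {
		*alarm_flag = 0;
		printf("alarm_flag:%x\n", *alarm_flag);
	}

	/*
	 * The buggy variant printed both values down here unconditionally,
	 * dereferencing whichever pointer the caller had left as NULL.
	 */
	return 0;
}

int main(void)
{
	int enable;

	return get_alarm_mode(&enable, NULL);	/* crashes with the old layout */
}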
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202305180042.DEzW1pSd-lkp@intel.com/ Link: https://lore.kernel.org/r/20240229222127.1878176-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-nct3018y.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-nct3018y.c b/drivers/rtc/rtc-nct3018y.c index f488a189a465..076d8b99f913 100644 --- a/drivers/rtc/rtc-nct3018y.c +++ b/drivers/rtc/rtc-nct3018y.c @@ -102,6 +102,8 @@ static int nct3018y_get_alarm_mode(struct i2c_client *client, unsigned char *ala if (flags < 0) return flags; *alarm_enable = flags & NCT3018Y_BIT_AIE; + dev_dbg(&client->dev, "%s:alarm_enable:%x\n", __func__, *alarm_enable); + } if (alarm_flag) { @@ -110,11 +112,9 @@ static int nct3018y_get_alarm_mode(struct i2c_client *client, unsigned char *ala if (flags < 0) return flags; *alarm_flag = flags & NCT3018Y_BIT_AF; + dev_dbg(&client->dev, "%s:alarm_flag:%x\n", __func__, *alarm_flag); } - dev_dbg(&client->dev, "%s:alarm_enable:%x alarm_flag:%x\n", - __func__, *alarm_enable, *alarm_flag); - return 0; } From 1e60ac6b8b571be31d5bfe3dae08f384a720b1e5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 1 Mar 2024 15:59:07 +0100 Subject: [PATCH 121/496] MAINTAINERS: adjust file entry in ARM/Mediatek RTC DRIVER Commit e8c0498505b0 ("dt-bindings: rtc: convert MT2717 RTC to the json-schema") and commit aef3952ec13f ("dt-bindings: rtc: convert MT7622 RTC to the json-schema") convert rtc-mt{2712,7622}.txt to mediatek,mt{2712,7622}-rtc.yaml, but misses to adjust the file entries in MAINTAINERS. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Repair these file entries in ARM/Mediatek RTC DRIVER. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240301145907.32732-1-lukas.bulwahn@gmail.com Signed-off-by: Alexandre Belloni --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8d1052fa6a69..eeba9c259449 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2357,8 +2357,8 @@ M: Sean Wang L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) L: linux-mediatek@lists.infradead.org (moderated for non-subscribers) S: Maintained -F: Documentation/devicetree/bindings/rtc/rtc-mt2712.txt -F: Documentation/devicetree/bindings/rtc/rtc-mt7622.txt +F: Documentation/devicetree/bindings/rtc/mediatek,mt2712-rtc.yaml +F: Documentation/devicetree/bindings/rtc/mediatek,mt7622-rtc.yaml F: drivers/rtc/rtc-mt2712.c F: drivers/rtc/rtc-mt6397.c F: drivers/rtc/rtc-mt7622.c From 32a6be0858356190ac3bce4b75693549e1da2f16 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 5 Mar 2024 10:09:44 +0200 Subject: [PATCH 122/496] dt-bindings: rtc: abx80x: Improve checks on trickle charger constraints The abracon,tc-diode and abracon,tc-resistor DT properties are only valid for the ABx0804 and ABx0805. Furthermore, they must both be present, or neither of them must be specified. Add rules to check this. The generic abracon,abx08x compatible string doesn't indicate which chip variant is used, but performs auto-detection at runtime. It must this also allow the two above properties. 
Signed-off-by: Laurent Pinchart Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20240305080944.17991-1-laurent.pinchart@ideasonboard.com Signed-off-by: Alexandre Belloni --- .../bindings/rtc/abracon,abx80x.yaml | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml b/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml index 58dbbca27deb..355b0598411a 100644 --- a/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml +++ b/Documentation/devicetree/bindings/rtc/abracon,abx80x.yaml @@ -9,9 +9,6 @@ title: Abracon ABX80X I2C ultra low power RTC/Alarm chip maintainers: - linux-rtc@vger.kernel.org -allOf: - - $ref: rtc.yaml# - properties: compatible: description: @@ -55,10 +52,32 @@ properties: $ref: /schemas/types.yaml#/definitions/uint32 enum: [0, 3, 6, 11] +dependentRequired: + abracon,tc-diode: ["abracon,tc-resistor"] + abracon,tc-resistor: ["abracon,tc-diode"] + required: - compatible - reg +allOf: + - $ref: rtc.yaml# + - if: + properties: + compatible: + not: + contains: + enum: + - abracon,abx80x + - abracon,ab0804 + - abracon,ab1804 + - abracon,ab0805 + - abracon,ab1805 + then: + properties: + abracon,tc-diode: false + abracon,tc-resistor: false + unevaluatedProperties: false examples: From 6b6ca096115e5b7a85e8313f4e68a72d52db91b3 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 15:22:28 -0300 Subject: [PATCH 123/496] rtc: class: make rtc_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the rtc_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. 
Marliere Link: https://lore.kernel.org/r/20240305-class_cleanup-abelloni-v1-1-944c026137c8@marliere.net Signed-off-by: Alexandre Belloni --- drivers/rtc/class.c | 21 +++++++++++++-------- drivers/rtc/interface.c | 2 +- include/linux/rtc.h | 2 +- kernel/power/suspend_test.c | 2 +- kernel/time/alarmtimer.c | 2 +- 5 files changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 921ee1827974..e31fa0ad127e 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -21,7 +21,6 @@ #include "rtc-core.h" static DEFINE_IDA(rtc_ida); -struct class *rtc_class; static void rtc_device_release(struct device *dev) { @@ -199,6 +198,11 @@ static SIMPLE_DEV_PM_OPS(rtc_class_dev_pm_ops, rtc_suspend, rtc_resume); #define RTC_CLASS_DEV_PM_OPS NULL #endif +const struct class rtc_class = { + .name = "rtc", + .pm = RTC_CLASS_DEV_PM_OPS, +}; + /* Ensure the caller will set the id before releasing the device */ static struct rtc_device *rtc_allocate_device(void) { @@ -220,7 +224,7 @@ static struct rtc_device *rtc_allocate_device(void) rtc->irq_freq = 1; rtc->max_user_freq = 64; - rtc->dev.class = rtc_class; + rtc->dev.class = &rtc_class; rtc->dev.groups = rtc_get_dev_attribute_groups(); rtc->dev.release = rtc_device_release; @@ -475,13 +479,14 @@ EXPORT_SYMBOL_GPL(devm_rtc_device_register); static int __init rtc_init(void) { - rtc_class = class_create("rtc"); - if (IS_ERR(rtc_class)) { - pr_err("couldn't create class\n"); - return PTR_ERR(rtc_class); - } - rtc_class->pm = RTC_CLASS_DEV_PM_OPS; + int err; + + err = class_register(&rtc_class); + if (err) + return err; + rtc_dev_init(); + return 0; } subsys_initcall(rtc_init); diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 1b63111cdda2..5faafb4aa55c 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -696,7 +696,7 @@ struct rtc_device *rtc_class_open(const char *name) struct device *dev; struct rtc_device *rtc = NULL; - dev = class_find_device_by_name(rtc_class, name); + dev = class_find_device_by_name(&rtc_class, name); if (dev) rtc = to_rtc_device(dev); diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 5f8e438a0312..3f4d315aaec9 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -42,7 +42,7 @@ static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs) #include #include -extern struct class *rtc_class; +extern const struct class rtc_class; /* * For these RTC methods the device parameter is the physical device diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index b663a97f5867..d4856ec61570 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -201,7 +201,7 @@ static int __init test_suspend(void) } /* RTCs have initialized by now too ... can we use one? 
*/ - dev = class_find_device(rtc_class, NULL, NULL, has_wakealarm); + dev = class_find_device(&rtc_class, NULL, NULL, has_wakealarm); if (dev) { rtc = rtc_class_open(dev_name(dev)); put_device(dev); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4657cb8e8b1f..5abfa4390673 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -134,7 +134,7 @@ static struct class_interface alarmtimer_rtc_interface = { static int alarmtimer_rtc_interface_setup(void) { - alarmtimer_rtc_interface.class = rtc_class; + alarmtimer_rtc_interface.class = &rtc_class; return class_interface_register(&alarmtimer_rtc_interface); } static void alarmtimer_rtc_interface_remove(void) From f0109900462db14e3f213a41c7f14b350252c7e2 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 7 Mar 2024 10:43:46 +0100 Subject: [PATCH 124/496] dt-bindings: rtc: zynqmp: Add support for Versal/Versal NET SoCs Add support for Versal and Versal NET SoCs. Both of them should use the same IP core but differences can be in integration part that's why create separate compatible strings. Also describe optional power-domains property. It is optional because power domain doesn't need to be onwed by non secure firmware hence no access to control it via any driver. Signed-off-by: Michal Simek Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/5ecd775e6083f86aa744c4e9dfb7f6a13082c78a.1709804617.git.michal.simek@amd.com Signed-off-by: Alexandre Belloni --- .../devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml b/Documentation/devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml index d1f5eb996dba..01cc90fee81e 100644 --- a/Documentation/devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/xlnx,zynqmp-rtc.yaml @@ -18,7 +18,13 @@ allOf: properties: compatible: - const: xlnx,zynqmp-rtc + oneOf: + - const: xlnx,zynqmp-rtc + - items: + - enum: + - xlnx,versal-rtc + - xlnx,versal-net-rtc + - const: xlnx,zynqmp-rtc reg: maxItems: 1 @@ -48,6 +54,9 @@ properties: default: 0x198233 deprecated: true + power-domains: + maxItems: 1 + required: - compatible - reg From 34485c37ea931d4a086701de1d61a986b9707b7d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 7 Mar 2024 08:55:42 -0800 Subject: [PATCH 125/496] nvme: change shutdown timeout setting message User visible messages containing the word "timeout" can be alarming. This one from nvme is just reporting a potentially informative device configuration, and everything is working as designed. Change the text to report the less concerning "D3 entry latency", which is where this value comes from anyway. 
Reported-by: Len Brown Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2baf5786a92f..0dcaf3973dc4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3233,7 +3233,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (ctrl->shutdown_timeout != shutdown_timeout) dev_info(ctrl->device, - "Shutdown timeout set to %u seconds\n", + "D3 entry latency set to %u seconds\n", ctrl->shutdown_timeout); } else ctrl->shutdown_timeout = shutdown_timeout; From 0889d13b9e1cbef49e802ae09f3b516911ad82a1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Mar 2024 08:11:05 +0100 Subject: [PATCH 126/496] nvmet-tcp: do not continue for invalid icreq When the length check for an icreq sqe fails we should not continue processing but rather return immediately as all other contents of that sqe cannot be relied on. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index c8655fc5aa5b..022d17bd36bf 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -898,6 +898,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) pr_err("bad nvme-tcp pdu length (%d)\n", le32_to_cpu(icreq->hdr.plen)); nvmet_tcp_fatal_error(queue); + return -EPROTO; } if (icreq->pfv != NVME_TCP_PFV_1_0) { From 1843671f86e93311e12b7dde72736167a07158fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Mar 2024 09:51:10 +0100 Subject: [PATCH 127/496] nvme-apple: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index a480cdeac288..dd6ec0865141 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1532,7 +1532,7 @@ put_dev: return ret; } -static int apple_nvme_remove(struct platform_device *pdev) +static void apple_nvme_remove(struct platform_device *pdev) { struct apple_nvme *anv = platform_get_drvdata(pdev); @@ -1547,8 +1547,6 @@ static int apple_nvme_remove(struct platform_device *pdev) apple_rtkit_shutdown(anv->rtk); apple_nvme_detach_genpd(anv); - - return 0; } static void apple_nvme_shutdown(struct platform_device *pdev) @@ -1598,7 +1596,7 @@ static struct platform_driver apple_nvme_driver = { .pm = pm_sleep_ptr(&apple_nvme_pm_ops), }, .probe = apple_nvme_probe, - .remove = apple_nvme_remove, + .remove_new = apple_nvme_remove, .shutdown = apple_nvme_shutdown, }; module_platform_driver(apple_nvme_driver); From 8fc3b0f1f47b8b6dd2801deeccae59a3b46c4c39 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 28 Feb 2024 16:37:54 +0800 Subject: [PATCH 128/496] nvmet: add tracing of authentication commands Add nvme_fabrics_type_auth_send and nvme_fabrics_type_auth_receive to the nvme target's tracing facility. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 6ee1f3db81d0..ca537fae3730 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -176,6 +176,34 @@ static const char *nvmet_trace_fabrics_property_get(struct trace_seq *p, return ret; } +static const char *nvmet_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 tl = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u", + spsp0, spsp1, secp, tl); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 al = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u", + spsp0, spsp1, secp, al); + trace_seq_putc(p, 0); + return ret; +} + static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); @@ -195,6 +223,10 @@ const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, return nvmet_trace_fabrics_connect(p, spc); case nvme_fabrics_type_property_get: return nvmet_trace_fabrics_property_get(p, spc); + case nvme_fabrics_type_auth_send: + return nvmet_trace_fabrics_auth_send(p, spc); + case nvme_fabrics_type_auth_receive: + return nvmet_trace_fabrics_auth_receive(p, spc); default: return nvmet_trace_fabrics_common(p, spc); } From 2bc91743096756d7c97d10c7079617192211369b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 28 Feb 2024 16:37:55 +0800 Subject: [PATCH 129/496] nvmet: add tracing of zns commands Add nvme_cmd_zone_append, nvme_cmd_zone_mgmt_send and nvme_cmd_zone_mgmt_recv parse to nvme target tracing. 
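For reference, the mechanical shape of such a conversion on a generic platform driver looks like the sketch below; the names are generic and not taken from this driver:

#include <linux/module.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	/* Acquire resources, ideally via devm_* so remove stays trivial. */
	return 0;
}

/*
 * Old style: "static int foo_remove(...)" ending in "return 0;".  The
 * returned value was ignored by the core, so it never conveyed anything.
 */
static void foo_remove(struct platform_device *pdev)
{
	/* Release anything not covered by devm_*. */
}

static struct platform_driver foo_driver = {
	.probe		= foo_probe,
	.remove_new	= foo_remove,	/* void-returning variant */
	.driver		= {
		.name	= "foo",
	},
};
module_platform_driver(foo_driver);

MODULE_DESCRIPTION("Sketch of a .remove_new conversion");
MODULE_LICENSE("GPL");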
Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index ca537fae3730..8d1806a82887 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -119,6 +119,67 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, } } +static const char *nvmet_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) +{ + static const char * const zsa_strs[] = { + [0x01] = "close zone", + [0x02] = "finish zone", + [0x03] = "open zone", + [0x04] = "reset zone", + [0x05] = "offline zone", + [0x10] = "set zone descriptor extension" + }; + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + const char *zsa_str; + u8 zsa = cdw10[12]; + u8 all = cdw10[13]; + + if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa]) + zsa_str = zsa_strs[zsa]; + else + zsa_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u", + slba, zsa, zsa_str, all); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) +{ + static const char * const zrasf_strs[] = { + [0x00] = "list all zones", + [0x01] = "list the zones in the ZSE: Empty state", + [0x02] = "list the zones in the ZSIO: Implicitly Opened state", + [0x03] = "list the zones in the ZSEO: Explicitly Opened state", + [0x04] = "list the zones in the ZSC: Closed state", + [0x05] = "list the zones in the ZSF: Full state", + [0x06] = "list the zones in the ZSRO: Read Only state", + [0x07] = "list the zones in the ZSO: Offline state", + [0x09] = "list the zones that have the zone attribute" + }; + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 numd = get_unaligned_le32(&cdw10[8]); + u8 zra = cdw10[12]; + u8 zrasf = cdw10[13]; + const char *zrasf_str; + u8 pr = cdw10[14]; + + if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf]) + zrasf_str = zrasf_strs[zrasf]; + else + zrasf_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u", + slba, numd, zra, zrasf, zrasf_str, pr); + trace_seq_putc(p, 0); + + return ret; +} + const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10) { @@ -126,9 +187,14 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, case nvme_cmd_read: case nvme_cmd_write: case nvme_cmd_write_zeroes: + case nvme_cmd_zone_append: return nvmet_trace_read_write(p, cdw10); case nvme_cmd_dsm: return nvmet_trace_dsm(p, cdw10); + case nvme_cmd_zone_mgmt_send: + return nvmet_trace_zone_mgmt_send(p, cdw10); + case nvme_cmd_zone_mgmt_recv: + return nvmet_trace_zone_mgmt_recv(p, cdw10); default: return nvmet_trace_common(p, cdw10); } From 2c12932b8e65227030cd079d18ff6ad42d262491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 19 Feb 2024 08:46:29 +0100 Subject: [PATCH 130/496] siox: Don't pass the reference on a master in siox_master_register() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While it's technically fine to pass the ownership of the reference on a struct siox_master from the caller of siox_master_register() to the framework this is hard to use. Instead let the framework take its own reference (that is freed in siox_master_unregister()) and drop the bus driver's reference in its remove callback. 
Acked-by: Thorsten Scherer Link: https://lore.kernel.org/r/1e8d09d17848e58e8fc6a46278b5e8fb0cf4618a.1708328466.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/siox/siox-bus-gpio.c | 2 ++ drivers/siox/siox-core.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/siox/siox-bus-gpio.c b/drivers/siox/siox-bus-gpio.c index aeefeb725524..fdf20fe80059 100644 --- a/drivers/siox/siox-bus-gpio.c +++ b/drivers/siox/siox-bus-gpio.c @@ -149,6 +149,8 @@ static int siox_gpio_remove(struct platform_device *pdev) siox_master_unregister(master); + siox_master_put(master); + return 0; } diff --git a/drivers/siox/siox-core.c b/drivers/siox/siox-core.c index 561408583b2b..d4acab7036d6 100644 --- a/drivers/siox/siox-core.c +++ b/drivers/siox/siox-core.c @@ -717,6 +717,8 @@ int siox_master_register(struct siox_master *smaster) if (!smaster->pushpull) return -EINVAL; + get_device(&smaster->dev); + dev_set_name(&smaster->dev, "siox-%d", smaster->busno); mutex_init(&smaster->lock); From 9ecfbf70537fe1209d0d27b5378260eb3e473c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 19 Feb 2024 08:46:30 +0100 Subject: [PATCH 131/496] siox: Provide a devm variant of siox_master_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to simplify siox master drivers in the next step. Acked-by: Thorsten Scherer Link: https://lore.kernel.org/r/ad141dd22c7d95ad0bd347f257ce586e1afb22a4.1708328466.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/siox/siox-core.c | 25 +++++++++++++++++++++++++ drivers/siox/siox.h | 2 ++ 2 files changed, 27 insertions(+) diff --git a/drivers/siox/siox-core.c b/drivers/siox/siox-core.c index d4acab7036d6..86ce2f3cde91 100644 --- a/drivers/siox/siox-core.c +++ b/drivers/siox/siox-core.c @@ -707,6 +707,31 @@ struct siox_master *siox_master_alloc(struct device *dev, } EXPORT_SYMBOL_GPL(siox_master_alloc); +static void devm_siox_master_put(void *data) +{ + struct siox_master *smaster = data; + + siox_master_put(smaster); +} + +struct siox_master *devm_siox_master_alloc(struct device *dev, + size_t size) +{ + struct siox_master *smaster; + int ret; + + smaster = siox_master_alloc(dev, size); + if (!smaster) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_siox_master_put, smaster); + if (ret) + return NULL; + + return smaster; +} +EXPORT_SYMBOL_GPL(devm_siox_master_alloc); + int siox_master_register(struct siox_master *smaster) { int ret; diff --git a/drivers/siox/siox.h b/drivers/siox/siox.h index f08b43b713c5..b227e18b697a 100644 --- a/drivers/siox/siox.h +++ b/drivers/siox/siox.h @@ -45,5 +45,7 @@ static inline void siox_master_put(struct siox_master *smaster) put_device(&smaster->dev); } +struct siox_master *devm_siox_master_alloc(struct device *dev, size_t size); + int siox_master_register(struct siox_master *smaster); void siox_master_unregister(struct siox_master *smaster); From 91d5bb579c3666f09c160cc3c19c0456bff03cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 19 Feb 2024 08:46:31 +0100 Subject: [PATCH 132/496] siox: Provide a devm variant of siox_master_register() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows to simplify siox master drivers in the next step. 
Acked-by: Thorsten Scherer Link: https://lore.kernel.org/r/e961dfb3e94f106b16f5eacff2110fc7fa0cab13.1708328466.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/siox/siox-core.c | 19 +++++++++++++++++++ drivers/siox/siox.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/drivers/siox/siox-core.c b/drivers/siox/siox-core.c index 86ce2f3cde91..5be32e756d02 100644 --- a/drivers/siox/siox-core.c +++ b/drivers/siox/siox-core.c @@ -795,6 +795,25 @@ void siox_master_unregister(struct siox_master *smaster) } EXPORT_SYMBOL_GPL(siox_master_unregister); +static void devm_siox_master_unregister(void *data) +{ + struct siox_master *smaster = data; + + siox_master_unregister(smaster); +} + +int devm_siox_master_register(struct device *dev, struct siox_master *smaster) +{ + int ret; + + ret = siox_master_register(smaster); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, devm_siox_master_unregister, smaster); +} +EXPORT_SYMBOL_GPL(devm_siox_master_register); + static struct siox_device *siox_device_add(struct siox_master *smaster, const char *type, size_t inbytes, size_t outbytes, u8 statustype) diff --git a/drivers/siox/siox.h b/drivers/siox/siox.h index b227e18b697a..513f2c8312f7 100644 --- a/drivers/siox/siox.h +++ b/drivers/siox/siox.h @@ -49,3 +49,5 @@ struct siox_master *devm_siox_master_alloc(struct device *dev, size_t size); int siox_master_register(struct siox_master *smaster); void siox_master_unregister(struct siox_master *smaster); + +int devm_siox_master_register(struct device *dev, struct siox_master *smaster); From db418d5f1ca5b7bafc8eaa9393ea18a7901bb0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 19 Feb 2024 08:46:32 +0100 Subject: [PATCH 133/496] siox: bus-gpio: Simplify using devm_siox_* functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the devm variant of siox_master_allocate() and siox_master_register() the remove callback can be dropped. This also simplifies the error paths in the probe function. 
Acked-by: Thorsten Scherer Link: https://lore.kernel.org/r/e3c598de536deadc7efef9c21ccb49d31eb240a9.1708328466.git.u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- drivers/siox/siox-bus-gpio.c | 64 +++++++++++------------------------- 1 file changed, 20 insertions(+), 44 deletions(-) diff --git a/drivers/siox/siox-bus-gpio.c b/drivers/siox/siox-bus-gpio.c index fdf20fe80059..9e01642e72de 100644 --- a/drivers/siox/siox-bus-gpio.c +++ b/drivers/siox/siox-bus-gpio.c @@ -91,65 +91,42 @@ static int siox_gpio_probe(struct platform_device *pdev) int ret; struct siox_master *smaster; - smaster = siox_master_alloc(&pdev->dev, sizeof(*ddata)); - if (!smaster) { - dev_err(dev, "failed to allocate siox master\n"); - return -ENOMEM; - } + smaster = devm_siox_master_alloc(dev, sizeof(*ddata)); + if (!smaster) + return dev_err_probe(dev, -ENOMEM, + "failed to allocate siox master\n"); platform_set_drvdata(pdev, smaster); ddata = siox_master_get_devdata(smaster); ddata->din = devm_gpiod_get(dev, "din", GPIOD_IN); - if (IS_ERR(ddata->din)) { - ret = dev_err_probe(dev, PTR_ERR(ddata->din), - "Failed to get din GPIO\n"); - goto err; - } + if (IS_ERR(ddata->din)) + return dev_err_probe(dev, PTR_ERR(ddata->din), + "Failed to get din GPIO\n"); ddata->dout = devm_gpiod_get(dev, "dout", GPIOD_OUT_LOW); - if (IS_ERR(ddata->dout)) { - ret = dev_err_probe(dev, PTR_ERR(ddata->dout), - "Failed to get dout GPIO\n"); - goto err; - } + if (IS_ERR(ddata->dout)) + return dev_err_probe(dev, PTR_ERR(ddata->dout), + "Failed to get dout GPIO\n"); ddata->dclk = devm_gpiod_get(dev, "dclk", GPIOD_OUT_LOW); - if (IS_ERR(ddata->dclk)) { - ret = dev_err_probe(dev, PTR_ERR(ddata->dclk), - "Failed to get dclk GPIO\n"); - goto err; - } + if (IS_ERR(ddata->dclk)) + return dev_err_probe(dev, PTR_ERR(ddata->dclk), + "Failed to get dclk GPIO\n"); ddata->dld = devm_gpiod_get(dev, "dld", GPIOD_OUT_LOW); - if (IS_ERR(ddata->dld)) { - ret = dev_err_probe(dev, PTR_ERR(ddata->dld), - "Failed to get dld GPIO\n"); - goto err; - } + if (IS_ERR(ddata->dld)) + return dev_err_probe(dev, PTR_ERR(ddata->dld), + "Failed to get dld GPIO\n"); smaster->pushpull = siox_gpio_pushpull; /* XXX: determine automatically like spi does */ smaster->busno = 0; - ret = siox_master_register(smaster); - if (ret) { - dev_err_probe(dev, ret, - "Failed to register siox master\n"); -err: - siox_master_put(smaster); - } - - return ret; -} - -static int siox_gpio_remove(struct platform_device *pdev) -{ - struct siox_master *master = platform_get_drvdata(pdev); - - siox_master_unregister(master); - - siox_master_put(master); + ret = devm_siox_master_register(dev, smaster); + if (ret) + return dev_err_probe(dev, ret, + "Failed to register siox master\n"); return 0; } @@ -162,7 +139,6 @@ MODULE_DEVICE_TABLE(of, siox_gpio_dt_ids); static struct platform_driver siox_gpio_driver = { .probe = siox_gpio_probe, - .remove = siox_gpio_remove, .driver = { .name = DRIVER_NAME, From 1b9a8e8af0d969ad8f2deece827e691a1b07ba1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:21 +0100 Subject: [PATCH 134/496] dt-bindings: i2c: nomadik: add mobileye,eyeq5-i2c bindings and example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add EyeQ5 bindings to the existing Nomadik I2C dt-bindings. Add the EyeQ5-specific property behind a conditional. Add an example for this compatible. 
Signed-off-by: Théo Lebrun Reviewed-by: Rob Herring Signed-off-by: Andi Shyti --- .../bindings/i2c/st,nomadik-i2c.yaml | 49 ++++++++++++++++--- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/st,nomadik-i2c.yaml b/Documentation/devicetree/bindings/i2c/st,nomadik-i2c.yaml index 16024415a4a7..44c54b162bb1 100644 --- a/Documentation/devicetree/bindings/i2c/st,nomadik-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/st,nomadik-i2c.yaml @@ -14,9 +14,6 @@ description: The Nomadik I2C host controller began its life in the ST maintainers: - Linus Walleij -allOf: - - $ref: /schemas/i2c/i2c-controller.yaml# - # Need a custom select here or 'arm,primecell' will match on lots of nodes select: properties: @@ -24,21 +21,23 @@ select: contains: enum: - st,nomadik-i2c + - mobileye,eyeq5-i2c required: - compatible properties: compatible: oneOf: - # The variant found in STn8815 - items: - const: st,nomadik-i2c - const: arm,primecell - # The variant found in DB8500 - items: - const: stericsson,db8500-i2c - const: st,nomadik-i2c - const: arm,primecell + - items: + - const: mobileye,eyeq5-i2c + - const: arm,primecell reg: maxItems: 1 @@ -55,7 +54,7 @@ properties: - items: - const: mclk - const: apb_pclk - # Clock name in DB8500 + # Clock name in DB8500 or EyeQ5 - items: - const: i2cclk - const: apb_pclk @@ -70,6 +69,16 @@ properties: minimum: 1 maximum: 400000 + mobileye,olb: + $ref: /schemas/types.yaml#/definitions/phandle-array + items: + - items: + - description: Phandle to OLB system controller node. + - description: Platform-wide controller ID (integer starting from zero). + description: + The phandle pointing to OLB system controller node, with the I2C + controller index. + required: - compatible - reg @@ -79,6 +88,20 @@ required: unevaluatedProperties: false +allOf: + - $ref: /schemas/i2c/i2c-controller.yaml# + - if: + properties: + compatible: + contains: + const: mobileye,eyeq5-i2c + then: + required: + - mobileye,olb + else: + properties: + mobileye,olb: false + examples: - | #include @@ -111,5 +134,19 @@ examples: clocks = <&i2c0clk>, <&pclki2c0>; clock-names = "mclk", "apb_pclk"; }; + - | + #include + i2c@300000 { + compatible = "mobileye,eyeq5-i2c", "arm,primecell"; + reg = <0x300000 0x1000>; + interrupt-parent = <&gic>; + interrupts = ; + clock-frequency = <400000>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&i2c_ser_clk>, <&i2c_clk>; + clock-names = "i2cclk", "apb_pclk"; + mobileye,olb = <&olb 0>; + }; ... From ae9977eefc4a1e6e8fda619f5b8734efb6f11b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:22 +0100 Subject: [PATCH 135/496] i2c: nomadik: rename private struct pointers from dev to priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Disambiguate the usage of dev as a variable name; it is usually best to keep it reserved for struct device pointers. Avoid having multiple names for the same struct pointer (previously: dev, nmk, nmk_i2c). Fix whitespace code style; return indented twice, spacing besides infix operators, align function call arguments to opening parenthesis. Remove useless cast to unused return value from init_hw(). Introduce local dev variable in probe() to alias &adev->dev. 
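As a rule of thumb, the naming scheme this rename converges on looks like the sketch below (foo_probe() and struct foo_priv are made-up placeholders, not code from this driver):

    #include <linux/amba/bus.h>
    #include <linux/device.h>

    /* placeholder for the driver's private state */
    struct foo_priv {
            void __iomem *base;
    };

    /* "dev" stays reserved for the struct device, driver state is "priv" */
    static int foo_probe(struct amba_device *adev, const struct amba_id *id)
    {
            struct device *dev = &adev->dev;
            struct foo_priv *priv;

            priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
            if (!priv)
                    return -ENOMEM;

            amba_set_drvdata(adev, priv);

            return 0;
    }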
Reviewed-by: Linus Walleij Reviewed-by: Andi Shyti Acked-by: Wolfram Sang Signed-off-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 428 +++++++++++++++---------------- 1 file changed, 214 insertions(+), 214 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index b10574d42b7a..cd511c884f99 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -206,12 +206,12 @@ static inline void i2c_clr_bit(void __iomem *reg, u32 mask) /** * flush_i2c_fifo() - This function flushes the I2C FIFO - * @dev: private data of I2C Driver + * @priv: private data of I2C Driver * * This function flushes the I2C Tx and Rx FIFOs. It returns * 0 on successful flushing of FIFO */ -static int flush_i2c_fifo(struct nmk_i2c_dev *dev) +static int flush_i2c_fifo(struct nmk_i2c_dev *priv) { #define LOOP_ATTEMPTS 10 int i; @@ -224,19 +224,19 @@ static int flush_i2c_fifo(struct nmk_i2c_dev *dev) * bits, until then no one must access Tx, Rx FIFO and * should poll on these bits waiting for the completion. */ - writel((I2C_CR_FTX | I2C_CR_FRX), dev->virtbase + I2C_CR); + writel((I2C_CR_FTX | I2C_CR_FRX), priv->virtbase + I2C_CR); for (i = 0; i < LOOP_ATTEMPTS; i++) { - timeout = jiffies + dev->adap.timeout; + timeout = jiffies + priv->adap.timeout; while (!time_after(jiffies, timeout)) { - if ((readl(dev->virtbase + I2C_CR) & + if ((readl(priv->virtbase + I2C_CR) & (I2C_CR_FTX | I2C_CR_FRX)) == 0) - return 0; + return 0; } } - dev_err(&dev->adev->dev, + dev_err(&priv->adev->dev, "flushing operation timed out giving up after %d attempts", LOOP_ATTEMPTS); @@ -245,45 +245,45 @@ static int flush_i2c_fifo(struct nmk_i2c_dev *dev) /** * disable_all_interrupts() - Disable all interrupts of this I2c Bus - * @dev: private data of I2C Driver + * @priv: private data of I2C Driver */ -static void disable_all_interrupts(struct nmk_i2c_dev *dev) +static void disable_all_interrupts(struct nmk_i2c_dev *priv) { u32 mask = IRQ_MASK(0); - writel(mask, dev->virtbase + I2C_IMSCR); + writel(mask, priv->virtbase + I2C_IMSCR); } /** * clear_all_interrupts() - Clear all interrupts of I2C Controller - * @dev: private data of I2C Driver + * @priv: private data of I2C Driver */ -static void clear_all_interrupts(struct nmk_i2c_dev *dev) +static void clear_all_interrupts(struct nmk_i2c_dev *priv) { u32 mask; mask = IRQ_MASK(I2C_CLEAR_ALL_INTS); - writel(mask, dev->virtbase + I2C_ICR); + writel(mask, priv->virtbase + I2C_ICR); } /** * init_hw() - initialize the I2C hardware - * @dev: private data of I2C Driver + * @priv: private data of I2C Driver */ -static int init_hw(struct nmk_i2c_dev *dev) +static int init_hw(struct nmk_i2c_dev *priv) { int stat; - stat = flush_i2c_fifo(dev); + stat = flush_i2c_fifo(priv); if (stat) goto exit; /* disable the controller */ - i2c_clr_bit(dev->virtbase + I2C_CR, I2C_CR_PE); + i2c_clr_bit(priv->virtbase + I2C_CR, I2C_CR_PE); - disable_all_interrupts(dev); + disable_all_interrupts(priv); - clear_all_interrupts(dev); + clear_all_interrupts(priv); - dev->cli.operation = I2C_NO_OPERATION; + priv->cli.operation = I2C_NO_OPERATION; exit: return stat; @@ -294,15 +294,15 @@ exit: /** * load_i2c_mcr_reg() - load the MCR register - * @dev: private data of controller + * @priv: private data of controller * @flags: message flags */ -static u32 load_i2c_mcr_reg(struct nmk_i2c_dev *dev, u16 flags) +static u32 load_i2c_mcr_reg(struct nmk_i2c_dev *priv, u16 flags) { u32 mcr = 0; unsigned short slave_adr_3msb_bits; - mcr |= 
GEN_MASK(dev->cli.slave_adr, I2C_MCR_A7, 1); + mcr |= GEN_MASK(priv->cli.slave_adr, I2C_MCR_A7, 1); if (unlikely(flags & I2C_M_TEN)) { /* 10-bit address transaction */ @@ -313,7 +313,7 @@ static u32 load_i2c_mcr_reg(struct nmk_i2c_dev *dev, u16 flags) * the extension (MSB bits) of the 7 bit address loaded * in A7 */ - slave_adr_3msb_bits = (dev->cli.slave_adr >> 7) & 0x7; + slave_adr_3msb_bits = (priv->cli.slave_adr >> 7) & 0x7; mcr |= GEN_MASK(slave_adr_3msb_bits, I2C_MCR_EA10, 8); } else { @@ -325,40 +325,40 @@ static u32 load_i2c_mcr_reg(struct nmk_i2c_dev *dev, u16 flags) mcr |= GEN_MASK(0, I2C_MCR_SB, 11); /* check the operation, master read/write? */ - if (dev->cli.operation == I2C_WRITE) + if (priv->cli.operation == I2C_WRITE) mcr |= GEN_MASK(I2C_WRITE, I2C_MCR_OP, 0); else mcr |= GEN_MASK(I2C_READ, I2C_MCR_OP, 0); /* stop or repeated start? */ - if (dev->stop) + if (priv->stop) mcr |= GEN_MASK(1, I2C_MCR_STOP, 14); else mcr &= ~(GEN_MASK(1, I2C_MCR_STOP, 14)); - mcr |= GEN_MASK(dev->cli.count, I2C_MCR_LENGTH, 15); + mcr |= GEN_MASK(priv->cli.count, I2C_MCR_LENGTH, 15); return mcr; } /** * setup_i2c_controller() - setup the controller - * @dev: private data of controller + * @priv: private data of controller */ -static void setup_i2c_controller(struct nmk_i2c_dev *dev) +static void setup_i2c_controller(struct nmk_i2c_dev *priv) { u32 brcr1, brcr2; u32 i2c_clk, div; u32 ns; u16 slsu; - writel(0x0, dev->virtbase + I2C_CR); - writel(0x0, dev->virtbase + I2C_HSMCR); - writel(0x0, dev->virtbase + I2C_TFTR); - writel(0x0, dev->virtbase + I2C_RFTR); - writel(0x0, dev->virtbase + I2C_DMAR); + writel(0x0, priv->virtbase + I2C_CR); + writel(0x0, priv->virtbase + I2C_HSMCR); + writel(0x0, priv->virtbase + I2C_TFTR); + writel(0x0, priv->virtbase + I2C_RFTR); + writel(0x0, priv->virtbase + I2C_DMAR); - i2c_clk = clk_get_rate(dev->clk); + i2c_clk = clk_get_rate(priv->clk); /* * set the slsu: @@ -373,7 +373,7 @@ static void setup_i2c_controller(struct nmk_i2c_dev *dev) * slsu = cycles / (1000000000 / f) + 1 */ ns = DIV_ROUND_UP_ULL(1000000000ULL, i2c_clk); - switch (dev->sm) { + switch (priv->sm) { case I2C_FREQ_MODE_FAST: case I2C_FREQ_MODE_FAST_PLUS: slsu = DIV_ROUND_UP(100, ns); /* Fast */ @@ -388,15 +388,15 @@ static void setup_i2c_controller(struct nmk_i2c_dev *dev) } slsu += 1; - dev_dbg(&dev->adev->dev, "calculated SLSU = %04x\n", slsu); - writel(slsu << 16, dev->virtbase + I2C_SCR); + dev_dbg(&priv->adev->dev, "calculated SLSU = %04x\n", slsu); + writel(slsu << 16, priv->virtbase + I2C_SCR); /* * The spec says, in case of std. mode the divider is * 2 whereas it is 3 for fast and fastplus mode of * operation. TODO - high speed support. */ - div = (dev->clk_freq > I2C_MAX_STANDARD_MODE_FREQ) ? 3 : 2; + div = (priv->clk_freq > I2C_MAX_STANDARD_MODE_FREQ) ? 3 : 2; /* * generate the mask for baud rate counters. The controller @@ -406,10 +406,10 @@ static void setup_i2c_controller(struct nmk_i2c_dev *dev) * so set brcr1 to 0. */ brcr1 = 0 << 16; - brcr2 = (i2c_clk/(dev->clk_freq * div)) & 0xffff; + brcr2 = (i2c_clk / (priv->clk_freq * div)) & 0xffff; /* set the baud rate counter register */ - writel((brcr1 | brcr2), dev->virtbase + I2C_BRCR); + writel((brcr1 | brcr2), priv->virtbase + I2C_BRCR); /* * set the speed mode. 
Currently we support @@ -417,125 +417,124 @@ static void setup_i2c_controller(struct nmk_i2c_dev *dev) * TODO - support for fast mode plus (up to 1Mb/s) * and high speed (up to 3.4 Mb/s) */ - if (dev->sm > I2C_FREQ_MODE_FAST) { - dev_err(&dev->adev->dev, + if (priv->sm > I2C_FREQ_MODE_FAST) { + dev_err(&priv->adev->dev, "do not support this mode defaulting to std. mode\n"); brcr2 = i2c_clk / (I2C_MAX_STANDARD_MODE_FREQ * 2) & 0xffff; - writel((brcr1 | brcr2), dev->virtbase + I2C_BRCR); + writel((brcr1 | brcr2), priv->virtbase + I2C_BRCR); writel(I2C_FREQ_MODE_STANDARD << 4, - dev->virtbase + I2C_CR); + priv->virtbase + I2C_CR); } - writel(dev->sm << 4, dev->virtbase + I2C_CR); + writel(priv->sm << 4, priv->virtbase + I2C_CR); /* set the Tx and Rx FIFO threshold */ - writel(dev->tft, dev->virtbase + I2C_TFTR); - writel(dev->rft, dev->virtbase + I2C_RFTR); + writel(priv->tft, priv->virtbase + I2C_TFTR); + writel(priv->rft, priv->virtbase + I2C_RFTR); } /** * read_i2c() - Read from I2C client device - * @dev: private data of I2C Driver + * @priv: private data of I2C Driver * @flags: message flags * * This function reads from i2c client device when controller is in * master mode. There is a completion timeout. If there is no transfer * before timeout error is returned. */ -static int read_i2c(struct nmk_i2c_dev *dev, u16 flags) +static int read_i2c(struct nmk_i2c_dev *priv, u16 flags) { int status = 0; u32 mcr, irq_mask; unsigned long timeout; - mcr = load_i2c_mcr_reg(dev, flags); - writel(mcr, dev->virtbase + I2C_MCR); + mcr = load_i2c_mcr_reg(priv, flags); + writel(mcr, priv->virtbase + I2C_MCR); /* load the current CR value */ - writel(readl(dev->virtbase + I2C_CR) | DEFAULT_I2C_REG_CR, - dev->virtbase + I2C_CR); + writel(readl(priv->virtbase + I2C_CR) | DEFAULT_I2C_REG_CR, + priv->virtbase + I2C_CR); /* enable the controller */ - i2c_set_bit(dev->virtbase + I2C_CR, I2C_CR_PE); + i2c_set_bit(priv->virtbase + I2C_CR, I2C_CR_PE); - init_completion(&dev->xfer_complete); + init_completion(&priv->xfer_complete); /* enable interrupts by setting the mask */ irq_mask = (I2C_IT_RXFNF | I2C_IT_RXFF | I2C_IT_MAL | I2C_IT_BERR); - if (dev->stop || !dev->vendor->has_mtdws) + if (priv->stop || !priv->vendor->has_mtdws) irq_mask |= I2C_IT_MTD; else irq_mask |= I2C_IT_MTDWS; irq_mask = I2C_CLEAR_ALL_INTS & IRQ_MASK(irq_mask); - writel(readl(dev->virtbase + I2C_IMSCR) | irq_mask, - dev->virtbase + I2C_IMSCR); + writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, + priv->virtbase + I2C_IMSCR); timeout = wait_for_completion_timeout( - &dev->xfer_complete, dev->adap.timeout); + &priv->xfer_complete, priv->adap.timeout); if (timeout == 0) { /* Controller timed out */ - dev_err(&dev->adev->dev, "read from slave 0x%x timed out\n", - dev->cli.slave_adr); + dev_err(&priv->adev->dev, "read from slave 0x%x timed out\n", + priv->cli.slave_adr); status = -ETIMEDOUT; } return status; } -static void fill_tx_fifo(struct nmk_i2c_dev *dev, int no_bytes) +static void fill_tx_fifo(struct nmk_i2c_dev *priv, int no_bytes) { int count; for (count = (no_bytes - 2); (count > 0) && - (dev->cli.count != 0); + (priv->cli.count != 0); count--) { /* write to the Tx FIFO */ - writeb(*dev->cli.buffer, - dev->virtbase + I2C_TFR); - dev->cli.buffer++; - dev->cli.count--; - dev->cli.xfer_bytes++; + writeb(*priv->cli.buffer, priv->virtbase + I2C_TFR); + priv->cli.buffer++; + priv->cli.count--; + priv->cli.xfer_bytes++; } } /** * write_i2c() - Write data to I2C client. 
- * @dev: private data of I2C Driver + * @priv: private data of I2C Driver * @flags: message flags * * This function writes data to I2C client */ -static int write_i2c(struct nmk_i2c_dev *dev, u16 flags) +static int write_i2c(struct nmk_i2c_dev *priv, u16 flags) { u32 status = 0; u32 mcr, irq_mask; unsigned long timeout; - mcr = load_i2c_mcr_reg(dev, flags); + mcr = load_i2c_mcr_reg(priv, flags); - writel(mcr, dev->virtbase + I2C_MCR); + writel(mcr, priv->virtbase + I2C_MCR); /* load the current CR value */ - writel(readl(dev->virtbase + I2C_CR) | DEFAULT_I2C_REG_CR, - dev->virtbase + I2C_CR); + writel(readl(priv->virtbase + I2C_CR) | DEFAULT_I2C_REG_CR, + priv->virtbase + I2C_CR); /* enable the controller */ - i2c_set_bit(dev->virtbase + I2C_CR, I2C_CR_PE); + i2c_set_bit(priv->virtbase + I2C_CR, I2C_CR_PE); - init_completion(&dev->xfer_complete); + init_completion(&priv->xfer_complete); /* enable interrupts by settings the masks */ irq_mask = (I2C_IT_TXFOVR | I2C_IT_MAL | I2C_IT_BERR); /* Fill the TX FIFO with transmit data */ - fill_tx_fifo(dev, MAX_I2C_FIFO_THRESHOLD); + fill_tx_fifo(priv, MAX_I2C_FIFO_THRESHOLD); - if (dev->cli.count != 0) + if (priv->cli.count != 0) irq_mask |= I2C_IT_TXFNE; /* @@ -543,23 +542,23 @@ static int write_i2c(struct nmk_i2c_dev *dev, u16 flags) * set the MTDWS bit (Master Transaction Done Without Stop) * to start repeated start operation */ - if (dev->stop || !dev->vendor->has_mtdws) + if (priv->stop || !priv->vendor->has_mtdws) irq_mask |= I2C_IT_MTD; else irq_mask |= I2C_IT_MTDWS; irq_mask = I2C_CLEAR_ALL_INTS & IRQ_MASK(irq_mask); - writel(readl(dev->virtbase + I2C_IMSCR) | irq_mask, - dev->virtbase + I2C_IMSCR); + writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, + priv->virtbase + I2C_IMSCR); timeout = wait_for_completion_timeout( - &dev->xfer_complete, dev->adap.timeout); + &priv->xfer_complete, priv->adap.timeout); if (timeout == 0) { /* Controller timed out */ - dev_err(&dev->adev->dev, "write to slave 0x%x timed out\n", - dev->cli.slave_adr); + dev_err(&priv->adev->dev, "write to slave 0x%x timed out\n", + priv->cli.slave_adr); status = -ETIMEDOUT; } @@ -568,28 +567,28 @@ static int write_i2c(struct nmk_i2c_dev *dev, u16 flags) /** * nmk_i2c_xfer_one() - transmit a single I2C message - * @dev: device with a message encoded into it + * @priv: device with a message encoded into it * @flags: message flags */ -static int nmk_i2c_xfer_one(struct nmk_i2c_dev *dev, u16 flags) +static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags) { int status; if (flags & I2C_M_RD) { /* read operation */ - dev->cli.operation = I2C_READ; - status = read_i2c(dev, flags); + priv->cli.operation = I2C_READ; + status = read_i2c(priv, flags); } else { /* write operation */ - dev->cli.operation = I2C_WRITE; - status = write_i2c(dev, flags); + priv->cli.operation = I2C_WRITE; + status = write_i2c(priv, flags); } - if (status || (dev->result)) { + if (status || priv->result) { u32 i2c_sr; u32 cause; - i2c_sr = readl(dev->virtbase + I2C_SR); + i2c_sr = readl(priv->virtbase + I2C_SR); /* * Check if the controller I2C operation status * is set to ABORT(11b). @@ -597,15 +596,15 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *dev, u16 flags) if (((i2c_sr >> 2) & 0x3) == 0x3) { /* get the abort cause */ cause = (i2c_sr >> 4) & 0x7; - dev_err(&dev->adev->dev, "%s\n", + dev_err(&priv->adev->dev, "%s\n", cause >= ARRAY_SIZE(abort_causes) ? "unknown reason" : abort_causes[cause]); } - (void) init_hw(dev); + init_hw(priv); - status = status ? 
status : dev->result; + status = status ? status : priv->result; } return status; @@ -663,24 +662,24 @@ static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap, { int status = 0; int i; - struct nmk_i2c_dev *dev = i2c_get_adapdata(i2c_adap); + struct nmk_i2c_dev *priv = i2c_get_adapdata(i2c_adap); int j; - pm_runtime_get_sync(&dev->adev->dev); + pm_runtime_get_sync(&priv->adev->dev); /* Attempt three times to send the message queue */ for (j = 0; j < 3; j++) { /* setup the i2c controller */ - setup_i2c_controller(dev); + setup_i2c_controller(priv); for (i = 0; i < num_msgs; i++) { - dev->cli.slave_adr = msgs[i].addr; - dev->cli.buffer = msgs[i].buf; - dev->cli.count = msgs[i].len; - dev->stop = (i < (num_msgs - 1)) ? 0 : 1; - dev->result = 0; + priv->cli.slave_adr = msgs[i].addr; + priv->cli.buffer = msgs[i].buf; + priv->cli.count = msgs[i].len; + priv->stop = (i < (num_msgs - 1)) ? 0 : 1; + priv->result = 0; - status = nmk_i2c_xfer_one(dev, msgs[i].flags); + status = nmk_i2c_xfer_one(priv, msgs[i].flags); if (status != 0) break; } @@ -688,7 +687,7 @@ static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap, break; } - pm_runtime_put_sync(&dev->adev->dev); + pm_runtime_put_sync(&priv->adev->dev); /* return the no. messages processed */ if (status) @@ -699,14 +698,14 @@ static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap, /** * disable_interrupts() - disable the interrupts - * @dev: private data of controller + * @priv: private data of controller * @irq: interrupt number */ -static int disable_interrupts(struct nmk_i2c_dev *dev, u32 irq) +static int disable_interrupts(struct nmk_i2c_dev *priv, u32 irq) { irq = IRQ_MASK(irq); - writel(readl(dev->virtbase + I2C_IMSCR) & ~(I2C_CLEAR_ALL_INTS & irq), - dev->virtbase + I2C_IMSCR); + writel(readl(priv->virtbase + I2C_IMSCR) & ~(I2C_CLEAR_ALL_INTS & irq), + priv->virtbase + I2C_IMSCR); return 0; } @@ -723,17 +722,18 @@ static int disable_interrupts(struct nmk_i2c_dev *dev, u32 irq) */ static irqreturn_t i2c_irq_handler(int irq, void *arg) { - struct nmk_i2c_dev *dev = arg; + struct nmk_i2c_dev *priv = arg; + struct device *dev = &priv->adev->dev; u32 tft, rft; u32 count; u32 misr, src; /* load Tx FIFO and Rx FIFO threshold values */ - tft = readl(dev->virtbase + I2C_TFTR); - rft = readl(dev->virtbase + I2C_RFTR); + tft = readl(priv->virtbase + I2C_TFTR); + rft = readl(priv->virtbase + I2C_RFTR); /* read interrupt status register */ - misr = readl(dev->virtbase + I2C_MISR); + misr = readl(priv->virtbase + I2C_MISR); src = __ffs(misr); switch ((1 << src)) { @@ -741,20 +741,20 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) /* Transmit FIFO nearly empty interrupt */ case I2C_IT_TXFNE: { - if (dev->cli.operation == I2C_READ) { + if (priv->cli.operation == I2C_READ) { /* * in read operation why do we care for writing? 
* so disable the Transmit FIFO interrupt */ - disable_interrupts(dev, I2C_IT_TXFNE); + disable_interrupts(priv, I2C_IT_TXFNE); } else { - fill_tx_fifo(dev, (MAX_I2C_FIFO_THRESHOLD - tft)); + fill_tx_fifo(priv, (MAX_I2C_FIFO_THRESHOLD - tft)); /* * if done, close the transfer by disabling the * corresponding TXFNE interrupt */ - if (dev->cli.count == 0) - disable_interrupts(dev, I2C_IT_TXFNE); + if (priv->cli.count == 0) + disable_interrupts(priv, I2C_IT_TXFNE); } } break; @@ -768,60 +768,59 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) case I2C_IT_RXFNF: for (count = rft; count > 0; count--) { /* Read the Rx FIFO */ - *dev->cli.buffer = readb(dev->virtbase + I2C_RFR); - dev->cli.buffer++; + *priv->cli.buffer = readb(priv->virtbase + I2C_RFR); + priv->cli.buffer++; } - dev->cli.count -= rft; - dev->cli.xfer_bytes += rft; + priv->cli.count -= rft; + priv->cli.xfer_bytes += rft; break; /* Rx FIFO full */ case I2C_IT_RXFF: for (count = MAX_I2C_FIFO_THRESHOLD; count > 0; count--) { - *dev->cli.buffer = readb(dev->virtbase + I2C_RFR); - dev->cli.buffer++; + *priv->cli.buffer = readb(priv->virtbase + I2C_RFR); + priv->cli.buffer++; } - dev->cli.count -= MAX_I2C_FIFO_THRESHOLD; - dev->cli.xfer_bytes += MAX_I2C_FIFO_THRESHOLD; + priv->cli.count -= MAX_I2C_FIFO_THRESHOLD; + priv->cli.xfer_bytes += MAX_I2C_FIFO_THRESHOLD; break; /* Master Transaction Done with/without stop */ case I2C_IT_MTD: case I2C_IT_MTDWS: - if (dev->cli.operation == I2C_READ) { - while (!(readl(dev->virtbase + I2C_RISR) + if (priv->cli.operation == I2C_READ) { + while (!(readl(priv->virtbase + I2C_RISR) & I2C_IT_RXFE)) { - if (dev->cli.count == 0) + if (priv->cli.count == 0) break; - *dev->cli.buffer = - readb(dev->virtbase + I2C_RFR); - dev->cli.buffer++; - dev->cli.count--; - dev->cli.xfer_bytes++; + *priv->cli.buffer = + readb(priv->virtbase + I2C_RFR); + priv->cli.buffer++; + priv->cli.count--; + priv->cli.xfer_bytes++; } } - disable_all_interrupts(dev); - clear_all_interrupts(dev); + disable_all_interrupts(priv); + clear_all_interrupts(priv); - if (dev->cli.count) { - dev->result = -EIO; - dev_err(&dev->adev->dev, - "%lu bytes still remain to be xfered\n", - dev->cli.count); - (void) init_hw(dev); + if (priv->cli.count) { + priv->result = -EIO; + dev_err(dev, "%lu bytes still remain to be xfered\n", + priv->cli.count); + init_hw(priv); } - complete(&dev->xfer_complete); + complete(&priv->xfer_complete); break; /* Master Arbitration lost interrupt */ case I2C_IT_MAL: - dev->result = -EIO; - (void) init_hw(dev); + priv->result = -EIO; + init_hw(priv); - i2c_set_bit(dev->virtbase + I2C_ICR, I2C_IT_MAL); - complete(&dev->xfer_complete); + i2c_set_bit(priv->virtbase + I2C_ICR, I2C_IT_MAL); + complete(&priv->xfer_complete); break; @@ -831,13 +830,13 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) * during the transaction. */ case I2C_IT_BERR: - dev->result = -EIO; + priv->result = -EIO; /* get the status */ - if (((readl(dev->virtbase + I2C_SR) >> 2) & 0x3) == I2C_ABORT) - (void) init_hw(dev); + if (((readl(priv->virtbase + I2C_SR) >> 2) & 0x3) == I2C_ABORT) + init_hw(priv); - i2c_set_bit(dev->virtbase + I2C_ICR, I2C_IT_BERR); - complete(&dev->xfer_complete); + i2c_set_bit(priv->virtbase + I2C_ICR, I2C_IT_BERR); + complete(&priv->xfer_complete); break; @@ -847,11 +846,11 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) * the Tx FIFO is full. 
*/ case I2C_IT_TXFOVR: - dev->result = -EIO; - (void) init_hw(dev); + priv->result = -EIO; + init_hw(priv); - dev_err(&dev->adev->dev, "Tx Fifo Over run\n"); - complete(&dev->xfer_complete); + dev_err(dev, "Tx Fifo Over run\n"); + complete(&priv->xfer_complete); break; @@ -863,10 +862,10 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) case I2C_IT_RFSE: case I2C_IT_WTSR: case I2C_IT_STD: - dev_err(&dev->adev->dev, "unhandled Interrupt\n"); + dev_err(dev, "unhandled Interrupt\n"); break; default: - dev_err(&dev->adev->dev, "spurious Interrupt..\n"); + dev_err(dev, "spurious Interrupt..\n"); break; } @@ -893,9 +892,9 @@ static int nmk_i2c_resume_early(struct device *dev) static int nmk_i2c_runtime_suspend(struct device *dev) { struct amba_device *adev = to_amba_device(dev); - struct nmk_i2c_dev *nmk_i2c = amba_get_drvdata(adev); + struct nmk_i2c_dev *priv = amba_get_drvdata(adev); - clk_disable_unprepare(nmk_i2c->clk); + clk_disable_unprepare(priv->clk); pinctrl_pm_select_idle_state(dev); return 0; } @@ -903,10 +902,10 @@ static int nmk_i2c_runtime_suspend(struct device *dev) static int nmk_i2c_runtime_resume(struct device *dev) { struct amba_device *adev = to_amba_device(dev); - struct nmk_i2c_dev *nmk_i2c = amba_get_drvdata(adev); + struct nmk_i2c_dev *priv = amba_get_drvdata(adev); int ret; - ret = clk_prepare_enable(nmk_i2c->clk); + ret = clk_prepare_enable(priv->clk); if (ret) { dev_err(dev, "can't prepare_enable clock\n"); return ret; @@ -914,9 +913,9 @@ static int nmk_i2c_runtime_resume(struct device *dev) pinctrl_pm_select_default_state(dev); - ret = init_hw(nmk_i2c); + ret = init_hw(priv); if (ret) { - clk_disable_unprepare(nmk_i2c->clk); + clk_disable_unprepare(priv->clk); pinctrl_pm_select_idle_state(dev); } @@ -939,107 +938,108 @@ static const struct i2c_algorithm nmk_i2c_algo = { }; static void nmk_i2c_of_probe(struct device_node *np, - struct nmk_i2c_dev *nmk) + struct nmk_i2c_dev *priv) { /* Default to 100 kHz if no frequency is given in the node */ - if (of_property_read_u32(np, "clock-frequency", &nmk->clk_freq)) - nmk->clk_freq = I2C_MAX_STANDARD_MODE_FREQ; + if (of_property_read_u32(np, "clock-frequency", &priv->clk_freq)) + priv->clk_freq = I2C_MAX_STANDARD_MODE_FREQ; /* This driver only supports 'standard' and 'fast' modes of operation. 
*/ - if (nmk->clk_freq <= I2C_MAX_STANDARD_MODE_FREQ) - nmk->sm = I2C_FREQ_MODE_STANDARD; + if (priv->clk_freq <= I2C_MAX_STANDARD_MODE_FREQ) + priv->sm = I2C_FREQ_MODE_STANDARD; else - nmk->sm = I2C_FREQ_MODE_FAST; - nmk->tft = 1; /* Tx FIFO threshold */ - nmk->rft = 8; /* Rx FIFO threshold */ - nmk->timeout = 200; /* Slave response timeout(ms) */ + priv->sm = I2C_FREQ_MODE_FAST; + priv->tft = 1; /* Tx FIFO threshold */ + priv->rft = 8; /* Rx FIFO threshold */ + priv->timeout = 200; /* Slave response timeout(ms) */ } static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) { int ret = 0; + struct nmk_i2c_dev *priv; struct device_node *np = adev->dev.of_node; - struct nmk_i2c_dev *dev; + struct device *dev = &adev->dev; struct i2c_adapter *adap; struct i2c_vendor_data *vendor = id->data; u32 max_fifo_threshold = (vendor->fifodepth / 2) - 1; - dev = devm_kzalloc(&adev->dev, sizeof(*dev), GFP_KERNEL); - if (!dev) + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) return -ENOMEM; - dev->vendor = vendor; - dev->adev = adev; - nmk_i2c_of_probe(np, dev); + priv->vendor = vendor; + priv->adev = adev; + nmk_i2c_of_probe(np, priv); - if (dev->tft > max_fifo_threshold) { - dev_warn(&adev->dev, "requested TX FIFO threshold %u, adjusted down to %u\n", - dev->tft, max_fifo_threshold); - dev->tft = max_fifo_threshold; + if (priv->tft > max_fifo_threshold) { + dev_warn(dev, "requested TX FIFO threshold %u, adjusted down to %u\n", + priv->tft, max_fifo_threshold); + priv->tft = max_fifo_threshold; } - if (dev->rft > max_fifo_threshold) { - dev_warn(&adev->dev, "requested RX FIFO threshold %u, adjusted down to %u\n", - dev->rft, max_fifo_threshold); - dev->rft = max_fifo_threshold; + if (priv->rft > max_fifo_threshold) { + dev_warn(dev, "requested RX FIFO threshold %u, adjusted down to %u\n", + priv->rft, max_fifo_threshold); + priv->rft = max_fifo_threshold; } - amba_set_drvdata(adev, dev); + amba_set_drvdata(adev, priv); - dev->virtbase = devm_ioremap(&adev->dev, adev->res.start, - resource_size(&adev->res)); - if (!dev->virtbase) + priv->virtbase = devm_ioremap(dev, adev->res.start, + resource_size(&adev->res)); + if (!priv->virtbase) return -ENOMEM; - dev->irq = adev->irq[0]; - ret = devm_request_irq(&adev->dev, dev->irq, i2c_irq_handler, 0, - DRIVER_NAME, dev); + priv->irq = adev->irq[0]; + ret = devm_request_irq(dev, priv->irq, i2c_irq_handler, 0, + DRIVER_NAME, priv); if (ret) - return dev_err_probe(&adev->dev, ret, - "cannot claim the irq %d\n", dev->irq); + return dev_err_probe(dev, ret, + "cannot claim the irq %d\n", priv->irq); - dev->clk = devm_clk_get_enabled(&adev->dev, NULL); - if (IS_ERR(dev->clk)) - return dev_err_probe(&adev->dev, PTR_ERR(dev->clk), + priv->clk = devm_clk_get_enabled(dev, NULL); + if (IS_ERR(priv->clk)) + return dev_err_probe(dev, PTR_ERR(priv->clk), "could enable i2c clock\n"); - init_hw(dev); + init_hw(priv); - adap = &dev->adap; + adap = &priv->adap; adap->dev.of_node = np; - adap->dev.parent = &adev->dev; + adap->dev.parent = dev; adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; adap->algo = &nmk_i2c_algo; - adap->timeout = msecs_to_jiffies(dev->timeout); + adap->timeout = msecs_to_jiffies(priv->timeout); snprintf(adap->name, sizeof(adap->name), "Nomadik I2C at %pR", &adev->res); - i2c_set_adapdata(adap, dev); + i2c_set_adapdata(adap, priv); - dev_info(&adev->dev, + dev_info(dev, "initialize %s on virtual base %p\n", - adap->name, dev->virtbase); + adap->name, priv->virtbase); ret = i2c_add_adapter(adap); if (ret) 
return ret; - pm_runtime_put(&adev->dev); + pm_runtime_put(dev); return 0; } static void nmk_i2c_remove(struct amba_device *adev) { - struct nmk_i2c_dev *dev = amba_get_drvdata(adev); + struct nmk_i2c_dev *priv = amba_get_drvdata(adev); - i2c_del_adapter(&dev->adap); - flush_i2c_fifo(dev); - disable_all_interrupts(dev); - clear_all_interrupts(dev); + i2c_del_adapter(&priv->adap); + flush_i2c_fifo(priv); + disable_all_interrupts(priv); + clear_all_interrupts(priv); /* disable the controller */ - i2c_clr_bit(dev->virtbase + I2C_CR, I2C_CR_PE); + i2c_clr_bit(priv->virtbase + I2C_CR, I2C_CR_PE); } static struct i2c_vendor_data vendor_stn8815 = { From b8a77b9a5f9c2ba313f2beef8440b6f9f69768e7 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 29 Feb 2024 03:47:24 +0000 Subject: [PATCH 136/496] mtd: ubi: fix NVMEM over UBI volumes on 32-bit systems A compiler warning related to sizeof(int) != 8 when calling do_div() is triggered when building on 32-bit platforms. Address this by using integer types having a well-defined size. Fixes: 3ce485803da1 ("mtd: ubi: provide NVMEM layer over UBI volumes") Signed-off-by: Daniel Golle Reviewed-by: Zhihao Cheng Tested-by: Randy Dunlap Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/nvmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/ubi/nvmem.c b/drivers/mtd/ubi/nvmem.c index b7a93c495d17..8aeb9c428e51 100644 --- a/drivers/mtd/ubi/nvmem.c +++ b/drivers/mtd/ubi/nvmem.c @@ -23,9 +23,12 @@ struct ubi_nvmem { static int ubi_nvmem_reg_read(void *priv, unsigned int from, void *val, size_t bytes) { - int err = 0, lnum = from, offs, bytes_left = bytes, to_read; + size_t to_read, bytes_left = bytes; struct ubi_nvmem *unv = priv; struct ubi_volume_desc *desc; + uint32_t offs; + uint64_t lnum = from; + int err = 0; desc = ubi_open_volume(unv->ubi_num, unv->vol_id, UBI_READONLY); if (IS_ERR(desc)) From f31e0d0c2cad23e0cc48731634f85bb2d8707790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 10 Mar 2024 15:38:51 +0100 Subject: [PATCH 137/496] ASoC: tlv320adc3xxx: Don't strip remove function when driver is builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using __exit for the remove function results in the remove callback being discarded with SND_SOC_TLV320ADC3XXX=y. When such a device gets unbound (e.g. using sysfs or hotplug), the driver is just removed without the cleanup being performed. This results in resource leaks. Fix it by compiling in the remove callback unconditionally. This also fixes a W=1 modpost warning: WARNING: modpost: sound/soc/codecs/snd-soc-tlv320adc3xxx: section mismatch in reference: adc3xxx_i2c_driver+0x10 (section: .data) -> adc3xxx_i2c_remove (section: .exit.text) (which only happens with SND_SOC_TLV320ADC3XXX=m). 
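The mechanics behind the leak, roughly (a paraphrase of __exit_p() from include/linux/init.h, quoted from memory rather than verbatim):

    /*
     * For built-in code (MODULE not defined), functions marked __exit end up
     * in .exit.text, which is discarded, and __exit_p() hides the reference
     * so the driver struct still links:
     */
    #ifdef MODULE
    #define __exit_p(x) x
    #else
    #define __exit_p(x) NULL
    #endif

    /*
     * With SND_SOC_TLV320ADC3XXX=y the old ".remove = __exit_p(adc3xxx_i2c_remove)"
     * therefore resolved to ".remove = NULL", so unbinding the device skipped the
     * cleanup; keeping the function out of __exit keeps it callable in both builds.
     */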
Fixes: e9a3b57efd28 ("ASoC: codec: tlv320adc3xxx: New codec driver") Signed-off-by: Uwe Kleine-König Reviewed-by: Geert Uytterhoeven Link: https://msgid.link/r/20240310143852.397212-2-u.kleine-koenig@pengutronix.de Signed-off-by: Mark Brown --- sound/soc/codecs/tlv320adc3xxx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/codecs/tlv320adc3xxx.c b/sound/soc/codecs/tlv320adc3xxx.c index 420bbf588efe..e100cc9f5c19 100644 --- a/sound/soc/codecs/tlv320adc3xxx.c +++ b/sound/soc/codecs/tlv320adc3xxx.c @@ -1429,7 +1429,7 @@ err_unprepare_mclk: return ret; } -static void __exit adc3xxx_i2c_remove(struct i2c_client *client) +static void adc3xxx_i2c_remove(struct i2c_client *client) { struct adc3xxx *adc3xxx = i2c_get_clientdata(client); @@ -1452,7 +1452,7 @@ static struct i2c_driver adc3xxx_i2c_driver = { .of_match_table = tlv320adc3xxx_of_match, }, .probe = adc3xxx_i2c_probe, - .remove = __exit_p(adc3xxx_i2c_remove), + .remove = adc3xxx_i2c_remove, .id_table = adc3xxx_i2c_id, }; From e8aff71ca93026209dd0eab9b285e6808cd87d05 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:46 +0800 Subject: [PATCH 138/496] objtool/LoongArch: Enable objtool to be built Add the minimal changes to enable objtool build on LoongArch, most of the functions are stubs to only fix the build errors when make -C tools/objtool. This is similar with commit e52ec98c5ab1 ("objtool/powerpc: Enable objtool to be built on ppc"). Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/arch/loongarch/Build | 2 + tools/objtool/arch/loongarch/decode.c | 71 +++++++++++++++++++ .../arch/loongarch/include/arch/cfi_regs.h | 22 ++++++ .../objtool/arch/loongarch/include/arch/elf.h | 30 ++++++++ .../arch/loongarch/include/arch/special.h | 33 +++++++++ tools/objtool/arch/loongarch/special.c | 15 ++++ 6 files changed, 173 insertions(+) create mode 100644 tools/objtool/arch/loongarch/Build create mode 100644 tools/objtool/arch/loongarch/decode.c create mode 100644 tools/objtool/arch/loongarch/include/arch/cfi_regs.h create mode 100644 tools/objtool/arch/loongarch/include/arch/elf.h create mode 100644 tools/objtool/arch/loongarch/include/arch/special.h create mode 100644 tools/objtool/arch/loongarch/special.c diff --git a/tools/objtool/arch/loongarch/Build b/tools/objtool/arch/loongarch/Build new file mode 100644 index 000000000000..d24d5636a5b8 --- /dev/null +++ b/tools/objtool/arch/loongarch/Build @@ -0,0 +1,2 @@ +objtool-y += decode.o +objtool-y += special.o diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c new file mode 100644 index 000000000000..cc74ba4e0f54 --- /dev/null +++ b/tools/objtool/arch/loongarch/decode.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include + +int arch_ftrace_match(char *name) +{ + return !strcmp(name, "_mcount"); +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + (insn->immediate << 2); +} + +unsigned long arch_dest_reloc_offset(int addend) +{ + return addend; +} + +bool arch_pc_relative_reloc(struct reloc *reloc) +{ + return false; +} + +bool arch_callee_saved_reg(unsigned char reg) +{ + switch (reg) { + case CFI_RA: + case CFI_FP: + case CFI_S0 ... 
CFI_S8: + return true; + default: + return false; + } +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + return 0; +} + +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, + unsigned long offset, unsigned int maxlen, + struct instruction *insn) +{ + return 0; +} + +const char *arch_nop_insn(int len) +{ + return NULL; +} + +const char *arch_ret_insn(int len) +{ + return NULL; +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; +} diff --git a/tools/objtool/arch/loongarch/include/arch/cfi_regs.h b/tools/objtool/arch/loongarch/include/arch/cfi_regs.h new file mode 100644 index 000000000000..d183cc8f43bf --- /dev/null +++ b/tools/objtool/arch/loongarch/include/arch/cfi_regs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ARCH_CFI_REGS_H +#define _OBJTOOL_ARCH_CFI_REGS_H + +#define CFI_RA 1 +#define CFI_SP 3 +#define CFI_A0 4 +#define CFI_FP 22 +#define CFI_S0 23 +#define CFI_S1 24 +#define CFI_S2 25 +#define CFI_S3 26 +#define CFI_S4 27 +#define CFI_S5 28 +#define CFI_S6 29 +#define CFI_S7 30 +#define CFI_S8 31 +#define CFI_NUM_REGS 32 + +#define CFI_BP CFI_FP + +#endif /* _OBJTOOL_ARCH_CFI_REGS_H */ diff --git a/tools/objtool/arch/loongarch/include/arch/elf.h b/tools/objtool/arch/loongarch/include/arch/elf.h new file mode 100644 index 000000000000..9623d663220e --- /dev/null +++ b/tools/objtool/arch/loongarch/include/arch/elf.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ARCH_ELF_H +#define _OBJTOOL_ARCH_ELF_H + +/* + * See the following link for more info about ELF Relocation types: + * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html#_relocations + */ +#ifndef R_LARCH_NONE +#define R_LARCH_NONE 0 +#endif +#ifndef R_LARCH_32 +#define R_LARCH_32 1 +#endif +#ifndef R_LARCH_64 +#define R_LARCH_64 2 +#endif +#ifndef R_LARCH_32_PCREL +#define R_LARCH_32_PCREL 99 +#endif + +#define R_NONE R_LARCH_NONE +#define R_ABS32 R_LARCH_32 +#define R_ABS64 R_LARCH_64 +#define R_DATA32 R_LARCH_32_PCREL +#define R_DATA64 R_LARCH_32_PCREL +#define R_TEXT32 R_LARCH_32_PCREL +#define R_TEXT64 R_LARCH_32_PCREL + +#endif /* _OBJTOOL_ARCH_ELF_H */ diff --git a/tools/objtool/arch/loongarch/include/arch/special.h b/tools/objtool/arch/loongarch/include/arch/special.h new file mode 100644 index 000000000000..35fc979b550a --- /dev/null +++ b/tools/objtool/arch/loongarch/include/arch/special.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ARCH_SPECIAL_H +#define _OBJTOOL_ARCH_SPECIAL_H + +/* + * See more info about struct exception_table_entry + * in arch/loongarch/include/asm/extable.h + */ +#define EX_ENTRY_SIZE 12 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +/* + * See more info about struct jump_entry + * in include/linux/jump_label.h + */ +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 +#define JUMP_KEY_OFFSET 8 + +/* + * See more info about struct alt_instr + * in arch/loongarch/include/asm/alternative.h + */ +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _OBJTOOL_ARCH_SPECIAL_H */ diff --git 
a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c new file mode 100644 index 000000000000..9bba1e9318e0 --- /dev/null +++ b/tools/objtool/arch/loongarch/special.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + return false; +} + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + return NULL; +} From b2d23158e6c881326321c2351b92568be4e57030 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 139/496] objtool/LoongArch: Implement instruction decoder Only copy the minimal definitions of instruction opcodes and formats in inst.h from arch/loongarch to tools/arch/loongarch, and also copy the definition of sign_extend64() to tools/include/linux/bitops.h to decode the following kinds of instructions: (1) stack pointer related instructions addi.d, ld.d, st.d, ldptr.d and stptr.d (2) branch and jump related instructions beq, bne, blt, bge, bltu, bgeu, beqz, bnez, bceqz, bcnez, b, bl and jirl (3) other instructions break, nop and ertn See more info about instructions in LoongArch Reference Manual: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/arch/loongarch/include/asm/inst.h | 161 ++++++++++++++ tools/include/linux/bitops.h | 11 + tools/objtool/arch/loongarch/decode.c | 273 +++++++++++++++++++++++- 3 files changed, 443 insertions(+), 2 deletions(-) create mode 100644 tools/arch/loongarch/include/asm/inst.h diff --git a/tools/arch/loongarch/include/asm/inst.h b/tools/arch/loongarch/include/asm/inst.h new file mode 100644 index 000000000000..c25b5853181d --- /dev/null +++ b/tools/arch/loongarch/include/asm/inst.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020-2022 Loongson Technology Corporation Limited + */ +#ifndef _ASM_INST_H +#define _ASM_INST_H + +#include + +#define LOONGARCH_INSN_NOP 0x03400000 + +enum reg0i15_op { + break_op = 0x54, +}; + +enum reg0i26_op { + b_op = 0x14, + bl_op = 0x15, +}; + +enum reg1i21_op { + beqz_op = 0x10, + bnez_op = 0x11, + bceqz_op = 0x12, /* bits[9:8] = 0x00 */ + bcnez_op = 0x12, /* bits[9:8] = 0x01 */ +}; + +enum reg2_op { + ertn_op = 0x1920e, +}; + +enum reg2i12_op { + addid_op = 0x0b, + andi_op = 0x0d, + ldd_op = 0xa3, + std_op = 0xa7, +}; + +enum reg2i14_op { + ldptrd_op = 0x26, + stptrd_op = 0x27, +}; + +enum reg2i16_op { + jirl_op = 0x13, + beq_op = 0x16, + bne_op = 0x17, + blt_op = 0x18, + bge_op = 0x19, + bltu_op = 0x1a, + bgeu_op = 0x1b, +}; + +struct reg0i15_format { + unsigned int immediate : 15; + unsigned int opcode : 17; +}; + +struct reg0i26_format { + unsigned int immediate_h : 10; + unsigned int immediate_l : 16; + unsigned int opcode : 6; +}; + +struct reg1i21_format { + unsigned int immediate_h : 5; + unsigned int rj : 5; + unsigned int immediate_l : 16; + unsigned int opcode : 6; +}; + +struct reg2_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int opcode : 22; +}; + +struct reg2i12_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 12; + unsigned int opcode : 10; +}; + +struct reg2i14_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 14; + unsigned int 
opcode : 8; +}; + +struct reg2i16_format { + unsigned int rd : 5; + unsigned int rj : 5; + unsigned int immediate : 16; + unsigned int opcode : 6; +}; + +union loongarch_instruction { + unsigned int word; + struct reg0i15_format reg0i15_format; + struct reg0i26_format reg0i26_format; + struct reg1i21_format reg1i21_format; + struct reg2_format reg2_format; + struct reg2i12_format reg2i12_format; + struct reg2i14_format reg2i14_format; + struct reg2i16_format reg2i16_format; +}; + +#define LOONGARCH_INSN_SIZE sizeof(union loongarch_instruction) + +enum loongarch_gpr { + LOONGARCH_GPR_ZERO = 0, + LOONGARCH_GPR_RA = 1, + LOONGARCH_GPR_TP = 2, + LOONGARCH_GPR_SP = 3, + LOONGARCH_GPR_A0 = 4, /* Reused as V0 for return value */ + LOONGARCH_GPR_A1, /* Reused as V1 for return value */ + LOONGARCH_GPR_A2, + LOONGARCH_GPR_A3, + LOONGARCH_GPR_A4, + LOONGARCH_GPR_A5, + LOONGARCH_GPR_A6, + LOONGARCH_GPR_A7, + LOONGARCH_GPR_T0 = 12, + LOONGARCH_GPR_T1, + LOONGARCH_GPR_T2, + LOONGARCH_GPR_T3, + LOONGARCH_GPR_T4, + LOONGARCH_GPR_T5, + LOONGARCH_GPR_T6, + LOONGARCH_GPR_T7, + LOONGARCH_GPR_T8, + LOONGARCH_GPR_FP = 22, + LOONGARCH_GPR_S0 = 23, + LOONGARCH_GPR_S1, + LOONGARCH_GPR_S2, + LOONGARCH_GPR_S3, + LOONGARCH_GPR_S4, + LOONGARCH_GPR_S5, + LOONGARCH_GPR_S6, + LOONGARCH_GPR_S7, + LOONGARCH_GPR_S8, + LOONGARCH_GPR_MAX +}; + +#define DEF_EMIT_REG2I16_FORMAT(NAME, OP) \ +static inline void emit_##NAME(union loongarch_instruction *insn, \ + enum loongarch_gpr rj, \ + enum loongarch_gpr rd, \ + int offset) \ +{ \ + insn->reg2i16_format.opcode = OP; \ + insn->reg2i16_format.immediate = offset; \ + insn->reg2i16_format.rj = rj; \ + insn->reg2i16_format.rd = rd; \ +} + +DEF_EMIT_REG2I16_FORMAT(jirl, jirl_op) + +#endif /* _ASM_INST_H */ diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index f18683b95ea6..7319f6ced108 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -87,4 +87,15 @@ static inline __u32 rol32(__u32 word, unsigned int shift) return (word << shift) | (word >> ((-shift) & 31)); } +/** + * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit + * @value: value to sign extend + * @index: 0 based bit index (0<=index<64) to sign bit + */ +static __always_inline __s64 sign_extend64(__u64 value, int index) +{ + __u8 shift = 63 - index; + return (__s64)(value << shift) >> shift; +} + #endif diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index cc74ba4e0f54..ff0b53144d12 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -1,6 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include #include +#include +#include + +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif int arch_ftrace_match(char *name) { @@ -39,21 +45,284 @@ int arch_decode_hint_reg(u8 sp_reg, int *base) return 0; } +static bool is_loongarch(const struct elf *elf) +{ + if (elf->ehdr.e_machine == EM_LOONGARCH) + return true; + + WARN("unexpected ELF machine type %d", elf->ehdr.e_machine); + return false; +} + +#define ADD_OP(op) \ + if (!(op = calloc(1, sizeof(*op)))) \ + return -1; \ + else for (*ops_list = op, ops_list = &op->next; op; op = NULL) + +static bool decode_insn_reg0i26_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg0i26_format.opcode) { + case b_op: + insn->type = INSN_JUMP_UNCONDITIONAL; + insn->immediate = sign_extend64(inst.reg0i26_format.immediate_h << 16 | + inst.reg0i26_format.immediate_l, 25); + break; + case bl_op: 
+ insn->type = INSN_CALL; + insn->immediate = sign_extend64(inst.reg0i26_format.immediate_h << 16 | + inst.reg0i26_format.immediate_l, 25); + break; + default: + return false; + } + + return true; +} + +static bool decode_insn_reg1i21_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg1i21_format.opcode) { + case beqz_op: + case bnez_op: + case bceqz_op: + insn->type = INSN_JUMP_CONDITIONAL; + insn->immediate = sign_extend64(inst.reg1i21_format.immediate_h << 16 | + inst.reg1i21_format.immediate_l, 20); + break; + default: + return false; + } + + return true; +} + +static bool decode_insn_reg2i12_fomat(union loongarch_instruction inst, + struct instruction *insn, + struct stack_op **ops_list, + struct stack_op *op) +{ + switch (inst.reg2i12_format.opcode) { + case addid_op: + if ((inst.reg2i12_format.rd == CFI_SP) || (inst.reg2i12_format.rj == CFI_SP)) { + /* addi.d sp,sp,si12 or addi.d fp,sp,si12 */ + insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11); + ADD_OP(op) { + op->src.type = OP_SRC_ADD; + op->src.reg = inst.reg2i12_format.rj; + op->src.offset = insn->immediate; + op->dest.type = OP_DEST_REG; + op->dest.reg = inst.reg2i12_format.rd; + } + } + break; + case ldd_op: + if (inst.reg2i12_format.rj == CFI_SP) { + /* ld.d rd,sp,si12 */ + insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11); + ADD_OP(op) { + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = CFI_SP; + op->src.offset = insn->immediate; + op->dest.type = OP_DEST_REG; + op->dest.reg = inst.reg2i12_format.rd; + } + } + break; + case std_op: + if (inst.reg2i12_format.rj == CFI_SP) { + /* st.d rd,sp,si12 */ + insn->immediate = sign_extend64(inst.reg2i12_format.immediate, 11); + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = inst.reg2i12_format.rd; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = CFI_SP; + op->dest.offset = insn->immediate; + } + } + break; + case andi_op: + if (inst.reg2i12_format.rd == 0 && + inst.reg2i12_format.rj == 0 && + inst.reg2i12_format.immediate == 0) + /* andi r0,r0,0 */ + insn->type = INSN_NOP; + break; + default: + return false; + } + + return true; +} + +static bool decode_insn_reg2i14_fomat(union loongarch_instruction inst, + struct instruction *insn, + struct stack_op **ops_list, + struct stack_op *op) +{ + switch (inst.reg2i14_format.opcode) { + case ldptrd_op: + if (inst.reg2i14_format.rj == CFI_SP) { + /* ldptr.d rd,sp,si14 */ + insn->immediate = sign_extend64(inst.reg2i14_format.immediate, 13); + ADD_OP(op) { + op->src.type = OP_SRC_REG_INDIRECT; + op->src.reg = CFI_SP; + op->src.offset = insn->immediate; + op->dest.type = OP_DEST_REG; + op->dest.reg = inst.reg2i14_format.rd; + } + } + break; + case stptrd_op: + if (inst.reg2i14_format.rj == CFI_SP) { + /* stptr.d ra,sp,0 */ + if (inst.reg2i14_format.rd == LOONGARCH_GPR_RA && + inst.reg2i14_format.immediate == 0) + break; + + /* stptr.d rd,sp,si14 */ + insn->immediate = sign_extend64(inst.reg2i14_format.immediate, 13); + ADD_OP(op) { + op->src.type = OP_SRC_REG; + op->src.reg = inst.reg2i14_format.rd; + op->dest.type = OP_DEST_REG_INDIRECT; + op->dest.reg = CFI_SP; + op->dest.offset = insn->immediate; + } + } + break; + default: + return false; + } + + return true; +} + +static bool decode_insn_reg2i16_fomat(union loongarch_instruction inst, + struct instruction *insn) +{ + switch (inst.reg2i16_format.opcode) { + case jirl_op: + if (inst.reg2i16_format.rd == 0 && + inst.reg2i16_format.rj == CFI_RA && + inst.reg2i16_format.immediate == 0) { + /* jirl 
r0,ra,0 */ + insn->type = INSN_RETURN; + } else if (inst.reg2i16_format.rd == CFI_RA) { + /* jirl ra,rj,offs16 */ + insn->type = INSN_CALL_DYNAMIC; + } else if (inst.reg2i16_format.rd == CFI_A0 && + inst.reg2i16_format.immediate == 0) { + /* + * jirl a0,t0,0 + * this is a special case in loongarch_suspend_enter, + * just treat it as a call instruction. + */ + insn->type = INSN_CALL_DYNAMIC; + } else if (inst.reg2i16_format.rd == 0 && + inst.reg2i16_format.immediate == 0) { + /* jirl r0,rj,0 */ + insn->type = INSN_JUMP_DYNAMIC; + } else if (inst.reg2i16_format.rd == 0 && + inst.reg2i16_format.immediate != 0) { + /* + * jirl r0,t0,12 + * this is a rare case in JUMP_VIRT_ADDR, + * just ignore it due to it is harmless for tracing. + */ + break; + } else { + /* jirl rd,rj,offs16 */ + insn->type = INSN_JUMP_UNCONDITIONAL; + insn->immediate = sign_extend64(inst.reg2i16_format.immediate, 15); + } + break; + case beq_op: + case bne_op: + case blt_op: + case bge_op: + case bltu_op: + case bgeu_op: + insn->type = INSN_JUMP_CONDITIONAL; + insn->immediate = sign_extend64(inst.reg2i16_format.immediate, 15); + break; + default: + return false; + } + + return true; +} + int arch_decode_instruction(struct objtool_file *file, const struct section *sec, unsigned long offset, unsigned int maxlen, struct instruction *insn) { + struct stack_op **ops_list = &insn->stack_ops; + const struct elf *elf = file->elf; + struct stack_op *op = NULL; + union loongarch_instruction inst; + + if (!is_loongarch(elf)) + return -1; + + if (maxlen < LOONGARCH_INSN_SIZE) + return 0; + + insn->len = LOONGARCH_INSN_SIZE; + insn->type = INSN_OTHER; + insn->immediate = 0; + + inst = *(union loongarch_instruction *)(sec->data->d_buf + offset); + + if (decode_insn_reg0i26_fomat(inst, insn)) + return 0; + if (decode_insn_reg1i21_fomat(inst, insn)) + return 0; + if (decode_insn_reg2i12_fomat(inst, insn, ops_list, op)) + return 0; + if (decode_insn_reg2i14_fomat(inst, insn, ops_list, op)) + return 0; + if (decode_insn_reg2i16_fomat(inst, insn)) + return 0; + + if (inst.word == 0) + insn->type = INSN_NOP; + else if (inst.reg0i15_format.opcode == break_op) { + /* break */ + insn->type = INSN_BUG; + } else if (inst.reg2_format.opcode == ertn_op) { + /* ertn */ + insn->type = INSN_RETURN; + } + return 0; } const char *arch_nop_insn(int len) { - return NULL; + static u32 nop; + + if (len != LOONGARCH_INSN_SIZE) + WARN("invalid NOP size: %d\n", len); + + nop = LOONGARCH_INSN_NOP; + + return (const char *)&nop; } const char *arch_ret_insn(int len) { - return NULL; + static u32 ret; + + if (len != LOONGARCH_INSN_SIZE) + WARN("invalid RET size: %d\n", len); + + emit_jirl((union loongarch_instruction *)&ret, LOONGARCH_GPR_RA, LOONGARCH_GPR_ZERO, 0); + + return (const char *)&ret; } void arch_initial_func_cfi_state(struct cfi_init_state *state) From b8e85e6f3a09fc56b0ff574887798962ef8a8f80 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 140/496] objtool/x86: Separate arch-specific and generic parts Move init_orc_entry(), write_orc_entry(), reg_name(), orc_type_name() and print_reg() from generic orc_gen.c and orc_dump.c to arch-specific orc.c, then introduce a new function orc_print_dump() to print info. This is preparation for later patch, no functionality change. 
Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/arch/x86/Build | 1 + tools/objtool/arch/x86/orc.c | 188 ++++++++++++++++++++++++++++ tools/objtool/include/objtool/orc.h | 14 +++ tools/objtool/orc_dump.c | 69 +--------- tools/objtool/orc_gen.c | 113 +---------------- 5 files changed, 206 insertions(+), 179 deletions(-) create mode 100644 tools/objtool/arch/x86/orc.c create mode 100644 tools/objtool/include/objtool/orc.h diff --git a/tools/objtool/arch/x86/Build b/tools/objtool/arch/x86/Build index 9f7869b5c5e0..3dedb2fd8f3a 100644 --- a/tools/objtool/arch/x86/Build +++ b/tools/objtool/arch/x86/Build @@ -1,5 +1,6 @@ objtool-y += special.o objtool-y += decode.o +objtool-y += orc.o inat_tables_script = ../arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = ../arch/x86/lib/x86-opcode-map.txt diff --git a/tools/objtool/arch/x86/orc.c b/tools/objtool/arch/x86/orc.c new file mode 100644 index 000000000000..b6cd943e87f9 --- /dev/null +++ b/tools/objtool/arch/x86/orc.c @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include + +#include +#include +#include +#include + +int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) +{ + struct cfi_reg *bp = &cfi->regs[CFI_BP]; + + memset(orc, 0, sizeof(*orc)); + + if (!cfi) { + /* + * This is usually either unreachable nops/traps (which don't + * trigger unreachable instruction warnings), or + * STACK_FRAME_NON_STANDARD functions. + */ + orc->type = ORC_TYPE_UNDEFINED; + return 0; + } + + switch (cfi->type) { + case UNWIND_HINT_TYPE_UNDEFINED: + orc->type = ORC_TYPE_UNDEFINED; + return 0; + case UNWIND_HINT_TYPE_END_OF_STACK: + orc->type = ORC_TYPE_END_OF_STACK; + return 0; + case UNWIND_HINT_TYPE_CALL: + orc->type = ORC_TYPE_CALL; + break; + case UNWIND_HINT_TYPE_REGS: + orc->type = ORC_TYPE_REGS; + break; + case UNWIND_HINT_TYPE_REGS_PARTIAL: + orc->type = ORC_TYPE_REGS_PARTIAL; + break; + default: + WARN_INSN(insn, "unknown unwind hint type %d", cfi->type); + return -1; + } + + orc->signal = cfi->signal; + + switch (cfi->cfa.base) { + case CFI_SP: + orc->sp_reg = ORC_REG_SP; + break; + case CFI_SP_INDIRECT: + orc->sp_reg = ORC_REG_SP_INDIRECT; + break; + case CFI_BP: + orc->sp_reg = ORC_REG_BP; + break; + case CFI_BP_INDIRECT: + orc->sp_reg = ORC_REG_BP_INDIRECT; + break; + case CFI_R10: + orc->sp_reg = ORC_REG_R10; + break; + case CFI_R13: + orc->sp_reg = ORC_REG_R13; + break; + case CFI_DI: + orc->sp_reg = ORC_REG_DI; + break; + case CFI_DX: + orc->sp_reg = ORC_REG_DX; + break; + default: + WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base); + return -1; + } + + switch (bp->base) { + case CFI_UNDEFINED: + orc->bp_reg = ORC_REG_UNDEFINED; + break; + case CFI_CFA: + orc->bp_reg = ORC_REG_PREV_SP; + break; + case CFI_BP: + orc->bp_reg = ORC_REG_BP; + break; + default: + WARN_INSN(insn, "unknown BP base reg %d", bp->base); + return -1; + } + + orc->sp_offset = cfi->cfa.offset; + orc->bp_offset = bp->offset; + + return 0; +} + +int write_orc_entry(struct elf *elf, struct section *orc_sec, + struct section *ip_sec, unsigned int idx, + struct section *insn_sec, unsigned long insn_off, + struct orc_entry *o) +{ + struct orc_entry *orc; + + /* populate ORC data */ + orc = (struct orc_entry *)orc_sec->data->d_buf + idx; + memcpy(orc, o, sizeof(*orc)); + orc->sp_offset = bswap_if_needed(elf, orc->sp_offset); + orc->bp_offset = bswap_if_needed(elf, 
orc->bp_offset); + + /* populate reloc for ip */ + if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx, + insn_sec, insn_off)) + return -1; + + return 0; +} + +static const char *reg_name(unsigned int reg) +{ + switch (reg) { + case ORC_REG_PREV_SP: + return "prevsp"; + case ORC_REG_DX: + return "dx"; + case ORC_REG_DI: + return "di"; + case ORC_REG_BP: + return "bp"; + case ORC_REG_SP: + return "sp"; + case ORC_REG_R10: + return "r10"; + case ORC_REG_R13: + return "r13"; + case ORC_REG_BP_INDIRECT: + return "bp(ind)"; + case ORC_REG_SP_INDIRECT: + return "sp(ind)"; + default: + return "?"; + } +} + +static const char *orc_type_name(unsigned int type) +{ + switch (type) { + case ORC_TYPE_UNDEFINED: + return "(und)"; + case ORC_TYPE_END_OF_STACK: + return "end"; + case ORC_TYPE_CALL: + return "call"; + case ORC_TYPE_REGS: + return "regs"; + case ORC_TYPE_REGS_PARTIAL: + return "regs (partial)"; + default: + return "?"; + } +} + +static void print_reg(unsigned int reg, int offset) +{ + if (reg == ORC_REG_BP_INDIRECT) + printf("(bp%+d)", offset); + else if (reg == ORC_REG_SP_INDIRECT) + printf("(sp)%+d", offset); + else if (reg == ORC_REG_UNDEFINED) + printf("(und)"); + else + printf("%s%+d", reg_name(reg), offset); +} + +void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i) +{ + printf("type:%s", orc_type_name(orc[i].type)); + + printf(" sp:"); + print_reg(orc[i].sp_reg, bswap_if_needed(dummy_elf, orc[i].sp_offset)); + + printf(" bp:"); + print_reg(orc[i].bp_reg, bswap_if_needed(dummy_elf, orc[i].bp_offset)); + + printf(" signal:%d\n", orc[i].signal); +} diff --git a/tools/objtool/include/objtool/orc.h b/tools/objtool/include/objtool/orc.h new file mode 100644 index 000000000000..15a32def1071 --- /dev/null +++ b/tools/objtool/include/objtool/orc.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _OBJTOOL_ORC_H +#define _OBJTOOL_ORC_H + +#include + +int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn); +void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i); +int write_orc_entry(struct elf *elf, struct section *orc_sec, + struct section *ip_sec, unsigned int idx, + struct section *insn_sec, unsigned long insn_off, + struct orc_entry *o); + +#endif /* _OBJTOOL_ORC_H */ diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index 0e183bb1c720..a62247efb64f 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -6,65 +6,10 @@ #include #include #include +#include #include #include -static const char *reg_name(unsigned int reg) -{ - switch (reg) { - case ORC_REG_PREV_SP: - return "prevsp"; - case ORC_REG_DX: - return "dx"; - case ORC_REG_DI: - return "di"; - case ORC_REG_BP: - return "bp"; - case ORC_REG_SP: - return "sp"; - case ORC_REG_R10: - return "r10"; - case ORC_REG_R13: - return "r13"; - case ORC_REG_BP_INDIRECT: - return "bp(ind)"; - case ORC_REG_SP_INDIRECT: - return "sp(ind)"; - default: - return "?"; - } -} - -static const char *orc_type_name(unsigned int type) -{ - switch (type) { - case ORC_TYPE_UNDEFINED: - return "(und)"; - case ORC_TYPE_END_OF_STACK: - return "end"; - case ORC_TYPE_CALL: - return "call"; - case ORC_TYPE_REGS: - return "regs"; - case ORC_TYPE_REGS_PARTIAL: - return "regs (partial)"; - default: - return "?"; - } -} - -static void print_reg(unsigned int reg, int offset) -{ - if (reg == ORC_REG_BP_INDIRECT) - printf("(bp%+d)", offset); - else if (reg == ORC_REG_SP_INDIRECT) - printf("(sp)%+d", offset); - else if (reg == 
ORC_REG_UNDEFINED) - printf("(und)"); - else - printf("%s%+d", reg_name(reg), offset); -} - int orc_dump(const char *_objname) { int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; @@ -205,17 +150,7 @@ int orc_dump(const char *_objname) printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i])); } - printf("type:%s", orc_type_name(orc[i].type)); - - printf(" sp:"); - - print_reg(orc[i].sp_reg, bswap_if_needed(&dummy_elf, orc[i].sp_offset)); - - printf(" bp:"); - - print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset)); - - printf(" signal:%d\n", orc[i].signal); + orc_print_dump(&dummy_elf, orc, i); } elf_end(elf); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index bae343908867..922e6aac7cea 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -10,121 +10,10 @@ #include #include +#include #include #include -static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, - struct instruction *insn) -{ - struct cfi_reg *bp = &cfi->regs[CFI_BP]; - - memset(orc, 0, sizeof(*orc)); - - if (!cfi) { - /* - * This is usually either unreachable nops/traps (which don't - * trigger unreachable instruction warnings), or - * STACK_FRAME_NON_STANDARD functions. - */ - orc->type = ORC_TYPE_UNDEFINED; - return 0; - } - - switch (cfi->type) { - case UNWIND_HINT_TYPE_UNDEFINED: - orc->type = ORC_TYPE_UNDEFINED; - return 0; - case UNWIND_HINT_TYPE_END_OF_STACK: - orc->type = ORC_TYPE_END_OF_STACK; - return 0; - case UNWIND_HINT_TYPE_CALL: - orc->type = ORC_TYPE_CALL; - break; - case UNWIND_HINT_TYPE_REGS: - orc->type = ORC_TYPE_REGS; - break; - case UNWIND_HINT_TYPE_REGS_PARTIAL: - orc->type = ORC_TYPE_REGS_PARTIAL; - break; - default: - WARN_INSN(insn, "unknown unwind hint type %d", cfi->type); - return -1; - } - - orc->signal = cfi->signal; - - switch (cfi->cfa.base) { - case CFI_SP: - orc->sp_reg = ORC_REG_SP; - break; - case CFI_SP_INDIRECT: - orc->sp_reg = ORC_REG_SP_INDIRECT; - break; - case CFI_BP: - orc->sp_reg = ORC_REG_BP; - break; - case CFI_BP_INDIRECT: - orc->sp_reg = ORC_REG_BP_INDIRECT; - break; - case CFI_R10: - orc->sp_reg = ORC_REG_R10; - break; - case CFI_R13: - orc->sp_reg = ORC_REG_R13; - break; - case CFI_DI: - orc->sp_reg = ORC_REG_DI; - break; - case CFI_DX: - orc->sp_reg = ORC_REG_DX; - break; - default: - WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base); - return -1; - } - - switch (bp->base) { - case CFI_UNDEFINED: - orc->bp_reg = ORC_REG_UNDEFINED; - break; - case CFI_CFA: - orc->bp_reg = ORC_REG_PREV_SP; - break; - case CFI_BP: - orc->bp_reg = ORC_REG_BP; - break; - default: - WARN_INSN(insn, "unknown BP base reg %d", bp->base); - return -1; - } - - orc->sp_offset = cfi->cfa.offset; - orc->bp_offset = bp->offset; - - return 0; -} - -static int write_orc_entry(struct elf *elf, struct section *orc_sec, - struct section *ip_sec, unsigned int idx, - struct section *insn_sec, unsigned long insn_off, - struct orc_entry *o) -{ - struct orc_entry *orc; - - /* populate ORC data */ - orc = (struct orc_entry *)orc_sec->data->d_buf + idx; - memcpy(orc, o, sizeof(*orc)); - orc->sp_offset = bswap_if_needed(elf, orc->sp_offset); - orc->bp_offset = bswap_if_needed(elf, orc->bp_offset); - - /* populate reloc for ip */ - if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx, - insn_sec, insn_off)) - return -1; - - return 0; -} - struct orc_list_entry { struct list_head list; struct orc_entry orc; From 3c7266cd7bc5e7843b631fea73cb0e82111e3158 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 
11 Mar 2024 22:23:47 +0800 Subject: [PATCH 141/496] objtool/LoongArch: Enable orc to be built Implement arch-specific init_orc_entry(), write_orc_entry(), reg_name(), orc_type_name(), print_reg() and orc_print_dump(), then set BUILD_ORC as y to build the orc related files. Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/arch/loongarch/include/asm/orc_types.h | 58 +++++++ tools/objtool/Makefile | 4 + tools/objtool/arch/loongarch/Build | 1 + tools/objtool/arch/loongarch/decode.c | 16 ++ tools/objtool/arch/loongarch/orc.c | 171 +++++++++++++++++++ 5 files changed, 250 insertions(+) create mode 100644 tools/arch/loongarch/include/asm/orc_types.h create mode 100644 tools/objtool/arch/loongarch/orc.c diff --git a/tools/arch/loongarch/include/asm/orc_types.h b/tools/arch/loongarch/include/asm/orc_types.h new file mode 100644 index 000000000000..caf1f71a1057 --- /dev/null +++ b/tools/arch/loongarch/include/asm/orc_types.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and FP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous FP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_SP 2 +#define ORC_REG_FP 3 +#define ORC_REG_MAX 4 + +#define ORC_TYPE_UNDEFINED 0 +#define ORC_TYPE_END_OF_STACK 1 +#define ORC_TYPE_CALL 2 +#define ORC_TYPE_REGS 3 +#define ORC_TYPE_REGS_PARTIAL 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and FP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. 
+ */ +struct orc_entry { + s16 sp_offset; + s16 fp_offset; + s16 ra_offset; + unsigned int sp_reg:4; + unsigned int fp_reg:4; + unsigned int ra_reg:4; + unsigned int type:3; + unsigned int signal:1; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 83b100c1e7f6..bf7f7f84ac62 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -57,6 +57,10 @@ ifeq ($(SRCARCH),x86) BUILD_ORC := y endif +ifeq ($(SRCARCH),loongarch) + BUILD_ORC := y +endif + export BUILD_ORC export srctree OUTPUT CFLAGS SRCARCH AWK include $(srctree)/tools/build/Makefile.include diff --git a/tools/objtool/arch/loongarch/Build b/tools/objtool/arch/loongarch/Build index d24d5636a5b8..1d4b784b6887 100644 --- a/tools/objtool/arch/loongarch/Build +++ b/tools/objtool/arch/loongarch/Build @@ -1,2 +1,3 @@ objtool-y += decode.o objtool-y += special.o +objtool-y += orc.o diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index ff0b53144d12..aee479d2191c 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #ifndef EM_LOONGARCH #define EM_LOONGARCH 258 @@ -42,6 +44,20 @@ bool arch_callee_saved_reg(unsigned char reg) int arch_decode_hint_reg(u8 sp_reg, int *base) { + switch (sp_reg) { + case ORC_REG_UNDEFINED: + *base = CFI_UNDEFINED; + break; + case ORC_REG_SP: + *base = CFI_SP; + break; + case ORC_REG_FP: + *base = CFI_FP; + break; + default: + return -1; + } + return 0; } diff --git a/tools/objtool/arch/loongarch/orc.c b/tools/objtool/arch/loongarch/orc.c new file mode 100644 index 000000000000..873536d009d9 --- /dev/null +++ b/tools/objtool/arch/loongarch/orc.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include +#include + +#include +#include +#include +#include + +int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi, struct instruction *insn) +{ + struct cfi_reg *fp = &cfi->regs[CFI_FP]; + struct cfi_reg *ra = &cfi->regs[CFI_RA]; + + memset(orc, 0, sizeof(*orc)); + + if (!cfi) { + /* + * This is usually either unreachable nops/traps (which don't + * trigger unreachable instruction warnings), or + * STACK_FRAME_NON_STANDARD functions. 
+ */ + orc->type = ORC_TYPE_UNDEFINED; + return 0; + } + + switch (cfi->type) { + case UNWIND_HINT_TYPE_UNDEFINED: + orc->type = ORC_TYPE_UNDEFINED; + return 0; + case UNWIND_HINT_TYPE_END_OF_STACK: + orc->type = ORC_TYPE_END_OF_STACK; + return 0; + case UNWIND_HINT_TYPE_CALL: + orc->type = ORC_TYPE_CALL; + break; + case UNWIND_HINT_TYPE_REGS: + orc->type = ORC_TYPE_REGS; + break; + case UNWIND_HINT_TYPE_REGS_PARTIAL: + orc->type = ORC_TYPE_REGS_PARTIAL; + break; + default: + WARN_INSN(insn, "unknown unwind hint type %d", cfi->type); + return -1; + } + + orc->signal = cfi->signal; + + switch (cfi->cfa.base) { + case CFI_SP: + orc->sp_reg = ORC_REG_SP; + break; + case CFI_FP: + orc->sp_reg = ORC_REG_FP; + break; + default: + WARN_INSN(insn, "unknown CFA base reg %d", cfi->cfa.base); + return -1; + } + + switch (fp->base) { + case CFI_UNDEFINED: + orc->fp_reg = ORC_REG_UNDEFINED; + orc->fp_offset = 0; + break; + case CFI_CFA: + orc->fp_reg = ORC_REG_PREV_SP; + orc->fp_offset = fp->offset; + break; + case CFI_FP: + orc->fp_reg = ORC_REG_FP; + break; + default: + WARN_INSN(insn, "unknown FP base reg %d", fp->base); + return -1; + } + + switch (ra->base) { + case CFI_UNDEFINED: + orc->ra_reg = ORC_REG_UNDEFINED; + orc->ra_offset = 0; + break; + case CFI_CFA: + orc->ra_reg = ORC_REG_PREV_SP; + orc->ra_offset = ra->offset; + break; + case CFI_FP: + orc->ra_reg = ORC_REG_FP; + break; + default: + WARN_INSN(insn, "unknown RA base reg %d", ra->base); + return -1; + } + + orc->sp_offset = cfi->cfa.offset; + + return 0; +} + +int write_orc_entry(struct elf *elf, struct section *orc_sec, + struct section *ip_sec, unsigned int idx, + struct section *insn_sec, unsigned long insn_off, + struct orc_entry *o) +{ + struct orc_entry *orc; + + /* populate ORC data */ + orc = (struct orc_entry *)orc_sec->data->d_buf + idx; + memcpy(orc, o, sizeof(*orc)); + + /* populate reloc for ip */ + if (!elf_init_reloc_text_sym(elf, ip_sec, idx * sizeof(int), idx, + insn_sec, insn_off)) + return -1; + + return 0; +} + +static const char *reg_name(unsigned int reg) +{ + switch (reg) { + case ORC_REG_SP: + return "sp"; + case ORC_REG_FP: + return "fp"; + case ORC_REG_PREV_SP: + return "prevsp"; + default: + return "?"; + } +} + +static const char *orc_type_name(unsigned int type) +{ + switch (type) { + case UNWIND_HINT_TYPE_CALL: + return "call"; + case UNWIND_HINT_TYPE_REGS: + return "regs"; + case UNWIND_HINT_TYPE_REGS_PARTIAL: + return "regs (partial)"; + default: + return "?"; + } +} + +static void print_reg(unsigned int reg, int offset) +{ + if (reg == ORC_REG_UNDEFINED) + printf(" (und) "); + else + printf("%s + %3d", reg_name(reg), offset); + +} + +void orc_print_dump(struct elf *dummy_elf, struct orc_entry *orc, int i) +{ + printf("type:%s", orc_type_name(orc[i].type)); + + printf(" sp:"); + print_reg(orc[i].sp_reg, orc[i].sp_offset); + + printf(" fp:"); + print_reg(orc[i].fp_reg, orc[i].fp_offset); + + printf(" ra:"); + print_reg(orc[i].ra_reg, orc[i].ra_offset); + + printf(" signal:%d\n", orc[i].signal); +} From d5ab2bc36c6b0ce2f3409f934ff9cdf6d6768fa2 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 142/496] objtool: Check local label in add_dead_ends() When update the latest upstream gcc and binutils, it generates more objtool warnings on LoongArch, like this: init/main.o: warning: objtool: unexpected relocation symbol type in .rela.discard.unreachable We can see that the reloc sym name is local label instead of section in relocation section 
'.rela.discard.unreachable', in this case, the reloc sym type is STT_NOTYPE instead of STT_SECTION. As suggested by Peter Zijlstra, we add a "local_label" member in struct symbol, then set it as true if symbol type is STT_NOTYPE and symbol name starts with ".L" string in classify_symbols(). Let's check reloc->sym->local_label to not return -1 in add_dead_ends(), and also use reloc->sym->offset instead of reloc addend which is 0 to find the corresponding instruction. At the same time, let's replace the variable "addend" with "offset" to reflect the reality. Here are some detailed info: [fedora@linux 6.8.test]$ gcc --version gcc (GCC) 14.0.1 20240129 (experimental) [fedora@linux 6.8.test]$ as --version GNU assembler (GNU Binutils) 2.42.50.20240129 [fedora@linux 6.8.test]$ readelf -r init/main.o | grep -A 2 "rela.discard.unreachable" Relocation section '.rela.discard.unreachable' at offset 0x6028 contains 1 entry: Offset Info Type Sym. Value Sym. Name + Addend 000000000000 00d900000063 R_LARCH_32_PCREL 00000000000002c4 .L500^B1 + 0 Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/check.c | 40 +++++++++++++++++------------ tools/objtool/include/objtool/elf.h | 1 + 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 548ec3cd7c00..d0e82c729dd9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -20,6 +20,7 @@ #include #include #include +#include struct alternative { struct alternative *next; @@ -584,7 +585,7 @@ static int add_dead_ends(struct objtool_file *file) struct section *rsec; struct reloc *reloc; struct instruction *insn; - s64 addend; + unsigned long offset; /* * Check for manually annotated dead ends. @@ -594,27 +595,28 @@ static int add_dead_ends(struct objtool_file *file) goto reachable; for_each_reloc(rsec, reloc) { - - if (reloc->sym->type != STT_SECTION) { + if (reloc->sym->type == STT_SECTION) { + offset = reloc_addend(reloc); + } else if (reloc->sym->local_label) { + offset = reloc->sym->offset; + } else { WARN("unexpected relocation symbol type in %s", rsec->name); return -1; } - addend = reloc_addend(reloc); - - insn = find_insn(file, reloc->sym->sec, addend); + insn = find_insn(file, reloc->sym->sec, offset); if (insn) insn = prev_insn_same_sec(file, insn); - else if (addend == reloc->sym->sec->sh.sh_size) { + else if (offset == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, addend); + reloc->sym->sec->name, offset); return -1; } } else { WARN("can't find unreachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, addend); + reloc->sym->sec->name, offset); return -1; } @@ -633,27 +635,28 @@ reachable: return 0; for_each_reloc(rsec, reloc) { - - if (reloc->sym->type != STT_SECTION) { + if (reloc->sym->type == STT_SECTION) { + offset = reloc_addend(reloc); + } else if (reloc->sym->local_label) { + offset = reloc->sym->offset; + } else { WARN("unexpected relocation symbol type in %s", rsec->name); return -1; } - addend = reloc_addend(reloc); - - insn = find_insn(file, reloc->sym->sec, addend); + insn = find_insn(file, reloc->sym->sec, offset); if (insn) insn = prev_insn_same_sec(file, insn); - else if (addend == reloc->sym->sec->sh.sh_size) { + else if (offset == reloc->sym->sec->sh.sh_size) { insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, addend); + reloc->sym->sec->name, 
offset); return -1; } } else { WARN("can't find reachable insn at %s+0x%" PRIx64, - reloc->sym->sec->name, addend); + reloc->sym->sec->name, offset); return -1; } @@ -2522,6 +2525,9 @@ static int classify_symbols(struct objtool_file *file) struct symbol *func; for_each_sym(file, func) { + if (func->type == STT_NOTYPE && strstarts(func->name, ".L")) + func->local_label = true; + if (func->bind != STB_GLOBAL) continue; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 9f71e988eca4..2b8a69de4db8 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -67,6 +67,7 @@ struct symbol { u8 profiling_func : 1; u8 warned : 1; u8 embedded_insn : 1; + u8 local_label : 1; struct list_head pv_target; struct reloc *relocs; }; From e91c5e4c21b0339376ee124cda5c9b27d41f2cbc Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 143/496] objtool: Check local label in read_unwind_hints() When update the latest upstream gcc and binutils, it generates some objtool warnings on LoongArch, like this: arch/loongarch/kernel/entry.o: warning: objtool: ret_from_fork+0x0: unreachable instruction We can see that the reloc sym name is local label instead of section in relocation section '.rela.discard.unwind_hints', in this case, the reloc sym type is STT_NOTYPE instead of STT_SECTION. Let us check it to not return -1, then use reloc->sym->offset instead of reloc addend which is 0 to find the corresponding instruction. Here are some detailed info: [fedora@linux 6.8.test]$ gcc --version gcc (GCC) 14.0.1 20240129 (experimental) [fedora@linux 6.8.test]$ as --version GNU assembler (GNU Binutils) 2.42.50.20240129 [fedora@linux 6.8.test]$ readelf -r arch/loongarch/kernel/entry.o | grep -A 3 "rela.discard.unwind_hints" Relocation section '.rela.discard.unwind_hints' at offset 0x3a8 contains 7 entries: Offset Info Type Sym. Value Sym. Name + Addend 000000000000 000a00000063 R_LARCH_32_PCREL 0000000000000000 .Lhere_1 + 0 00000000000c 000b00000063 R_LARCH_32_PCREL 00000000000000a8 .Lhere_50 + 0 Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- tools/objtool/check.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d0e82c729dd9..1009d144f756 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2227,6 +2227,7 @@ static int read_unwind_hints(struct objtool_file *file) struct unwind_hint *hint; struct instruction *insn; struct reloc *reloc; + unsigned long offset; int i; sec = find_section_by_name(file->elf, ".discard.unwind_hints"); @@ -2254,7 +2255,16 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); + if (reloc->sym->type == STT_SECTION) { + offset = reloc_addend(reloc); + } else if (reloc->sym->local_label) { + offset = reloc->sym->offset; + } else { + WARN("unexpected relocation symbol type in %s", sec->rsec->name); + return -1; + } + + insn = find_insn(file, reloc->sym->sec, offset); if (!insn) { WARN("can't find insn for unwind_hints[%d]", i); return -1; From cb8a2ef0848ca80d67d6d56e2df757cfdf6b3355 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 144/496] LoongArch: Add ORC stack unwinder support The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is similar in concept to a DWARF unwinder. 
The difference is that the format of the ORC data is much simpler than DWARF, which in turn allows the ORC unwinder to be much simpler and faster. The ORC data consists of unwind tables which are generated by objtool. After analyzing all the code paths of a .o file, it determines information about the stack state at each instruction address in the file and outputs that information to the .orc_unwind and .orc_unwind_ip sections. The per-object ORC sections are combined at link time and are sorted and post-processed at boot time. The unwinder uses the resulting data to correlate instruction addresses with their stack states at run time. Most of the logic is similar to x86; in order to get the ra info before ra is saved into the stack, add ra_reg and ra_offset into orc_entry. At the same time, modify some arch-specific code to silence the objtool warnings. Co-developed-by: Jinyang He Signed-off-by: Jinyang He Co-developed-by: Youling Tang Signed-off-by: Youling Tang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 2 + arch/loongarch/Kconfig.debug | 11 + arch/loongarch/Makefile | 23 +- arch/loongarch/include/asm/Kbuild | 2 + arch/loongarch/include/asm/bug.h | 1 + arch/loongarch/include/asm/exception.h | 2 + arch/loongarch/include/asm/module.h | 7 + arch/loongarch/include/asm/orc_header.h | 18 + arch/loongarch/include/asm/orc_lookup.h | 31 ++ arch/loongarch/include/asm/orc_types.h | 58 +++ arch/loongarch/include/asm/stackframe.h | 3 + arch/loongarch/include/asm/unwind.h | 20 +- arch/loongarch/include/asm/unwind_hints.h | 28 ++ arch/loongarch/kernel/Makefile | 4 + arch/loongarch/kernel/entry.S | 5 + arch/loongarch/kernel/fpu.S | 7 + arch/loongarch/kernel/genex.S | 6 + arch/loongarch/kernel/lbt.S | 3 + arch/loongarch/kernel/mcount_dyn.S | 6 + arch/loongarch/kernel/module.c | 22 +- arch/loongarch/kernel/relocate_kernel.S | 7 +- arch/loongarch/kernel/rethook_trampoline.S | 1 + arch/loongarch/kernel/setup.c | 2 + arch/loongarch/kernel/stacktrace.c | 1 + arch/loongarch/kernel/traps.c | 42 +- arch/loongarch/kernel/unwind_orc.c | 528 +++++++++++++++++++++ arch/loongarch/kernel/vmlinux.lds.S | 3 + arch/loongarch/kvm/switch.S | 9 +- arch/loongarch/lib/clear_user.S | 3 + arch/loongarch/lib/copy_user.S | 3 + arch/loongarch/lib/memcpy.S | 3 + arch/loongarch/lib/memset.S | 3 + arch/loongarch/mm/tlb.c | 27 +- arch/loongarch/mm/tlbex.S | 9 + arch/loongarch/vdso/Makefile | 1 + include/linux/compiler.h | 9 + scripts/Makefile | 7 +- 37 files changed, 875 insertions(+), 42 deletions(-) create mode 100644 arch/loongarch/include/asm/orc_header.h create mode 100644 arch/loongarch/include/asm/orc_lookup.h create mode 100644 arch/loongarch/include/asm/orc_types.h create mode 100644 arch/loongarch/include/asm/unwind_hints.h create mode 100644 arch/loongarch/kernel/unwind_orc.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 929f68926b34..8d6725115ac6 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -136,6 +136,7 @@ config LOONGARCH select HAVE_KVM select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI + select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS select HAVE_PCI select HAVE_PERF_EVENTS select HAVE_PERF_REGS @@ -148,6 +149,7 @@ config LOONGARCH select HAVE_SAMPLE_FTRACE_DIRECT select HAVE_SAMPLE_FTRACE_DIRECT_MULTI select HAVE_SETUP_PER_CPU_AREA if NUMA + select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_TIF_NOHZ diff --git a/arch/loongarch/Kconfig.debug b/arch/loongarch/Kconfig.debug index 
8d36aab53008..98d60630c3d4 100644 --- a/arch/loongarch/Kconfig.debug +++ b/arch/loongarch/Kconfig.debug @@ -26,4 +26,15 @@ config UNWINDER_PROLOGUE Some of the addresses it reports may be incorrect (but better than the Guess unwinder). +config UNWINDER_ORC + bool "ORC unwinder" + select OBJTOOL + help + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + endchoice diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 983aa2b1629a..e3bc02fb7fdc 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -26,6 +26,18 @@ endif 32bit-emul = elf32loongarch 64bit-emul = elf64loongarch +ifdef CONFIG_UNWINDER_ORC +orc_hash_h := arch/$(SRCARCH)/include/generated/asm/orc_hash.h +orc_hash_sh := $(srctree)/scripts/orc_hash.sh +targets += $(orc_hash_h) +quiet_cmd_orc_hash = GEN $@ + cmd_orc_hash = mkdir -p $(dir $@); \ + $(CONFIG_SHELL) $(orc_hash_sh) < $< > $@ +$(orc_hash_h): $(srctree)/arch/loongarch/include/asm/orc_types.h $(orc_hash_sh) FORCE + $(call if_changed,orc_hash) +archprepare: $(orc_hash_h) +endif + ifdef CONFIG_DYNAMIC_FTRACE KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY CC_FLAGS_FTRACE := -fpatchable-function-entry=2 @@ -72,8 +84,6 @@ KBUILD_CFLAGS_KERNEL += $(call cc-option,-mdirect-extern-access) KBUILD_CFLAGS_KERNEL += $(call cc-option,-fdirect-access-external-data) KBUILD_AFLAGS_MODULE += $(call cc-option,-fno-direct-access-external-data) KBUILD_CFLAGS_MODULE += $(call cc-option,-fno-direct-access-external-data) -KBUILD_AFLAGS_MODULE += $(call cc-option,-mno-relax) $(call cc-option,-Wa$(comma)-mno-relax) -KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) $(call cc-option,-Wa$(comma)-mno-relax) else cflags-y += $(call cc-option,-mno-explicit-relocs) KBUILD_AFLAGS_KERNEL += -Wa,-mla-global-with-pcrel @@ -82,6 +92,15 @@ KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs endif +KBUILD_AFLAGS += $(call cc-option,-mno-relax) $(call cc-option,-Wa$(comma)-mno-relax) +KBUILD_CFLAGS += $(call cc-option,-mno-relax) $(call cc-option,-Wa$(comma)-mno-relax) +KBUILD_AFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)-mthin-add-sub) +KBUILD_CFLAGS += $(call cc-option,-mthin-add-sub) $(call cc-option,-Wa$(comma)-mthin-add-sub) + +ifdef CONFIG_OBJTOOL +KBUILD_CFLAGS += -fno-jump-tables +endif + KBUILD_RUSTFLAGS_MODULE += -Crelocation-model=pic ifeq ($(CONFIG_RELOCATABLE),y) diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 93783fa24f6e..a97c0edbb866 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +generated-y += orc_hash.h + generic-y += dma-contiguous.h generic-y += mcs_spinlock.h generic-y += parport.h diff --git a/arch/loongarch/include/asm/bug.h b/arch/loongarch/include/asm/bug.h index d4ca3ba25418..08388876ade4 100644 --- a/arch/loongarch/include/asm/bug.h +++ b/arch/loongarch/include/asm/bug.h @@ -44,6 +44,7 @@ do { \ instrumentation_begin(); \ __BUG_FLAGS(BUGFLAG_WARNING|(flags)); \ + annotate_reachable(); \ instrumentation_end(); \ } while (0) diff --git a/arch/loongarch/include/asm/exception.h b/arch/loongarch/include/asm/exception.h index 
af74a3fdcad1..c6d20736fd92 100644 --- a/arch/loongarch/include/asm/exception.h +++ b/arch/loongarch/include/asm/exception.h @@ -6,6 +6,8 @@ #include #include +extern void *exception_table[]; + void show_registers(struct pt_regs *regs); asmlinkage void cache_parity_error(void); diff --git a/arch/loongarch/include/asm/module.h b/arch/loongarch/include/asm/module.h index 2ecd82bb64e1..f33f3fd32ecc 100644 --- a/arch/loongarch/include/asm/module.h +++ b/arch/loongarch/include/asm/module.h @@ -6,6 +6,7 @@ #define _ASM_MODULE_H #include +#include #include #define RELA_STACK_DEPTH 16 @@ -21,6 +22,12 @@ struct mod_arch_specific { struct mod_section plt; struct mod_section plt_idx; +#ifdef CONFIG_UNWINDER_ORC + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif + /* For CONFIG_DYNAMIC_FTRACE */ struct plt_entry *ftrace_trampolines; }; diff --git a/arch/loongarch/include/asm/orc_header.h b/arch/loongarch/include/asm/orc_header.h new file mode 100644 index 000000000000..f9d509c3fd70 --- /dev/null +++ b/arch/loongarch/include/asm/orc_header.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _ORC_HEADER_H +#define _ORC_HEADER_H + +#include +#include +#include + +/* + * The header is currently a 20-byte hash of the ORC entry definition; see + * scripts/orc_hash.sh. + */ +#define ORC_HEADER \ + __used __section(".orc_header") __aligned(4) \ + static const u8 orc_header[] = { ORC_HASH } + +#endif /* _ORC_HEADER_H */ diff --git a/arch/loongarch/include/asm/orc_lookup.h b/arch/loongarch/include/asm/orc_lookup.h new file mode 100644 index 000000000000..b02e6357def4 --- /dev/null +++ b/arch/loongarch/include/asm/orc_lookup.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. + */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ diff --git a/arch/loongarch/include/asm/orc_types.h b/arch/loongarch/include/asm/orc_types.h new file mode 100644 index 000000000000..caf1f71a1057 --- /dev/null +++ b/arch/loongarch/include/asm/orc_types.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. 
+ * + * The most commonly used base registers are SP and FP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous FP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_SP 2 +#define ORC_REG_FP 3 +#define ORC_REG_MAX 4 + +#define ORC_TYPE_UNDEFINED 0 +#define ORC_TYPE_END_OF_STACK 1 +#define ORC_TYPE_CALL 2 +#define ORC_TYPE_REGS 3 +#define ORC_TYPE_REGS_PARTIAL 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and FP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. + */ +struct orc_entry { + s16 sp_offset; + s16 fp_offset; + s16 ra_offset; + unsigned int sp_reg:4; + unsigned int fp_reg:4; + unsigned int ra_reg:4; + unsigned int type:3; + unsigned int signal:1; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 4fb1e6408b98..45b507a7b06f 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -13,6 +13,7 @@ #include #include #include +#include /* Make the addition of cfi info a little easier. */ .macro cfi_rel_offset reg offset=0 docfi=0 @@ -162,6 +163,7 @@ li.w t0, CSR_CRMD_WE csrxchg t0, t0, LOONGARCH_CSR_CRMD #endif + UNWIND_HINT_REGS .endm .macro SAVE_ALL docfi=0 @@ -219,6 +221,7 @@ .macro RESTORE_SP_AND_RET docfi=0 cfi_ld sp, PT_R3, \docfi + UNWIND_HINT_FUNC ertn .endm diff --git a/arch/loongarch/include/asm/unwind.h b/arch/loongarch/include/asm/unwind.h index b9dce87afd2e..40a6763c5aec 100644 --- a/arch/loongarch/include/asm/unwind.h +++ b/arch/loongarch/include/asm/unwind.h @@ -16,6 +16,7 @@ enum unwinder_type { UNWINDER_GUESS, UNWINDER_PROLOGUE, + UNWINDER_ORC, }; struct unwind_state { @@ -24,7 +25,7 @@ struct unwind_state { struct task_struct *task; bool first, error, reset; int graph_idx; - unsigned long sp, pc, ra; + unsigned long sp, fp, pc, ra; }; bool default_next_frame(struct unwind_state *state); @@ -61,14 +62,17 @@ static __always_inline void __unwind_start(struct unwind_state *state, state->sp = regs->regs[3]; state->pc = regs->csr_era; state->ra = regs->regs[1]; + state->fp = regs->regs[22]; } else if (task && task != current) { state->sp = thread_saved_fp(task); state->pc = thread_saved_ra(task); state->ra = 0; + state->fp = 0; } else { state->sp = (unsigned long)__builtin_frame_address(0); state->pc = (unsigned long)__builtin_return_address(0); state->ra = 0; + state->fp = 0; } state->task = task; get_stack_info(state->sp, state->task, &state->stack_info); @@ -77,6 +81,18 @@ static __always_inline void __unwind_start(struct unwind_state *state, static __always_inline unsigned long __unwind_get_return_address(struct unwind_state *state) { - return unwind_done(state) ? 0 : state->pc; + if (unwind_done(state)) + return 0; + + return __kernel_text_address(state->pc) ? 
state->pc : 0; } + +#ifdef CONFIG_UNWINDER_ORC +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} +#endif + #endif /* _ASM_UNWIND_H */ diff --git a/arch/loongarch/include/asm/unwind_hints.h b/arch/loongarch/include/asm/unwind_hints.h new file mode 100644 index 000000000000..a01086ad9dde --- /dev/null +++ b/arch/loongarch/include/asm/unwind_hints.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_LOONGARCH_UNWIND_HINTS_H +#define _ASM_LOONGARCH_UNWIND_HINTS_H + +#include +#include + +#ifdef __ASSEMBLY__ + +.macro UNWIND_HINT_UNDEFINED + UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED +.endm + +.macro UNWIND_HINT_END_OF_STACK + UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK +.endm + +.macro UNWIND_HINT_REGS + UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_REGS +.endm + +.macro UNWIND_HINT_FUNC + UNWIND_HINT sp_reg=ORC_REG_SP type=UNWIND_HINT_TYPE_CALL +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_LOONGARCH_UNWIND_HINTS_H */ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 3c808c680370..3a7620b66bc6 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -3,6 +3,8 @@ # Makefile for the Linux/LoongArch kernel. # +OBJECT_FILES_NON_STANDARD_head.o := y + extra-y := vmlinux.lds obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \ @@ -21,6 +23,7 @@ obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o CFLAGS_module.o += $(call cc-option,-Wno-override-init,) CFLAGS_syscall.o += $(call cc-option,-Wno-override-init,) +CFLAGS_traps.o += $(call cc-option,-Wno-override-init,) CFLAGS_perf_event.o += $(call cc-option,-Wno-override-init,) ifdef CONFIG_FUNCTION_TRACER @@ -62,6 +65,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_regs.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index 1ec8e4c4cc2b..48e7e34e355e 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -14,11 +14,13 @@ #include #include #include +#include .text .cfi_sections .debug_frame .align 5 SYM_CODE_START(handle_syscall) + UNWIND_HINT_UNDEFINED csrrd t0, PERCPU_BASE_KS la.pcrel t1, kernelsp add.d t1, t1, t0 @@ -57,6 +59,7 @@ SYM_CODE_START(handle_syscall) cfi_st fp, PT_R22 SAVE_STATIC + UNWIND_HINT_REGS #ifdef CONFIG_KGDB li.w t1, CSR_CRMD_WE @@ -75,6 +78,7 @@ SYM_CODE_END(handle_syscall) _ASM_NOKPROBE(handle_syscall) SYM_CODE_START(ret_from_fork) + UNWIND_HINT_REGS bl schedule_tail # a0 = struct task_struct *prev move a0, sp bl syscall_exit_to_user_mode @@ -84,6 +88,7 @@ SYM_CODE_START(ret_from_fork) SYM_CODE_END(ret_from_fork) SYM_CODE_START(ret_from_kernel_thread) + UNWIND_HINT_REGS bl schedule_tail # a0 = struct task_struct *prev move a0, s1 jirl ra, s0, 0 diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 4382e36ae3d4..69a85f2479fb 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -15,6 +15,7 @@ #include #include #include +#include #define FPU_REG_WIDTH 8 #define LSX_REG_WIDTH 16 @@ -526,3 +527,9 @@ SYM_FUNC_END(_restore_lasx_context) 
.L_fpu_fault: li.w a0, -EFAULT # failure jr ra + +#ifdef CONFIG_CPU_HAS_LBT +STACK_FRAME_NON_STANDARD _restore_fp +STACK_FRAME_NON_STANDARD _restore_lsx +STACK_FRAME_NON_STANDARD _restore_lasx +#endif diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 2bb3aa2dcfcb..86d5d90ebefe 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -32,6 +32,7 @@ SYM_FUNC_START(__arch_cpu_idle) SYM_FUNC_END(__arch_cpu_idle) SYM_CODE_START(handle_vint) + UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL la_abs t1, __arch_cpu_idle @@ -49,6 +50,7 @@ SYM_CODE_START(handle_vint) SYM_CODE_END(handle_vint) SYM_CODE_START(except_vec_cex) + UNWIND_HINT_UNDEFINED b cache_parity_error SYM_CODE_END(except_vec_cex) @@ -67,6 +69,7 @@ SYM_CODE_END(except_vec_cex) .macro BUILD_HANDLER exception handler prep .align 5 SYM_CODE_START(handle_\exception) + UNWIND_HINT_UNDEFINED 666: BACKUP_T0T1 SAVE_ALL @@ -77,7 +80,9 @@ SYM_CODE_END(except_vec_cex) 668: RESTORE_ALL_AND_RET SYM_CODE_END(handle_\exception) + .pushsection ".data", "aw", %progbits SYM_DATA(unwind_hint_\exception, .word 668b - 666b) + .popsection .endm BUILD_HANDLER ade ade badv @@ -94,6 +99,7 @@ SYM_CODE_END(except_vec_cex) BUILD_HANDLER reserved reserved none /* others */ SYM_CODE_START(handle_sys) + UNWIND_HINT_UNDEFINED la_abs t0, handle_syscall jr t0 SYM_CODE_END(handle_sys) diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S index 9c75120a26d8..001f061d226a 100644 --- a/arch/loongarch/kernel/lbt.S +++ b/arch/loongarch/kernel/lbt.S @@ -11,6 +11,7 @@ #include #include #include +#include #define SCR_REG_WIDTH 8 @@ -153,3 +154,5 @@ SYM_FUNC_END(_restore_ftop_context) .L_lbt_fault: li.w a0, -EFAULT # failure jr ra + +STACK_FRAME_NON_STANDARD _restore_ftop_context diff --git a/arch/loongarch/kernel/mcount_dyn.S b/arch/loongarch/kernel/mcount_dyn.S index 482aa553aa2d..0c65cf09110c 100644 --- a/arch/loongarch/kernel/mcount_dyn.S +++ b/arch/loongarch/kernel/mcount_dyn.S @@ -73,6 +73,7 @@ SYM_FUNC_START(ftrace_stub) SYM_FUNC_END(ftrace_stub) SYM_CODE_START(ftrace_common) + UNWIND_HINT_UNDEFINED PTR_ADDI a0, ra, -8 /* arg0: ip */ move a1, t0 /* arg1: parent_ip */ la.pcrel t1, function_trace_op @@ -113,12 +114,14 @@ ftrace_common_return: SYM_CODE_END(ftrace_common) SYM_CODE_START(ftrace_caller) + UNWIND_HINT_UNDEFINED ftrace_regs_entry allregs=0 b ftrace_common SYM_CODE_END(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS SYM_CODE_START(ftrace_regs_caller) + UNWIND_HINT_UNDEFINED ftrace_regs_entry allregs=1 b ftrace_common SYM_CODE_END(ftrace_regs_caller) @@ -126,6 +129,7 @@ SYM_CODE_END(ftrace_regs_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_CODE_START(ftrace_graph_caller) + UNWIND_HINT_UNDEFINED PTR_L a0, sp, PT_ERA PTR_ADDI a0, a0, -8 /* arg0: self_addr */ PTR_ADDI a1, sp, PT_R1 /* arg1: parent */ @@ -134,6 +138,7 @@ SYM_CODE_START(ftrace_graph_caller) SYM_CODE_END(ftrace_graph_caller) SYM_CODE_START(return_to_handler) + UNWIND_HINT_UNDEFINED /* Save return value regs */ PTR_ADDI sp, sp, -FGRET_REGS_SIZE PTR_S a0, sp, FGRET_REGS_A0 @@ -155,6 +160,7 @@ SYM_CODE_END(return_to_handler) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS SYM_CODE_START(ftrace_stub_direct_tramp) + UNWIND_HINT_UNDEFINED jr t0 SYM_CODE_END(ftrace_stub_direct_tramp) #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c index b13b2858fe39..c7d0338d12c1 100644 --- a/arch/loongarch/kernel/module.c +++ b/arch/loongarch/kernel/module.c @@ -20,6 +20,7 
@@ #include #include #include +#include static int rela_stack_push(s64 stack_value, s64 *rela_stack, size_t *rela_stack_top) { @@ -515,15 +516,28 @@ static void module_init_ftrace_plt(const Elf_Ehdr *hdr, int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { - const Elf_Shdr *s, *se; const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + const Elf_Shdr *s, *alt = NULL, *orc = NULL, *orc_ip = NULL, *ftrace = NULL; - for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) { + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrs + s->sh_name)) - apply_alternatives((void *)s->sh_addr, (void *)s->sh_addr + s->sh_size); + alt = s; + if (!strcmp(".orc_unwind", secstrs + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrs + s->sh_name)) + orc_ip = s; if (!strcmp(".ftrace_trampoline", secstrs + s->sh_name)) - module_init_ftrace_plt(hdr, s, mod); + ftrace = s; } + if (alt) + apply_alternatives((void *)alt->sh_addr, (void *)alt->sh_addr + alt->sh_size); + + if (orc && orc_ip) + unwind_module_init(mod, (void *)orc_ip->sh_addr, orc_ip->sh_size, (void *)orc->sh_addr, orc->sh_size); + + if (ftrace) + module_init_ftrace_plt(hdr, ftrace, mod); + return 0; } diff --git a/arch/loongarch/kernel/relocate_kernel.S b/arch/loongarch/kernel/relocate_kernel.S index f49f6b053763..84e6de2fd973 100644 --- a/arch/loongarch/kernel/relocate_kernel.S +++ b/arch/loongarch/kernel/relocate_kernel.S @@ -15,6 +15,7 @@ #include SYM_CODE_START(relocate_new_kernel) + UNWIND_HINT_UNDEFINED /* * a0: EFI boot flag for the new kernel * a1: Command line pointer for the new kernel @@ -90,6 +91,7 @@ SYM_CODE_END(relocate_new_kernel) * then start at the entry point from LOONGARCH_IOCSR_MBUF0. 
*/ SYM_CODE_START(kexec_smp_wait) + UNWIND_HINT_UNDEFINED 1: li.w t0, 0x100 /* wait for init loop */ 2: addi.w t0, t0, -1 /* limit mailbox access */ bnez t0, 2b @@ -106,6 +108,5 @@ SYM_CODE_END(kexec_smp_wait) relocate_new_kernel_end: -SYM_DATA_START(relocate_new_kernel_size) - PTR relocate_new_kernel_end - relocate_new_kernel -SYM_DATA_END(relocate_new_kernel_size) + .section ".data" +SYM_DATA(relocate_new_kernel_size, .long relocate_new_kernel_end - relocate_new_kernel) diff --git a/arch/loongarch/kernel/rethook_trampoline.S b/arch/loongarch/kernel/rethook_trampoline.S index bd5772c96338..d4ceb2fa2a5c 100644 --- a/arch/loongarch/kernel/rethook_trampoline.S +++ b/arch/loongarch/kernel/rethook_trampoline.S @@ -76,6 +76,7 @@ .endm SYM_CODE_START(arch_rethook_trampoline) + UNWIND_HINT_UNDEFINED addi.d sp, sp, -PT_SIZE save_all_base_regs diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 634ef17fd38b..7bf9afaeea00 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -47,6 +47,7 @@ #include #include #include +#include #define SMBIOS_BIOSSIZE_OFFSET 0x09 #define SMBIOS_BIOSEXTERN_OFFSET 0x13 @@ -587,6 +588,7 @@ static void __init prefill_possible_map(void) void __init setup_arch(char **cmdline_p) { cpu_probe(); + unwind_init(); init_environ(); efi_init(); diff --git a/arch/loongarch/kernel/stacktrace.c b/arch/loongarch/kernel/stacktrace.c index f623feb2129f..eaec82e02c92 100644 --- a/arch/loongarch/kernel/stacktrace.c +++ b/arch/loongarch/kernel/stacktrace.c @@ -29,6 +29,7 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, regs->csr_era = thread_saved_ra(task); } regs->regs[1] = 0; + regs->regs[22] = 0; } for (unwind_start(&state, task, regs); diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index aebfc3733a76..f9f4eb00c92e 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -53,6 +53,32 @@ #include "access-helper.h" +void *exception_table[EXCCODE_INT_START] = { + [0 ... 
EXCCODE_INT_START - 1] = handle_reserved, + + [EXCCODE_TLBI] = handle_tlb_load, + [EXCCODE_TLBL] = handle_tlb_load, + [EXCCODE_TLBS] = handle_tlb_store, + [EXCCODE_TLBM] = handle_tlb_modify, + [EXCCODE_TLBNR] = handle_tlb_protect, + [EXCCODE_TLBNX] = handle_tlb_protect, + [EXCCODE_TLBPE] = handle_tlb_protect, + [EXCCODE_ADE] = handle_ade, + [EXCCODE_ALE] = handle_ale, + [EXCCODE_BCE] = handle_bce, + [EXCCODE_SYS] = handle_sys, + [EXCCODE_BP] = handle_bp, + [EXCCODE_INE] = handle_ri, + [EXCCODE_IPE] = handle_ri, + [EXCCODE_FPDIS] = handle_fpu, + [EXCCODE_LSXDIS] = handle_lsx, + [EXCCODE_LASXDIS] = handle_lasx, + [EXCCODE_FPE] = handle_fpe, + [EXCCODE_WATCH] = handle_watch, + [EXCCODE_BTDIS] = handle_lbt, +}; +EXPORT_SYMBOL_GPL(exception_table); + static void show_backtrace(struct task_struct *task, const struct pt_regs *regs, const char *loglvl, bool user) { @@ -1150,19 +1176,9 @@ void __init trap_init(void) for (i = EXCCODE_INT_START; i <= EXCCODE_INT_END; i++) set_handler(i * VECSIZE, handle_vint, VECSIZE); - set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE); - set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE); - set_handler(EXCCODE_BCE * VECSIZE, handle_bce, VECSIZE); - set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE); - set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE); - set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE); - set_handler(EXCCODE_IPE * VECSIZE, handle_ri, VECSIZE); - set_handler(EXCCODE_FPDIS * VECSIZE, handle_fpu, VECSIZE); - set_handler(EXCCODE_LSXDIS * VECSIZE, handle_lsx, VECSIZE); - set_handler(EXCCODE_LASXDIS * VECSIZE, handle_lasx, VECSIZE); - set_handler(EXCCODE_FPE * VECSIZE, handle_fpe, VECSIZE); - set_handler(EXCCODE_BTDIS * VECSIZE, handle_lbt, VECSIZE); - set_handler(EXCCODE_WATCH * VECSIZE, handle_watch, VECSIZE); + /* Set exception vector handler */ + for (i = EXCCODE_ADE; i <= EXCCODE_BTDIS; i++) + set_handler(i * VECSIZE, exception_table[i], VECSIZE); cache_error_setup(); diff --git a/arch/loongarch/kernel/unwind_orc.c b/arch/loongarch/kernel/unwind_orc.c new file mode 100644 index 000000000000..b25722876331 --- /dev/null +++ b/arch/loongarch/kernel/unwind_orc.c @@ -0,0 +1,528 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +ORC_HEADER; + +#define orc_warn(fmt, ...) \ + printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) + +extern int __start_orc_unwind_ip[]; +extern int __stop_orc_unwind_ip[]; +extern struct orc_entry __start_orc_unwind[]; +extern struct orc_entry __stop_orc_unwind[]; + +static bool orc_init __ro_after_init; +static unsigned int lookup_num_blocks __ro_after_init; + +/* Fake frame pointer entry -- used as a fallback for generated code */ +static struct orc_entry orc_fp_entry = { + .sp_reg = ORC_REG_FP, + .sp_offset = 16, + .fp_reg = ORC_REG_PREV_SP, + .fp_offset = -16, + .ra_reg = ORC_REG_PREV_SP, + .ra_offset = -8, + .type = ORC_TYPE_CALL +}; + +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. 
+ */ +static struct orc_entry orc_null_entry = { + .sp_reg = ORC_REG_SP, + .sp_offset = sizeof(long), + .fp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + +static inline unsigned long orc_ip(const int *ip) +{ + return (unsigned long)ip + *ip; +} + +static struct orc_entry *__orc_find(int *ip_table, struct orc_entry *u_table, + unsigned int num_entries, unsigned long ip) +{ + int *first = ip_table; + int *mid = first, *found = first; + int *last = ip_table + num_entries - 1; + + if (!num_entries) + return NULL; + + /* + * Do a binary range search to find the rightmost duplicate of a given + * starting address. Some entries are section terminators which are + * "weak" entries for ensuring there are no gaps. They should be + * ignored when they conflict with a real entry. + */ + while (first <= last) { + mid = first + ((last - first) / 2); + + if (orc_ip(mid) <= ip) { + found = mid; + first = mid + 1; + } else + last = mid - 1; + } + + return u_table + (found - ip_table); +} + +#ifdef CONFIG_MODULES +static struct orc_entry *orc_module_find(unsigned long ip) +{ + struct module *mod; + + mod = __module_address(ip); + if (!mod || !mod->arch.orc_unwind || !mod->arch.orc_unwind_ip) + return NULL; + + return __orc_find(mod->arch.orc_unwind_ip, mod->arch.orc_unwind, mod->arch.num_orcs, ip); +} +#else +static struct orc_entry *orc_module_find(unsigned long ip) +{ + return NULL; +} +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE +static struct orc_entry *orc_find(unsigned long ip); + +/* + * Ftrace dynamic trampolines do not have orc entries of their own. + * But they are copies of the ftrace entries that are static and + * defined in ftrace_*.S, which do have orc entries. + * + * If the unwinder comes across a ftrace trampoline, then find the + * ftrace function that was used to create it, and use that ftrace + * function's orc entry, as the placement of the return code in + * the stack will be identical. 
+ */ +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + struct ftrace_ops *ops; + unsigned long tramp_addr, offset; + + ops = ftrace_ops_trampoline(ip); + if (!ops) + return NULL; + + /* Set tramp_addr to the start of the code copied by the trampoline */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + tramp_addr = (unsigned long)ftrace_regs_caller; + else + tramp_addr = (unsigned long)ftrace_caller; + + /* Now place tramp_addr to the location within the trampoline ip is at */ + offset = ip - ops->trampoline; + tramp_addr += offset; + + /* Prevent unlikely recursion */ + if (ip == tramp_addr) + return NULL; + + return orc_find(tramp_addr); +} +#else +static struct orc_entry *orc_ftrace_find(unsigned long ip) +{ + return NULL; +} +#endif + +static struct orc_entry *orc_find(unsigned long ip) +{ + static struct orc_entry *orc; + + if (ip == 0) + return &orc_null_entry; + + /* For non-init vmlinux addresses, use the fast lookup table: */ + if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { + unsigned int idx, start, stop; + + idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE; + + if (unlikely((idx >= lookup_num_blocks-1))) { + orc_warn("WARNING: bad lookup idx: idx=%u num=%u ip=%pB\n", + idx, lookup_num_blocks, (void *)ip); + return NULL; + } + + start = orc_lookup[idx]; + stop = orc_lookup[idx + 1] + 1; + + if (unlikely((__start_orc_unwind + start >= __stop_orc_unwind) || + (__start_orc_unwind + stop > __stop_orc_unwind))) { + orc_warn("WARNING: bad lookup value: idx=%u num=%u start=%u stop=%u ip=%pB\n", + idx, lookup_num_blocks, start, stop, (void *)ip); + return NULL; + } + + return __orc_find(__start_orc_unwind_ip + start, + __start_orc_unwind + start, stop - start, ip); + } + + /* vmlinux .init slow lookup: */ + if (is_kernel_inittext(ip)) + return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); + + /* Module lookup: */ + orc = orc_module_find(ip); + if (orc) + return orc; + + return orc_ftrace_find(ip); +} + +#ifdef CONFIG_MODULES + +static DEFINE_MUTEX(sort_mutex); +static int *cur_orc_ip_table = __start_orc_unwind_ip; +static struct orc_entry *cur_orc_table = __start_orc_unwind; + +static void orc_sort_swap(void *_a, void *_b, int size) +{ + int delta = _b - _a; + int *a = _a, *b = _b, tmp; + struct orc_entry *orc_a, *orc_b; + + /* Swap the .orc_unwind_ip entries: */ + tmp = *a; + *a = *b + delta; + *b = tmp - delta; + + /* Swap the corresponding .orc_unwind entries: */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + orc_b = cur_orc_table + (b - cur_orc_ip_table); + swap(*orc_a, *orc_b); +} + +static int orc_sort_cmp(const void *_a, const void *_b) +{ + const int *a = _a, *b = _b; + unsigned long a_val = orc_ip(a); + unsigned long b_val = orc_ip(b); + struct orc_entry *orc_a; + + if (a_val > b_val) + return 1; + if (a_val < b_val) + return -1; + + /* + * The "weak" section terminator entries need to always be first + * to ensure the lookup code skips them in favor of real entries. + * These terminator entries exist to handle any gaps created by + * whitelisted .o files which didn't get objtool generation. + */ + orc_a = cur_orc_table + (a - cur_orc_ip_table); + + return orc_a->type == ORC_TYPE_UNDEFINED ? 
-1 : 1; +} + +void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size, + void *_orc, size_t orc_size) +{ + int *orc_ip = _orc_ip; + struct orc_entry *orc = _orc; + unsigned int num_entries = orc_ip_size / sizeof(int); + + WARN_ON_ONCE(orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(*orc) != 0 || + num_entries != orc_size / sizeof(*orc)); + + /* + * The 'cur_orc_*' globals allow the orc_sort_swap() callback to + * associate an .orc_unwind_ip table entry with its corresponding + * .orc_unwind entry so they can both be swapped. + */ + mutex_lock(&sort_mutex); + cur_orc_ip_table = orc_ip; + cur_orc_table = orc; + sort(orc_ip, num_entries, sizeof(int), orc_sort_cmp, orc_sort_swap); + mutex_unlock(&sort_mutex); + + mod->arch.orc_unwind_ip = orc_ip; + mod->arch.orc_unwind = orc; + mod->arch.num_orcs = num_entries; +} +#endif + +void __init unwind_init(void) +{ + int i; + size_t orc_size = (void *)__stop_orc_unwind - (void *)__start_orc_unwind; + size_t orc_ip_size = (void *)__stop_orc_unwind_ip - (void *)__start_orc_unwind_ip; + size_t num_entries = orc_ip_size / sizeof(int); + struct orc_entry *orc; + + if (!num_entries || orc_ip_size % sizeof(int) != 0 || + orc_size % sizeof(struct orc_entry) != 0 || + num_entries != orc_size / sizeof(struct orc_entry)) { + orc_warn("WARNING: Bad or missing .orc_unwind table. Disabling unwinder.\n"); + return; + } + + /* + * Note, the orc_unwind and orc_unwind_ip tables were already + * sorted at build time via the 'sorttable' tool. + * It's ready for binary search straight away, no need to sort it. + */ + + /* Initialize the fast lookup table: */ + lookup_num_blocks = orc_lookup_end - orc_lookup; + for (i = 0; i < lookup_num_blocks-1; i++) { + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, + num_entries, LOOKUP_START_IP + (LOOKUP_BLOCK_SIZE * i)); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. Disabling unwinder.\n"); + return; + } + + orc_lookup[i] = orc - __start_orc_unwind; + } + + /* Initialize the ending block: */ + orc = __orc_find(__start_orc_unwind_ip, __start_orc_unwind, num_entries, LOOKUP_STOP_IP); + if (!orc) { + orc_warn("WARNING: Corrupt .orc_unwind table. 
Disabling unwinder.\n"); + return; + } + orc_lookup[lookup_num_blocks-1] = orc - __start_orc_unwind; + + orc_init = true; +} + +static inline bool on_stack(struct stack_info *info, unsigned long addr, size_t len) +{ + unsigned long begin = info->begin; + unsigned long end = info->end; + + return (info->type != STACK_TYPE_UNKNOWN && + addr >= begin && addr < end && addr + len > begin && addr + len <= end); +} + +static bool stack_access_ok(struct unwind_state *state, unsigned long addr, size_t len) +{ + struct stack_info *info = &state->stack_info; + + if (on_stack(info, addr, len)) + return true; + + return !get_stack_info(addr, state->task, info) && on_stack(info, addr, len); +} + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + return __unwind_get_return_address(state); +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs) +{ + __unwind_start(state, task, regs); + state->type = UNWINDER_ORC; + if (!unwind_done(state) && !__kernel_text_address(state->pc)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(unwind_start); + +static bool is_entry_func(unsigned long addr) +{ + extern u32 kernel_entry; + extern u32 kernel_entry_end; + + return addr >= (unsigned long)&kernel_entry && addr < (unsigned long)&kernel_entry_end; +} + +static inline unsigned long bt_address(unsigned long ra) +{ + extern unsigned long eentry; + + if (__kernel_text_address(ra)) + return ra; + + if (__module_text_address(ra)) + return ra; + + if (ra >= eentry && ra < eentry + EXCCODE_INT_END * VECSIZE) { + unsigned long func; + unsigned long type = (ra - eentry) / VECSIZE; + unsigned long offset = (ra - eentry) % VECSIZE; + + switch (type) { + case 0 ... EXCCODE_INT_START - 1: + func = (unsigned long)exception_table[type]; + break; + case EXCCODE_INT_START ... EXCCODE_INT_END: + func = (unsigned long)handle_vint; + break; + default: + func = (unsigned long)handle_reserved; + break; + } + + return func + offset; + } + + return ra; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *p, pc; + struct pt_regs *regs; + struct orc_entry *orc; + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + /* Don't let modules unload while we're reading their ORC data. */ + preempt_disable(); + + if (is_entry_func(state->pc)) + goto end; + + orc = orc_find(state->pc); + if (!orc) { + /* + * As a fallback, try to assume this code uses a frame pointer. + * This is useful for generated code, like BPF, which ORC + * doesn't know about. This is just a guess, so the rest of + * the unwind is no longer considered reliable. + */ + orc = &orc_fp_entry; + state->error = true; + } else { + if (orc->type == ORC_TYPE_UNDEFINED) + goto err; + + if (orc->type == ORC_TYPE_END_OF_STACK) + goto end; + } + + switch (orc->sp_reg) { + case ORC_REG_SP: + if (info->type == STACK_TYPE_IRQ && state->sp == info->end) + orc->type = ORC_TYPE_REGS; + else + state->sp = state->sp + orc->sp_offset; + break; + case ORC_REG_FP: + state->sp = state->fp; + break; + default: + orc_warn("unknown SP base reg %d at %pB\n", orc->sp_reg, (void *)state->pc); + goto err; + } + + switch (orc->fp_reg) { + case ORC_REG_PREV_SP: + p = (unsigned long *)(state->sp + orc->fp_offset); + if (!stack_access_ok(state, (unsigned long)p, sizeof(unsigned long))) + goto err; + + state->fp = *p; + break; + case ORC_REG_UNDEFINED: + /* Nothing. 
*/ + break; + default: + orc_warn("unknown FP base reg %d at %pB\n", orc->fp_reg, (void *)state->pc); + goto err; + } + + switch (orc->type) { + case ORC_TYPE_CALL: + if (orc->ra_reg == ORC_REG_PREV_SP) { + p = (unsigned long *)(state->sp + orc->ra_offset); + if (!stack_access_ok(state, (unsigned long)p, sizeof(unsigned long))) + goto err; + + pc = unwind_graph_addr(state, *p, state->sp); + pc -= LOONGARCH_INSN_SIZE; + } else if (orc->ra_reg == ORC_REG_UNDEFINED) { + if (!state->ra || state->ra == state->pc) + goto err; + + pc = unwind_graph_addr(state, state->ra, state->sp); + pc -= LOONGARCH_INSN_SIZE; + state->ra = 0; + } else { + orc_warn("unknown ra base reg %d at %pB\n", orc->ra_reg, (void *)state->pc); + goto err; + } + break; + case ORC_TYPE_REGS: + if (info->type == STACK_TYPE_IRQ && state->sp == info->end) + regs = (struct pt_regs *)info->next_sp; + else + regs = (struct pt_regs *)state->sp; + + if (!stack_access_ok(state, (unsigned long)regs, sizeof(*regs))) + goto err; + + if ((info->end == (unsigned long)regs + sizeof(*regs)) && + !regs->regs[3] && !regs->regs[1]) + goto end; + + if (user_mode(regs)) + goto end; + + pc = regs->csr_era; + if (!__kernel_text_address(pc)) + goto err; + + state->sp = regs->regs[3]; + state->ra = regs->regs[1]; + state->fp = regs->regs[22]; + get_stack_info(state->sp, state->task, info); + + break; + default: + orc_warn("unknown .orc_unwind entry type %d at %pB\n", orc->type, (void *)state->pc); + goto err; + } + + state->pc = bt_address(pc); + if (!state->pc) { + pr_err("cannot find unwind pc at %pK\n", (void *)pc); + goto err; + } + + if (!__kernel_text_address(state->pc)) + goto err; + + preempt_enable(); + return true; + +err: + state->error = true; + +end: + preempt_enable(); + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S index a5d0cd2035da..e8e97dbf9ca4 100644 --- a/arch/loongarch/kernel/vmlinux.lds.S +++ b/arch/loongarch/kernel/vmlinux.lds.S @@ -2,6 +2,7 @@ #include #include #include +#include #define PAGE_SIZE _PAGE_SIZE #define RO_EXCEPTION_TABLE_ALIGN 4 @@ -122,6 +123,8 @@ SECTIONS } #endif + ORC_UNWIND_TABLE + .sdata : { *(.sdata) } diff --git a/arch/loongarch/kvm/switch.S b/arch/loongarch/kvm/switch.S index ba976509bfe8..1fcc4b7eda32 100644 --- a/arch/loongarch/kvm/switch.S +++ b/arch/loongarch/kvm/switch.S @@ -8,7 +8,7 @@ #include #include #include -#include +#include #define HGPR_OFFSET(x) (PT_R0 + 8*x) #define GGPR_OFFSET(x) (KVM_ARCH_GGPR + 8*x) @@ -112,6 +112,7 @@ .text .cfi_sections .debug_frame SYM_CODE_START(kvm_exc_entry) + UNWIND_HINT_UNDEFINED csrwr a2, KVM_TEMP_KS csrrd a2, KVM_VCPU_KS addi.d a2, a2, KVM_VCPU_ARCH @@ -279,3 +280,9 @@ SYM_FUNC_END(kvm_restore_lasx) .section ".rodata" SYM_DATA(kvm_exception_size, .quad kvm_exc_entry_end - kvm_exc_entry) SYM_DATA(kvm_enter_guest_size, .quad kvm_enter_guest_end - kvm_enter_guest) + +#ifdef CONFIG_CPU_HAS_LBT +STACK_FRAME_NON_STANDARD kvm_restore_fpu +STACK_FRAME_NON_STANDARD kvm_restore_lsx +STACK_FRAME_NON_STANDARD kvm_restore_lasx +#endif diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S index be741544e62b..7a0db643b286 100644 --- a/arch/loongarch/lib/clear_user.S +++ b/arch/loongarch/lib/clear_user.S @@ -10,6 +10,7 @@ #include #include #include +#include SYM_FUNC_START(__clear_user) /* @@ -204,3 +205,5 @@ SYM_FUNC_START(__clear_user_fast) _asm_extable 28b, .Lsmall_fixup _asm_extable 29b, .Lexit 
SYM_FUNC_END(__clear_user_fast) + +STACK_FRAME_NON_STANDARD __clear_user_fast diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S index feec3d362803..095ce9181c6c 100644 --- a/arch/loongarch/lib/copy_user.S +++ b/arch/loongarch/lib/copy_user.S @@ -10,6 +10,7 @@ #include #include #include +#include SYM_FUNC_START(__copy_user) /* @@ -278,3 +279,5 @@ SYM_FUNC_START(__copy_user_fast) _asm_extable 58b, .Lexit _asm_extable 59b, .Lexit SYM_FUNC_END(__copy_user_fast) + +STACK_FRAME_NON_STANDARD __copy_user_fast diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S index fa1148878d2b..9517a2f961af 100644 --- a/arch/loongarch/lib/memcpy.S +++ b/arch/loongarch/lib/memcpy.S @@ -9,6 +9,7 @@ #include #include #include +#include .section .noinstr.text, "ax" @@ -197,3 +198,5 @@ SYM_FUNC_START(__memcpy_fast) jr ra SYM_FUNC_END(__memcpy_fast) _ASM_NOKPROBE(__memcpy_fast) + +STACK_FRAME_NON_STANDARD __memcpy_small diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S index 06d3ca54cbfe..df3846620553 100644 --- a/arch/loongarch/lib/memset.S +++ b/arch/loongarch/lib/memset.S @@ -9,6 +9,7 @@ #include #include #include +#include .macro fill_to_64 r0 bstrins.d \r0, \r0, 15, 8 @@ -166,3 +167,5 @@ SYM_FUNC_START(__memset_fast) jr ra SYM_FUNC_END(__memset_fast) _ASM_NOKPROBE(__memset_fast) + +STACK_FRAME_NON_STANDARD __memset_fast diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index 0b95d32b30c9..5ac9beb5f093 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -9,8 +9,9 @@ #include #include -#include #include +#include +#include #include #include #include @@ -266,24 +267,20 @@ static void setup_tlb_handler(int cpu) setup_ptwalker(); local_flush_tlb_all(); + if (cpu_has_ptw) { + exception_table[EXCCODE_TLBI] = handle_tlb_load_ptw; + exception_table[EXCCODE_TLBL] = handle_tlb_load_ptw; + exception_table[EXCCODE_TLBS] = handle_tlb_store_ptw; + exception_table[EXCCODE_TLBM] = handle_tlb_modify_ptw; + } + /* The tlb handlers are generated only once */ if (cpu == 0) { memcpy((void *)tlbrentry, handle_tlb_refill, 0x80); local_flush_icache_range(tlbrentry, tlbrentry + 0x80); - if (!cpu_has_ptw) { - set_handler(EXCCODE_TLBI * VECSIZE, handle_tlb_load, VECSIZE); - set_handler(EXCCODE_TLBL * VECSIZE, handle_tlb_load, VECSIZE); - set_handler(EXCCODE_TLBS * VECSIZE, handle_tlb_store, VECSIZE); - set_handler(EXCCODE_TLBM * VECSIZE, handle_tlb_modify, VECSIZE); - } else { - set_handler(EXCCODE_TLBI * VECSIZE, handle_tlb_load_ptw, VECSIZE); - set_handler(EXCCODE_TLBL * VECSIZE, handle_tlb_load_ptw, VECSIZE); - set_handler(EXCCODE_TLBS * VECSIZE, handle_tlb_store_ptw, VECSIZE); - set_handler(EXCCODE_TLBM * VECSIZE, handle_tlb_modify_ptw, VECSIZE); - } - set_handler(EXCCODE_TLBNR * VECSIZE, handle_tlb_protect, VECSIZE); - set_handler(EXCCODE_TLBNX * VECSIZE, handle_tlb_protect, VECSIZE); - set_handler(EXCCODE_TLBPE * VECSIZE, handle_tlb_protect, VECSIZE); + + for (int i = EXCCODE_TLBL; i <= EXCCODE_TLBPE; i++) + set_handler(i * VECSIZE, exception_table[i], VECSIZE); } else { int vec_sz __maybe_unused; void *addr __maybe_unused; diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index d5d682f3d29f..a44387b838af 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -18,6 +18,7 @@ .macro tlb_do_page_fault, write SYM_CODE_START(tlb_do_page_fault_\write) + UNWIND_HINT_UNDEFINED SAVE_ALL csrrd a2, LOONGARCH_CSR_BADV move a0, sp @@ -32,6 +33,7 @@ tlb_do_page_fault 1 SYM_CODE_START(handle_tlb_protect) + 
UNWIND_HINT_UNDEFINED BACKUP_T0T1 SAVE_ALL move a0, sp @@ -44,6 +46,7 @@ SYM_CODE_START(handle_tlb_protect) SYM_CODE_END(handle_tlb_protect) SYM_CODE_START(handle_tlb_load) + UNWIND_HINT_UNDEFINED csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -190,6 +193,7 @@ nopage_tlb_load: SYM_CODE_END(handle_tlb_load) SYM_CODE_START(handle_tlb_load_ptw) + UNWIND_HINT_UNDEFINED csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_0 @@ -197,6 +201,7 @@ SYM_CODE_START(handle_tlb_load_ptw) SYM_CODE_END(handle_tlb_load_ptw) SYM_CODE_START(handle_tlb_store) + UNWIND_HINT_UNDEFINED csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -346,6 +351,7 @@ nopage_tlb_store: SYM_CODE_END(handle_tlb_store) SYM_CODE_START(handle_tlb_store_ptw) + UNWIND_HINT_UNDEFINED csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_1 @@ -353,6 +359,7 @@ SYM_CODE_START(handle_tlb_store_ptw) SYM_CODE_END(handle_tlb_store_ptw) SYM_CODE_START(handle_tlb_modify) + UNWIND_HINT_UNDEFINED csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 csrwr ra, EXCEPTION_KS2 @@ -500,6 +507,7 @@ nopage_tlb_modify: SYM_CODE_END(handle_tlb_modify) SYM_CODE_START(handle_tlb_modify_ptw) + UNWIND_HINT_UNDEFINED csrwr t0, LOONGARCH_CSR_KS0 csrwr t1, LOONGARCH_CSR_KS1 la_abs t0, tlb_do_page_fault_1 @@ -507,6 +515,7 @@ SYM_CODE_START(handle_tlb_modify_ptw) SYM_CODE_END(handle_tlb_modify_ptw) SYM_CODE_START(handle_tlb_refill) + UNWIND_HINT_UNDEFINED csrwr t0, LOONGARCH_CSR_TLBRSAVE csrrd t0, LOONGARCH_CSR_PGD lddir t0, t0, 3 diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile index f597cd08a96b..75c6726382c3 100644 --- a/arch/loongarch/vdso/Makefile +++ b/arch/loongarch/vdso/Makefile @@ -4,6 +4,7 @@ KASAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n +OBJECT_FILES_NON_STANDARD := y # Include the generic Makefile to check the built vdso. 
include $(srctree)/lib/vdso/Makefile diff --git a/include/linux/compiler.h b/include/linux/compiler.h index bb1339c7057b..39f2d4a05208 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -116,6 +116,14 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n +#define __annotate_reachable(c) ({ \ + asm volatile(__stringify_label(c) ":\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long " __stringify_label(c) "b - .\n\t" \ + ".popsection\n\t"); \ +}) +#define annotate_reachable() __annotate_reachable(__COUNTER__) + #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -128,6 +136,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define __annotate_jump_table __section(".rodata..c_jump_table") #else /* !CONFIG_OBJTOOL */ +#define annotate_reachable() #define annotate_unreachable() #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ diff --git a/scripts/Makefile b/scripts/Makefile index 576cf64be667..e4cca53d2285 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -31,9 +31,12 @@ HOSTLDLIBS_sign-file = $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null | ifdef CONFIG_UNWINDER_ORC ifeq ($(ARCH),x86_64) -ARCH := x86 +SRCARCH := x86 endif -HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/x86/include +ifeq ($(ARCH),loongarch) +SRCARCH := loongarch +endif +HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/$(SRCARCH)/include HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED endif From 199cc14cb4f1cb8668be45f67af41755ed5f0175 Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Mon, 11 Mar 2024 22:23:47 +0800 Subject: [PATCH 145/496] LoongArch: Add kernel livepatching support The arch-specified function ftrace_regs_set_instruction_pointer() has been implemented in arch/loongarch/include/asm/ftrace.h, so here only implement arch_stack_walk_reliable() function. 
Here are the test logs: [root@linux fedora]# cat /proc/cmdline BOOT_IMAGE=/vmlinuz-6.8.0-rc2 root=/dev/sda3 [root@linux fedora]# modprobe livepatch-sample [root@linux fedora]# cat /proc/cmdline this has been live patched [root@linux fedora]# echo 0 > /sys/kernel/livepatch/livepatch_sample/enabled [root@linux fedora]# rmmod livepatch_sample [root@linux fedora]# cat /proc/cmdline BOOT_IMAGE=/vmlinuz-6.8.0-rc2 root=/dev/sda3 [root@linux fedora]# dmesg -t | tail -5 livepatch: enabling patch 'livepatch_sample' livepatch: 'livepatch_sample': starting patching transition livepatch: 'livepatch_sample': patching complete livepatch: 'livepatch_sample': starting unpatching transition livepatch: 'livepatch_sample': unpatching complete Signed-off-by: Jinyang He Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 4 +++ arch/loongarch/include/asm/thread_info.h | 2 ++ arch/loongarch/kernel/stacktrace.c | 40 ++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 8d6725115ac6..99a0a15ce5f7 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -134,6 +134,7 @@ config LOONGARCH select HAVE_KPROBES_ON_FTRACE select HAVE_KRETPROBES select HAVE_KVM + select HAVE_LIVEPATCH select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OBJTOOL if AS_HAS_EXPLICIT_RELOCS @@ -143,6 +144,7 @@ config LOONGARCH select HAVE_PERF_USER_STACK_DUMP select HAVE_PREEMPT_DYNAMIC_KEY select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC select HAVE_RETHOOK select HAVE_RSEQ select HAVE_RUST @@ -636,6 +638,8 @@ config RANDOMIZE_BASE_MAX_OFFSET This is limited by the size of the lower address memory, 256MB. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_SELECT_MEMORY_MODEL diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h index 8cb653d49a54..8bf0e6f51546 100644 --- a/arch/loongarch/include/asm/thread_info.h +++ b/arch/loongarch/include/asm/thread_info.h @@ -86,6 +86,7 @@ register unsigned long current_stack_pointer __asm__("$sp"); #define TIF_LASX_CTX_LIVE 18 /* LASX context must be preserved */ #define TIF_USEDLBT 19 /* LBT was used by this task this quantum (SMP) */ #define TIF_LBT_CTX_LIVE 20 /* LBT context must be preserved */ +#define TIF_PATCH_PENDING 21 /* pending live patching update */ #define _TIF_SIGPENDING (1<regs[3] = (unsigned long)__builtin_frame_address(0); + regs->csr_era = (unsigned long)__builtin_return_address(0); + } else { + regs->regs[3] = thread_saved_fp(task); + regs->csr_era = thread_saved_ra(task); + } + regs->regs[1] = 0; + regs->regs[22] = 0; + + for (unwind_start(&state, task, regs); + !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know about. 
+ */ + if (!addr) + return -EINVAL; + + if (!consume_entry(cookie, addr)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) + return -EINVAL; + + return 0; +} + static int copy_stack_frame(unsigned long fp, struct stack_frame *frame) { From db185362fca554b201e2c62beb15a02bb39a064b Mon Sep 17 00:00:00 2001 From: M Cooley Date: Fri, 8 Mar 2024 17:35:40 -0500 Subject: [PATCH 146/496] ASoC: amd: yc: Fix non-functional mic on ASUS M7600RE The ASUS M7600RE (Vivobook Pro 16X OLED) needs a quirks-table entry for the internal microphone to function properly. Signed-off-by: Mitch Cooley Link: https://msgid.link/r/CALijGznExWW4fujNWwMzmn_K=wo96sGzV_2VkT7NjvEUdkg7Gw@mail.gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index cc231185d72c..384217c5eeeb 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -304,6 +304,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "E1504FA"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "M7600RE"), + } + }, { .driver_data = &acp6x_card, .matches = { From b36e78b216e632d90138751e4ff80044de303656 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 26 Feb 2024 12:25:01 +0100 Subject: [PATCH 147/496] ARM: 9354/1: ptrace: Use bitfield helpers The isa_mode() macro extracts two fields, and recombines them into a single value. Make this more obvious by using the FIELD_GET() helper, and shifting the result into its final resting place. Signed-off-by: Geert Uytterhoeven Reviewed-by: Oleg Nesterov Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/ptrace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 7f44e88d1f25..14a38cc67e0b 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -10,6 +10,7 @@ #include #ifndef __ASSEMBLY__ +#include #include struct pt_regs { @@ -35,8 +36,8 @@ struct svc_pt_regs { #ifndef CONFIG_CPU_V7M #define isa_mode(regs) \ - ((((regs)->ARM_cpsr & PSR_J_BIT) >> (__ffs(PSR_J_BIT) - 1)) | \ - (((regs)->ARM_cpsr & PSR_T_BIT) >> (__ffs(PSR_T_BIT)))) + (FIELD_GET(PSR_J_BIT, (regs)->ARM_cpsr) << 1 | \ + FIELD_GET(PSR_T_BIT, (regs)->ARM_cpsr)) #else #define isa_mode(regs) 1 /* Thumb */ #endif From 0c66c6f4e21cb22220cbd8821c5c73fc157d20dc Mon Sep 17 00:00:00 2001 From: Yongqiang Liu Date: Thu, 7 Mar 2024 13:05:09 +0100 Subject: [PATCH 148/496] ARM: 9359/1: flush: check if the folio is reserved for no-mapping addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit a4d5613c4dc6 ("arm: extend pfn_valid to take into account freed memory map alignment") changes the semantics of pfn_valid() to check presence of the memory map for a PFN. A valid page for an address which is reserved but not mapped by the kernel[1], the system crashed during some uio test with the following memory layout: node 0: [mem 0x00000000c0a00000-0x00000000cc8fffff] node 0: [mem 0x00000000d0000000-0x00000000da1fffff] the uio layout is:0xc0900000, 0x100000 the crash backtrace like: Unable to handle kernel paging request at virtual address bff00000 [...] 
CPU: 1 PID: 465 Comm: startapp.bin Tainted: G O 5.10.0 #1 Hardware name: Generic DT based system PC is at b15_flush_kern_dcache_area+0x24/0x3c LR is at __sync_icache_dcache+0x6c/0x98 [...] (b15_flush_kern_dcache_area) from (__sync_icache_dcache+0x6c/0x98) (__sync_icache_dcache) from (set_pte_at+0x28/0x54) (set_pte_at) from (remap_pfn_range+0x1a0/0x274) (remap_pfn_range) from (uio_mmap+0x184/0x1b8 [uio]) (uio_mmap [uio]) from (__mmap_region+0x264/0x5f4) (__mmap_region) from (__do_mmap_mm+0x3ec/0x440) (__do_mmap_mm) from (do_mmap+0x50/0x58) (do_mmap) from (vm_mmap_pgoff+0xfc/0x188) (vm_mmap_pgoff) from (ksys_mmap_pgoff+0xac/0xc4) (ksys_mmap_pgoff) from (ret_fast_syscall+0x0/0x5c) Code: e0801001 e2423001 e1c00003 f57ff04f (ee070f3e) ---[ end trace 09cf0734c3805d52 ]--- Kernel panic - not syncing: Fatal exception So check if PG_reserved was set to solve this issue. [1]: https://lore.kernel.org/lkml/Zbtdue57RO0QScJM@linux.ibm.com/ Fixes: a4d5613c4dc6 ("arm: extend pfn_valid to take into account freed memory map alignment") Suggested-by: Mike Rapoport Signed-off-by: Yongqiang Liu Signed-off-by: Russell King (Oracle) --- arch/arm/mm/flush.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index d19d140a10c7..0749cf8a6637 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -296,6 +296,9 @@ void __sync_icache_dcache(pte_t pteval) return; folio = page_folio(pfn_to_page(pfn)); + if (folio_test_reserved(folio)) + return; + if (cache_is_vipt_aliasing()) mapping = folio_flush_mapping(folio); else From c95346ac918c5badf51b9a7ac58a26d3bd5bb224 Mon Sep 17 00:00:00 2001 From: Andrew Price Date: Mon, 11 Mar 2024 16:40:36 +0100 Subject: [PATCH 149/496] gfs2: Fix invalid metadata access in punch_hole In punch_hole(), when the offset lies in the final block for a given height, there is no hole to punch, but the maximum size check fails to detect that. Consequently, punch_hole() will try to punch a hole beyond the end of the metadata and fail. Fix the maximum size check. Signed-off-by: Andrew Price Signed-off-by: Andreas Gruenbacher --- fs/gfs2/bmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f1..643175498d1c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1718,7 +1718,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) struct buffer_head *dibh, *bh; struct gfs2_holder rd_gh; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; - u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; + unsigned int bsize = 1 << bsize_shift; + u64 lblock = (offset + bsize - 1) >> bsize_shift; __u16 start_list[GFS2_MAX_META_HEIGHT]; __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; unsigned int start_aligned, end_aligned; @@ -1729,7 +1730,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) u64 prev_bnr = 0; __be64 *start, *end; - if (offset >= maxsize) { + if (offset + bsize - 1 >= maxsize) { /* * The starting point lies beyond the allocated metadata; * there are no blocks to deallocate. From 6f0974eccbf78baead1735722c4f1ee3eb9422cd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Mar 2024 13:30:43 -0600 Subject: [PATCH 150/496] io_uring: don't save/restore iowait state This kind of state is per-syscall, and since we're doing the waiting off entering the io_uring_enter(2) syscall, there's no way that iowait can already be set for this case. Simplify it by setting it if we need to, and always clearing it to 0 when done. 
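To make the resulting pattern concrete, here is a minimal userspace sketch of the before/after behaviour; plain variables stand in for current->in_iowait and current_pending_io(), so this is an illustration of the idea rather than the io_uring code itself (the real change is in the diff below):

#include <assert.h>
#include <stdbool.h>

static bool in_iowait;             /* stands in for current->in_iowait */

static bool current_pending_io(void)
{
	return true;               /* assume pending IO for the example */
}

static void cqring_wait(void)
{
	/* On syscall entry the flag can never already be set, so there is
	 * nothing to save: set it when IO is pending, and always clear it
	 * once the wait is over. */
	if (current_pending_io())
		in_iowait = true;

	/* ... schedule() / schedule_hrtimeout() would sleep here ... */

	in_iowait = false;
}

int main(void)
{
	cqring_wait();
	assert(!in_iowait);        /* always left clear, no save/restore */
	return 0;
}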
Fixes: 7b72d661f1f2 ("io_uring: gate iowait schedule on having pending requests") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cf348c33f485..49a124daa359 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2539,7 +2539,7 @@ static bool current_pending_io(void) static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { - int io_wait, ret; + int ret; if (unlikely(READ_ONCE(ctx->check_cq))) return 1; @@ -2557,7 +2557,6 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. */ - io_wait = current->in_iowait; if (current_pending_io()) current->in_iowait = 1; ret = 0; @@ -2565,7 +2564,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) ret = -ETIME; - current->in_iowait = io_wait; + current->in_iowait = 0; return ret; } From d4f4a361c4eadbc992dfb9667c9a73bb879f4458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:23 +0100 Subject: [PATCH 151/496] i2c: nomadik: simplify IRQ masking logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IRQ_MASK and I2C_CLEAR_ALL_INTS both mask available interrupts. IRQ_MASK removes top options (bits 29-31). I2C_CLEAR_ALL_INTS removes reserved options including top bits. Keep the latter. 31 29 27 25 23 21 19 17 15 13 11 09 07 05 03 01 30 28 26 24 22 20 18 16 14 12 10 08 06 04 02 00 -- IRQ_MASK: --------------------------------------------------- 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 -- I2C_CLEAR_ALL_INTS: ----------------------------------------- 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 Notice I2C_CLEAR_ALL_INTS is more restrictive than IRQ_MASK. 
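The redundancy can be verified directly from the two constants quoted above; a standalone userspace check, using the values as they appear in the driver:

#include <assert.h>

#define IRQ_MASK(mask)      ((mask) & 0x1fffffff)  /* drops the three top bits (29-31) */
#define I2C_CLEAR_ALL_INTS  0x131f007fu            /* drops all reserved bits, incl. 29-31 */

int main(void)
{
	/* I2C_CLEAR_ALL_INTS already has bits 29-31 clear, so applying
	 * IRQ_MASK() on top of it changes nothing: */
	assert(IRQ_MASK(I2C_CLEAR_ALL_INTS) == I2C_CLEAR_ALL_INTS);
	return 0;
}

Hence masking with I2C_CLEAR_ALL_INTS alone is sufficient, which is exactly what the patch keeps.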
Reviewed-by: Linus Walleij Reviewed-by: Andi Shyti Signed-off-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index cd511c884f99..80bdf7e42613 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -94,9 +94,6 @@ /* some bits in ICR are reserved */ #define I2C_CLEAR_ALL_INTS 0x131f007f -/* first three msb bits are reserved */ -#define IRQ_MASK(mask) (mask & 0x1fffffff) - /* maximum threshold value */ #define MAX_I2C_FIFO_THRESHOLD 15 @@ -249,8 +246,7 @@ static int flush_i2c_fifo(struct nmk_i2c_dev *priv) */ static void disable_all_interrupts(struct nmk_i2c_dev *priv) { - u32 mask = IRQ_MASK(0); - writel(mask, priv->virtbase + I2C_IMSCR); + writel(0, priv->virtbase + I2C_IMSCR); } /** @@ -259,9 +255,7 @@ static void disable_all_interrupts(struct nmk_i2c_dev *priv) */ static void clear_all_interrupts(struct nmk_i2c_dev *priv) { - u32 mask; - mask = IRQ_MASK(I2C_CLEAR_ALL_INTS); - writel(mask, priv->virtbase + I2C_ICR); + writel(I2C_CLEAR_ALL_INTS, priv->virtbase + I2C_ICR); } /** @@ -468,7 +462,7 @@ static int read_i2c(struct nmk_i2c_dev *priv, u16 flags) else irq_mask |= I2C_IT_MTDWS; - irq_mask = I2C_CLEAR_ALL_INTS & IRQ_MASK(irq_mask); + irq_mask &= I2C_CLEAR_ALL_INTS; writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, priv->virtbase + I2C_IMSCR); @@ -547,7 +541,7 @@ static int write_i2c(struct nmk_i2c_dev *priv, u16 flags) else irq_mask |= I2C_IT_MTDWS; - irq_mask = I2C_CLEAR_ALL_INTS & IRQ_MASK(irq_mask); + irq_mask &= I2C_CLEAR_ALL_INTS; writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, priv->virtbase + I2C_IMSCR); @@ -703,8 +697,8 @@ static int nmk_i2c_xfer(struct i2c_adapter *i2c_adap, */ static int disable_interrupts(struct nmk_i2c_dev *priv, u32 irq) { - irq = IRQ_MASK(irq); - writel(readl(priv->virtbase + I2C_IMSCR) & ~(I2C_CLEAR_ALL_INTS & irq), + irq &= I2C_CLEAR_ALL_INTS; + writel(readl(priv->virtbase + I2C_IMSCR) & ~irq, priv->virtbase + I2C_IMSCR); return 0; } From a9f5cd892354425dcba67692e0afe3af6a2b4b1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:24 +0100 Subject: [PATCH 152/496] i2c: nomadik: use bitops helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constant register bit fields are declared using hardcoded hex values; replace them by calls to BIT() and GENMASK(). Replace custom GEN_MASK() macro by the generic FIELD_PREP(). Replace manual bit manipulations by the generic FIELD_GET() macro. 
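For illustration, a userspace sketch of the equivalence; GENMASK()/FIELD_PREP()/FIELD_GET() here are simplified 32-bit stand-ins for the kernel macros, and the I2C_MCR_AM field (bits 13:12) serves as the example:

#include <assert.h>

#define GENMASK(h, l)        ((~0u >> (31 - (h))) & (~0u << (l)))
#define FIELD_PREP(mask, v)  (((v) << __builtin_ctz(mask)) & (mask))
#define FIELD_GET(mask, r)   (((r) & (mask)) >> __builtin_ctz(mask))

/* the driver's old helper */
#define GEN_MASK(val, mask, sb)  (((val) << (sb)) & (mask))

#define I2C_MCR_AM  GENMASK(13, 12)   /* address type field */

int main(void)
{
	unsigned int mcr_old = GEN_MASK(2, I2C_MCR_AM, 12);  /* hardcoded shift */
	unsigned int mcr_new = FIELD_PREP(I2C_MCR_AM, 2);    /* shift derived from the mask */

	assert(mcr_old == mcr_new);
	assert(FIELD_GET(I2C_MCR_AM, mcr_new) == 2);
	return 0;
}

The benefit is that the shift count lives in exactly one place, the mask definition, instead of being repeated at every use site.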
Reviewed-by: Linus Walleij Signed-off-by: Théo Lebrun Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 161 +++++++++++++++++-------------- 1 file changed, 88 insertions(+), 73 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 80bdf7e42613..63ce70f763a8 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -9,6 +9,7 @@ * Author: Srinidhi Kasagar * Author: Sachin Verma */ +#include #include #include #include @@ -42,54 +43,59 @@ #define I2C_ICR (0x038) /* Control registers */ -#define I2C_CR_PE (0x1 << 0) /* Peripheral Enable */ -#define I2C_CR_OM (0x3 << 1) /* Operating mode */ -#define I2C_CR_SAM (0x1 << 3) /* Slave addressing mode */ -#define I2C_CR_SM (0x3 << 4) /* Speed mode */ -#define I2C_CR_SGCM (0x1 << 6) /* Slave general call mode */ -#define I2C_CR_FTX (0x1 << 7) /* Flush Transmit */ -#define I2C_CR_FRX (0x1 << 8) /* Flush Receive */ -#define I2C_CR_DMA_TX_EN (0x1 << 9) /* DMA Tx enable */ -#define I2C_CR_DMA_RX_EN (0x1 << 10) /* DMA Rx Enable */ -#define I2C_CR_DMA_SLE (0x1 << 11) /* DMA sync. logic enable */ -#define I2C_CR_LM (0x1 << 12) /* Loopback mode */ -#define I2C_CR_FON (0x3 << 13) /* Filtering on */ -#define I2C_CR_FS (0x3 << 15) /* Force stop enable */ +#define I2C_CR_PE BIT(0) /* Peripheral Enable */ +#define I2C_CR_OM GENMASK(2, 1) /* Operating mode */ +#define I2C_CR_SAM BIT(3) /* Slave addressing mode */ +#define I2C_CR_SM GENMASK(5, 4) /* Speed mode */ +#define I2C_CR_SGCM BIT(6) /* Slave general call mode */ +#define I2C_CR_FTX BIT(7) /* Flush Transmit */ +#define I2C_CR_FRX BIT(8) /* Flush Receive */ +#define I2C_CR_DMA_TX_EN BIT(9) /* DMA Tx enable */ +#define I2C_CR_DMA_RX_EN BIT(10) /* DMA Rx Enable */ +#define I2C_CR_DMA_SLE BIT(11) /* DMA sync. 
logic enable */ +#define I2C_CR_LM BIT(12) /* Loopback mode */ +#define I2C_CR_FON GENMASK(14, 13) /* Filtering on */ +#define I2C_CR_FS GENMASK(16, 15) /* Force stop enable */ + +/* Slave control register (SCR) */ +#define I2C_SCR_SLSU GENMASK(31, 16) /* Slave data setup time */ /* Master controller (MCR) register */ -#define I2C_MCR_OP (0x1 << 0) /* Operation */ -#define I2C_MCR_A7 (0x7f << 1) /* 7-bit address */ -#define I2C_MCR_EA10 (0x7 << 8) /* 10-bit Extended address */ -#define I2C_MCR_SB (0x1 << 11) /* Extended address */ -#define I2C_MCR_AM (0x3 << 12) /* Address type */ -#define I2C_MCR_STOP (0x1 << 14) /* Stop condition */ -#define I2C_MCR_LENGTH (0x7ff << 15) /* Transaction length */ +#define I2C_MCR_OP BIT(0) /* Operation */ +#define I2C_MCR_A7 GENMASK(7, 1) /* 7-bit address */ +#define I2C_MCR_EA10 GENMASK(10, 8) /* 10-bit Extended address */ +#define I2C_MCR_SB BIT(11) /* Extended address */ +#define I2C_MCR_AM GENMASK(13, 12) /* Address type */ +#define I2C_MCR_STOP BIT(14) /* Stop condition */ +#define I2C_MCR_LENGTH GENMASK(25, 15) /* Transaction length */ /* Status register (SR) */ -#define I2C_SR_OP (0x3 << 0) /* Operation */ -#define I2C_SR_STATUS (0x3 << 2) /* controller status */ -#define I2C_SR_CAUSE (0x7 << 4) /* Abort cause */ -#define I2C_SR_TYPE (0x3 << 7) /* Receive type */ -#define I2C_SR_LENGTH (0x7ff << 9) /* Transfer length */ +#define I2C_SR_OP GENMASK(1, 0) /* Operation */ +#define I2C_SR_STATUS GENMASK(3, 2) /* controller status */ +#define I2C_SR_CAUSE GENMASK(6, 4) /* Abort cause */ +#define I2C_SR_TYPE GENMASK(8, 7) /* Receive type */ +#define I2C_SR_LENGTH GENMASK(19, 9) /* Transfer length */ + +/* Baud-rate counter register (BRCR) */ +#define I2C_BRCR_BRCNT1 GENMASK(31, 16) /* Baud-rate counter 1 */ +#define I2C_BRCR_BRCNT2 GENMASK(15, 0) /* Baud-rate counter 2 */ /* Interrupt mask set/clear (IMSCR) bits */ -#define I2C_IT_TXFE (0x1 << 0) -#define I2C_IT_TXFNE (0x1 << 1) -#define I2C_IT_TXFF (0x1 << 2) -#define I2C_IT_TXFOVR (0x1 << 3) -#define I2C_IT_RXFE (0x1 << 4) -#define I2C_IT_RXFNF (0x1 << 5) -#define I2C_IT_RXFF (0x1 << 6) -#define I2C_IT_RFSR (0x1 << 16) -#define I2C_IT_RFSE (0x1 << 17) -#define I2C_IT_WTSR (0x1 << 18) -#define I2C_IT_MTD (0x1 << 19) -#define I2C_IT_STD (0x1 << 20) -#define I2C_IT_MAL (0x1 << 24) -#define I2C_IT_BERR (0x1 << 25) -#define I2C_IT_MTDWS (0x1 << 28) - -#define GEN_MASK(val, mask, sb) (((val) << (sb)) & (mask)) +#define I2C_IT_TXFE BIT(0) +#define I2C_IT_TXFNE BIT(1) +#define I2C_IT_TXFF BIT(2) +#define I2C_IT_TXFOVR BIT(3) +#define I2C_IT_RXFE BIT(4) +#define I2C_IT_RXFNF BIT(5) +#define I2C_IT_RXFF BIT(6) +#define I2C_IT_RFSR BIT(16) +#define I2C_IT_RFSE BIT(17) +#define I2C_IT_WTSR BIT(18) +#define I2C_IT_MTD BIT(19) +#define I2C_IT_STD BIT(20) +#define I2C_IT_MAL BIT(24) +#define I2C_IT_BERR BIT(25) +#define I2C_IT_MTDWS BIT(28) /* some bits in ICR are reserved */ #define I2C_CLEAR_ALL_INTS 0x131f007f @@ -128,6 +134,12 @@ enum i2c_operation { I2C_READ = 0x01 }; +enum i2c_operating_mode { + I2C_OM_SLAVE, + I2C_OM_MASTER, + I2C_OM_MASTER_OR_SLAVE, +}; + /** * struct i2c_nmk_client - client specific data * @slave_adr: 7-bit slave address @@ -284,7 +296,10 @@ exit: } /* enable peripheral, master mode operation */ -#define DEFAULT_I2C_REG_CR ((1 << 1) | I2C_CR_PE) +#define DEFAULT_I2C_REG_CR (FIELD_PREP(I2C_CR_OM, I2C_OM_MASTER) | I2C_CR_PE) + +/* grab top three bits from extended I2C addresses */ +#define ADR_3MSB_BITS GENMASK(9, 7) /** * load_i2c_mcr_reg() - load the MCR register @@ -296,41 +311,42 @@ static 
u32 load_i2c_mcr_reg(struct nmk_i2c_dev *priv, u16 flags) u32 mcr = 0; unsigned short slave_adr_3msb_bits; - mcr |= GEN_MASK(priv->cli.slave_adr, I2C_MCR_A7, 1); + mcr |= FIELD_PREP(I2C_MCR_A7, priv->cli.slave_adr); if (unlikely(flags & I2C_M_TEN)) { /* 10-bit address transaction */ - mcr |= GEN_MASK(2, I2C_MCR_AM, 12); + mcr |= FIELD_PREP(I2C_MCR_AM, 2); /* * Get the top 3 bits. * EA10 represents extended address in MCR. This includes * the extension (MSB bits) of the 7 bit address loaded * in A7 */ - slave_adr_3msb_bits = (priv->cli.slave_adr >> 7) & 0x7; + slave_adr_3msb_bits = FIELD_GET(ADR_3MSB_BITS, + priv->cli.slave_adr); - mcr |= GEN_MASK(slave_adr_3msb_bits, I2C_MCR_EA10, 8); + mcr |= FIELD_PREP(I2C_MCR_EA10, slave_adr_3msb_bits); } else { /* 7-bit address transaction */ - mcr |= GEN_MASK(1, I2C_MCR_AM, 12); + mcr |= FIELD_PREP(I2C_MCR_AM, 1); } /* start byte procedure not applied */ - mcr |= GEN_MASK(0, I2C_MCR_SB, 11); + mcr |= FIELD_PREP(I2C_MCR_SB, 0); /* check the operation, master read/write? */ if (priv->cli.operation == I2C_WRITE) - mcr |= GEN_MASK(I2C_WRITE, I2C_MCR_OP, 0); + mcr |= FIELD_PREP(I2C_MCR_OP, I2C_WRITE); else - mcr |= GEN_MASK(I2C_READ, I2C_MCR_OP, 0); + mcr |= FIELD_PREP(I2C_MCR_OP, I2C_READ); /* stop or repeated start? */ if (priv->stop) - mcr |= GEN_MASK(1, I2C_MCR_STOP, 14); + mcr |= FIELD_PREP(I2C_MCR_STOP, 1); else - mcr &= ~(GEN_MASK(1, I2C_MCR_STOP, 14)); + mcr &= ~FIELD_PREP(I2C_MCR_STOP, 1); - mcr |= GEN_MASK(priv->cli.count, I2C_MCR_LENGTH, 15); + mcr |= FIELD_PREP(I2C_MCR_LENGTH, priv->cli.count); return mcr; } @@ -383,7 +399,7 @@ static void setup_i2c_controller(struct nmk_i2c_dev *priv) slsu += 1; dev_dbg(&priv->adev->dev, "calculated SLSU = %04x\n", slsu); - writel(slsu << 16, priv->virtbase + I2C_SCR); + writel(FIELD_PREP(I2C_SCR_SLSU, slsu), priv->virtbase + I2C_SCR); /* * The spec says, in case of std. mode the divider is @@ -399,8 +415,8 @@ static void setup_i2c_controller(struct nmk_i2c_dev *priv) * plus operation. Currently we do not supprt high speed mode * so set brcr1 to 0. */ - brcr1 = 0 << 16; - brcr2 = (i2c_clk / (priv->clk_freq * div)) & 0xffff; + brcr1 = FIELD_PREP(I2C_BRCR_BRCNT1, 0); + brcr2 = FIELD_PREP(I2C_BRCR_BRCNT2, i2c_clk / (priv->clk_freq * div)); /* set the baud rate counter register */ writel((brcr1 | brcr2), priv->virtbase + I2C_BRCR); @@ -414,12 +430,13 @@ static void setup_i2c_controller(struct nmk_i2c_dev *priv) if (priv->sm > I2C_FREQ_MODE_FAST) { dev_err(&priv->adev->dev, "do not support this mode defaulting to std. mode\n"); - brcr2 = i2c_clk / (I2C_MAX_STANDARD_MODE_FREQ * 2) & 0xffff; + brcr2 = FIELD_PREP(I2C_BRCR_BRCNT2, + i2c_clk / (I2C_MAX_STANDARD_MODE_FREQ * 2)); writel((brcr1 | brcr2), priv->virtbase + I2C_BRCR); - writel(I2C_FREQ_MODE_STANDARD << 4, - priv->virtbase + I2C_CR); + writel(FIELD_PREP(I2C_CR_SM, I2C_FREQ_MODE_STANDARD), + priv->virtbase + I2C_CR); } - writel(priv->sm << 4, priv->virtbase + I2C_CR); + writel(FIELD_PREP(I2C_CR_SM, priv->sm), priv->virtbase + I2C_CR); /* set the Tx and Rx FIFO threshold */ writel(priv->tft, priv->virtbase + I2C_TFTR); @@ -583,13 +600,8 @@ static int nmk_i2c_xfer_one(struct nmk_i2c_dev *priv, u16 flags) u32 cause; i2c_sr = readl(priv->virtbase + I2C_SR); - /* - * Check if the controller I2C operation status - * is set to ABORT(11b). 
- */ - if (((i2c_sr >> 2) & 0x3) == 0x3) { - /* get the abort cause */ - cause = (i2c_sr >> 4) & 0x7; + if (FIELD_GET(I2C_SR_STATUS, i2c_sr) == I2C_ABORT) { + cause = FIELD_GET(I2C_SR_CAUSE, i2c_sr); dev_err(&priv->adev->dev, "%s\n", cause >= ARRAY_SIZE(abort_causes) ? "unknown reason" : @@ -730,7 +742,7 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) misr = readl(priv->virtbase + I2C_MISR); src = __ffs(misr); - switch ((1 << src)) { + switch (BIT(src)) { /* Transmit FIFO nearly empty interrupt */ case I2C_IT_TXFNE: @@ -824,15 +836,18 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) * during the transaction. */ case I2C_IT_BERR: + { + u32 sr; + + sr = readl(priv->virtbase + I2C_SR); priv->result = -EIO; - /* get the status */ - if (((readl(priv->virtbase + I2C_SR) >> 2) & 0x3) == I2C_ABORT) + if (FIELD_GET(I2C_SR_STATUS, sr) == I2C_ABORT) init_hw(priv); i2c_set_bit(priv->virtbase + I2C_ICR, I2C_IT_BERR); complete(&priv->xfer_complete); - - break; + } + break; /* * Tx FIFO overrun interrupt. From 7489cd43a2ea26c963c422c08e943a4a616fdb15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:25 +0100 Subject: [PATCH 153/496] i2c: nomadik: support short xfer timeouts using waitqueue & hrtimer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the completion by a waitqueue for synchronization from IRQ handler to task. For short timeouts, use hrtimers, else use timers. Usecase: avoid blocking the I2C bus for too long when an issue occurs. The threshold picked is one jiffy: if timeout is below that, use hrtimers. This threshold is NOT configurable. Implement behavior but do NOT change fetching of timeout. This means the timeout is unchanged (200ms) and the hrtimer case will never trigger. A waitqueue is used because it supports both desired timeout approaches. See wait_event_timeout() and wait_event_hrtimeout(). An atomic boolean serves as synchronization condition. Reviewed-by: Linus Walleij Reviewed-by: Wolfram Sang Reviewed-by: Andi Shyti Signed-off-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 70 ++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 63ce70f763a8..2ba7d082e205 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -168,10 +168,11 @@ struct i2c_nmk_client { * @clk_freq: clock frequency for the operation mode * @tft: Tx FIFO Threshold in bytes * @rft: Rx FIFO Threshold in bytes - * @timeout: Slave response timeout (ms) + * @timeout_usecs: Slave response timeout * @sm: speed mode * @stop: stop condition. - * @xfer_complete: acknowledge completion for a I2C message. + * @xfer_wq: xfer done wait queue. + * @xfer_done: xfer done boolean. * @result: controller propogated result. 
*/ struct nmk_i2c_dev { @@ -185,10 +186,11 @@ struct nmk_i2c_dev { u32 clk_freq; unsigned char tft; unsigned char rft; - int timeout; + u32 timeout_usecs; enum i2c_freq_mode sm; int stop; - struct completion xfer_complete; + struct wait_queue_head xfer_wq; + bool xfer_done; int result; }; @@ -443,6 +445,22 @@ static void setup_i2c_controller(struct nmk_i2c_dev *priv) writel(priv->rft, priv->virtbase + I2C_RFTR); } +static bool nmk_i2c_wait_xfer_done(struct nmk_i2c_dev *priv) +{ + if (priv->timeout_usecs < jiffies_to_usecs(1)) { + unsigned long timeout_usecs = priv->timeout_usecs; + ktime_t timeout = ktime_set(0, timeout_usecs * NSEC_PER_USEC); + + wait_event_hrtimeout(priv->xfer_wq, priv->xfer_done, timeout); + } else { + unsigned long timeout = usecs_to_jiffies(priv->timeout_usecs); + + wait_event_timeout(priv->xfer_wq, priv->xfer_done, timeout); + } + + return priv->xfer_done; +} + /** * read_i2c() - Read from I2C client device * @priv: private data of I2C Driver @@ -454,9 +472,9 @@ static void setup_i2c_controller(struct nmk_i2c_dev *priv) */ static int read_i2c(struct nmk_i2c_dev *priv, u16 flags) { - int status = 0; u32 mcr, irq_mask; - unsigned long timeout; + int status = 0; + bool xfer_done; mcr = load_i2c_mcr_reg(priv, flags); writel(mcr, priv->virtbase + I2C_MCR); @@ -468,7 +486,8 @@ static int read_i2c(struct nmk_i2c_dev *priv, u16 flags) /* enable the controller */ i2c_set_bit(priv->virtbase + I2C_CR, I2C_CR_PE); - init_completion(&priv->xfer_complete); + init_waitqueue_head(&priv->xfer_wq); + priv->xfer_done = false; /* enable interrupts by setting the mask */ irq_mask = (I2C_IT_RXFNF | I2C_IT_RXFF | @@ -484,10 +503,9 @@ static int read_i2c(struct nmk_i2c_dev *priv, u16 flags) writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, priv->virtbase + I2C_IMSCR); - timeout = wait_for_completion_timeout( - &priv->xfer_complete, priv->adap.timeout); + xfer_done = nmk_i2c_wait_xfer_done(priv); - if (timeout == 0) { + if (!xfer_done) { /* Controller timed out */ dev_err(&priv->adev->dev, "read from slave 0x%x timed out\n", priv->cli.slave_adr); @@ -522,9 +540,9 @@ static void fill_tx_fifo(struct nmk_i2c_dev *priv, int no_bytes) */ static int write_i2c(struct nmk_i2c_dev *priv, u16 flags) { - u32 status = 0; u32 mcr, irq_mask; - unsigned long timeout; + u32 status = 0; + bool xfer_done; mcr = load_i2c_mcr_reg(priv, flags); @@ -537,7 +555,8 @@ static int write_i2c(struct nmk_i2c_dev *priv, u16 flags) /* enable the controller */ i2c_set_bit(priv->virtbase + I2C_CR, I2C_CR_PE); - init_completion(&priv->xfer_complete); + init_waitqueue_head(&priv->xfer_wq); + priv->xfer_done = false; /* enable interrupts by settings the masks */ irq_mask = (I2C_IT_TXFOVR | I2C_IT_MAL | I2C_IT_BERR); @@ -563,10 +582,9 @@ static int write_i2c(struct nmk_i2c_dev *priv, u16 flags) writel(readl(priv->virtbase + I2C_IMSCR) | irq_mask, priv->virtbase + I2C_IMSCR); - timeout = wait_for_completion_timeout( - &priv->xfer_complete, priv->adap.timeout); + xfer_done = nmk_i2c_wait_xfer_done(priv); - if (timeout == 0) { + if (!xfer_done) { /* Controller timed out */ dev_err(&priv->adev->dev, "write to slave 0x%x timed out\n", priv->cli.slave_adr); @@ -816,7 +834,9 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) priv->cli.count); init_hw(priv); } - complete(&priv->xfer_complete); + priv->xfer_done = true; + wake_up(&priv->xfer_wq); + break; @@ -826,7 +846,9 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) init_hw(priv); i2c_set_bit(priv->virtbase + I2C_ICR, I2C_IT_MAL); - complete(&priv->xfer_complete); 
+ priv->xfer_done = true; + wake_up(&priv->xfer_wq); + break; @@ -845,7 +867,9 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) init_hw(priv); i2c_set_bit(priv->virtbase + I2C_ICR, I2C_IT_BERR); - complete(&priv->xfer_complete); + priv->xfer_done = true; + wake_up(&priv->xfer_wq); + } break; @@ -859,7 +883,9 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) init_hw(priv); dev_err(dev, "Tx Fifo Over run\n"); - complete(&priv->xfer_complete); + priv->xfer_done = true; + wake_up(&priv->xfer_wq); + break; @@ -960,7 +986,7 @@ static void nmk_i2c_of_probe(struct device_node *np, priv->sm = I2C_FREQ_MODE_FAST; priv->tft = 1; /* Tx FIFO threshold */ priv->rft = 8; /* Rx FIFO threshold */ - priv->timeout = 200; /* Slave response timeout(ms) */ + priv->timeout_usecs = 200 * USEC_PER_MSEC; /* Slave response timeout */ } static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) @@ -1020,7 +1046,7 @@ static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) adap->owner = THIS_MODULE; adap->class = I2C_CLASS_DEPRECATED; adap->algo = &nmk_i2c_algo; - adap->timeout = msecs_to_jiffies(priv->timeout); + adap->timeout = usecs_to_jiffies(priv->timeout_usecs); snprintf(adap->name, sizeof(adap->name), "Nomadik I2C at %pR", &adev->res); From c763072ab4533f0669b19c60d8a18b31f2877164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:26 +0100 Subject: [PATCH 154/496] i2c: nomadik: replace jiffies by ktime for FIFO flushing timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FIFO flush function uses a jiffies amount to detect timeouts as the flushing is async. Replace with ktime to get more accurate precision and support short timeouts. Reviewed-by: Linus Walleij Reviewed-by: Wolfram Sang Reviewed-by: Andi Shyti Signed-off-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 2ba7d082e205..020beb8ffa17 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -225,8 +225,8 @@ static inline void i2c_clr_bit(void __iomem *reg, u32 mask) static int flush_i2c_fifo(struct nmk_i2c_dev *priv) { #define LOOP_ATTEMPTS 10 + ktime_t timeout; int i; - unsigned long timeout; /* * flush the transmit and receive FIFO. The flushing @@ -238,9 +238,9 @@ static int flush_i2c_fifo(struct nmk_i2c_dev *priv) writel((I2C_CR_FTX | I2C_CR_FRX), priv->virtbase + I2C_CR); for (i = 0; i < LOOP_ATTEMPTS; i++) { - timeout = jiffies + priv->adap.timeout; + timeout = ktime_add_us(ktime_get(), priv->timeout_usecs); - while (!time_after(jiffies, timeout)) { + while (ktime_after(timeout, ktime_get())) { if ((readl(priv->virtbase + I2C_CR) & (I2C_CR_FTX | I2C_CR_FRX)) == 0) return 0; From ec189b9fb83cd1c869d45bfe3c9cb27b71093a16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:27 +0100 Subject: [PATCH 155/496] i2c: nomadik: fetch i2c-transfer-timeout-us property from devicetree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow overriding the default timeout value (200ms) from devicetree, using the generic i2c-transfer-timeout-us property. The i2c_adapter->timeout field is an unaccurate jiffies amount; i2c-nomadik uses hrtimers for timeouts below one jiffy. 
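A small userspace model shows when the sub-jiffy (hrtimer) path from the earlier patch is taken for a given property value; HZ=100 and the sample timeouts are assumptions chosen only for the example:

#include <stdio.h>

#define HZ 100   /* assumed for the example; the real value is config-dependent */

static unsigned long jiffies_to_usecs(unsigned long j)
{
	return j * (1000000UL / HZ);
}

int main(void)
{
	unsigned long timeouts_us[] = { 200, 5000, 200000 };  /* candidate i2c-transfer-timeout-us values */

	for (int i = 0; i < 3; i++) {
		const char *path = timeouts_us[i] < jiffies_to_usecs(1)
			? "wait_event_hrtimeout (sub-jiffy)"
			: "wait_event_timeout (jiffies)";
		printf("%6lu us -> %s\n", timeouts_us[i], path);
	}
	return 0;
}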
Reviewed-by: Linus Walleij Reviewed-by: Wolfram Sang Reviewed-by: Andi Shyti Signed-off-by: Théo Lebrun Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 020beb8ffa17..2e738b18677e 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -975,6 +975,8 @@ static const struct i2c_algorithm nmk_i2c_algo = { static void nmk_i2c_of_probe(struct device_node *np, struct nmk_i2c_dev *priv) { + u32 timeout_usecs; + /* Default to 100 kHz if no frequency is given in the node */ if (of_property_read_u32(np, "clock-frequency", &priv->clk_freq)) priv->clk_freq = I2C_MAX_STANDARD_MODE_FREQ; @@ -986,7 +988,12 @@ static void nmk_i2c_of_probe(struct device_node *np, priv->sm = I2C_FREQ_MODE_FAST; priv->tft = 1; /* Tx FIFO threshold */ priv->rft = 8; /* Rx FIFO threshold */ - priv->timeout_usecs = 200 * USEC_PER_MSEC; /* Slave response timeout */ + + /* Slave response timeout */ + if (!of_property_read_u32(np, "i2c-transfer-timeout-us", &timeout_usecs)) + priv->timeout_usecs = timeout_usecs; + else + priv->timeout_usecs = 200 * USEC_PER_MSEC; } static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) From 7d4c57abb9284030dc58ffac4fb76eb12d3d947c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:28 +0100 Subject: [PATCH 156/496] i2c: nomadik: support Mobileye EyeQ5 I2C controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add compatible for the integration of the same DB8500 IP block into the Mobileye EyeQ5 platform. Two quirks are present: - The memory bus only supports 32-bit accesses. Avoid writeb() and readb() by introducing helper functions that fallback to writel() and readl(). - A register must be configured for the I2C speed mode; it is located in a shared register region called OLB. We access that memory region using a syscon & regmap that gets passed as a phandle (mobileye,olb). A two-bit enum per controller is written into the register; that requires us to know the global index of the I2C controller (cell arg to the mobileye,olb phandle). We add #include and . Reviewed-by: Linus Walleij Signed-off-by: Théo Lebrun Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 90 ++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 2e738b18677e..0c2b1cbfad0a 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -6,6 +6,12 @@ * I2C master mode controller driver, used in Nomadik 8815 * and Ux500 platforms. * + * The Mobileye EyeQ5 platform is also supported; it uses + * the same Ux500/DB8500 IP block with two quirks: + * - The memory bus only supports 32-bit accesses. + * - A register must be configured for the I2C speed mode; + * it is located in a shared register region called OLB. 
+ * * Author: Srinidhi Kasagar * Author: Sachin Verma */ @@ -22,6 +28,8 @@ #include #include #include +#include +#include #define DRIVER_NAME "nmk-i2c" @@ -110,6 +118,15 @@ enum i2c_freq_mode { I2C_FREQ_MODE_FAST_PLUS, /* up to 1 Mb/s */ }; +/* Mobileye EyeQ5 offset into a shared register region (called OLB) */ +#define NMK_I2C_EYEQ5_OLB_IOCR2 0x0B8 + +enum i2c_eyeq5_speed { + I2C_EYEQ5_SPEED_FAST, + I2C_EYEQ5_SPEED_FAST_PLUS, + I2C_EYEQ5_SPEED_HIGH_SPEED, +}; + /** * struct i2c_vendor_data - per-vendor variations * @has_mtdws: variant has the MTDWS bit @@ -174,6 +191,7 @@ struct i2c_nmk_client { * @xfer_wq: xfer done wait queue. * @xfer_done: xfer done boolean. * @result: controller propogated result. + * @has_32b_bus: controller is on a bus that only supports 32-bit accesses. */ struct nmk_i2c_dev { struct i2c_vendor_data *vendor; @@ -192,6 +210,7 @@ struct nmk_i2c_dev { struct wait_queue_head xfer_wq; bool xfer_done; int result; + bool has_32b_bus; }; /* controller's abort causes */ @@ -215,6 +234,24 @@ static inline void i2c_clr_bit(void __iomem *reg, u32 mask) writel(readl(reg) & ~mask, reg); } +static inline u8 nmk_i2c_readb(const struct nmk_i2c_dev *priv, + unsigned long reg) +{ + if (priv->has_32b_bus) + return readl(priv->virtbase + reg); + else + return readb(priv->virtbase + reg); +} + +static inline void nmk_i2c_writeb(const struct nmk_i2c_dev *priv, u32 val, + unsigned long reg) +{ + if (priv->has_32b_bus) + writel(val, priv->virtbase + reg); + else + writeb(val, priv->virtbase + reg); +} + /** * flush_i2c_fifo() - This function flushes the I2C FIFO * @priv: private data of I2C Driver @@ -523,7 +560,7 @@ static void fill_tx_fifo(struct nmk_i2c_dev *priv, int no_bytes) (priv->cli.count != 0); count--) { /* write to the Tx FIFO */ - writeb(*priv->cli.buffer, priv->virtbase + I2C_TFR); + nmk_i2c_writeb(priv, *priv->cli.buffer, I2C_TFR); priv->cli.buffer++; priv->cli.count--; priv->cli.xfer_bytes++; @@ -792,7 +829,7 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) case I2C_IT_RXFNF: for (count = rft; count > 0; count--) { /* Read the Rx FIFO */ - *priv->cli.buffer = readb(priv->virtbase + I2C_RFR); + *priv->cli.buffer = nmk_i2c_readb(priv, I2C_RFR); priv->cli.buffer++; } priv->cli.count -= rft; @@ -802,7 +839,7 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) /* Rx FIFO full */ case I2C_IT_RXFF: for (count = MAX_I2C_FIFO_THRESHOLD; count > 0; count--) { - *priv->cli.buffer = readb(priv->virtbase + I2C_RFR); + *priv->cli.buffer = nmk_i2c_readb(priv, I2C_RFR); priv->cli.buffer++; } priv->cli.count -= MAX_I2C_FIFO_THRESHOLD; @@ -818,7 +855,7 @@ static irqreturn_t i2c_irq_handler(int irq, void *arg) if (priv->cli.count == 0) break; *priv->cli.buffer = - readb(priv->virtbase + I2C_RFR); + nmk_i2c_readb(priv, I2C_RFR); priv->cli.buffer++; priv->cli.count--; priv->cli.xfer_bytes++; @@ -996,6 +1033,44 @@ static void nmk_i2c_of_probe(struct device_node *np, priv->timeout_usecs = 200 * USEC_PER_MSEC; } +static const unsigned int nmk_i2c_eyeq5_masks[] = { + GENMASK(5, 4), + GENMASK(7, 6), + GENMASK(9, 8), + GENMASK(11, 10), + GENMASK(13, 12), +}; + +static int nmk_i2c_eyeq5_probe(struct nmk_i2c_dev *priv) +{ + struct device *dev = &priv->adev->dev; + struct device_node *np = dev->of_node; + unsigned int mask, speed_mode; + struct regmap *olb; + unsigned int id; + + priv->has_32b_bus = true; + + olb = syscon_regmap_lookup_by_phandle_args(np, "mobileye,olb", 1, &id); + if (IS_ERR(olb)) + return PTR_ERR(olb); + if (id >= ARRAY_SIZE(nmk_i2c_eyeq5_masks)) + return -ENOENT; + + 
if (priv->clk_freq <= 400000) + speed_mode = I2C_EYEQ5_SPEED_FAST; + else if (priv->clk_freq <= 1000000) + speed_mode = I2C_EYEQ5_SPEED_FAST_PLUS; + else + speed_mode = I2C_EYEQ5_SPEED_HIGH_SPEED; + + mask = nmk_i2c_eyeq5_masks[id]; + regmap_update_bits(olb, NMK_I2C_EYEQ5_OLB_IOCR2, + mask, speed_mode << __fls(mask)); + + return 0; +} + static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) { int ret = 0; @@ -1012,8 +1087,15 @@ static int nmk_i2c_probe(struct amba_device *adev, const struct amba_id *id) priv->vendor = vendor; priv->adev = adev; + priv->has_32b_bus = false; nmk_i2c_of_probe(np, priv); + if (of_device_is_compatible(np, "mobileye,eyeq5-i2c")) { + ret = nmk_i2c_eyeq5_probe(priv); + if (ret) + return dev_err_probe(dev, ret, "failed OLB lookup\n"); + } + if (priv->tft > max_fifo_threshold) { dev_warn(dev, "requested TX FIFO threshold %u, adjusted down to %u\n", priv->tft, max_fifo_threshold); From bb271301b80410592cbe0170b9f6d2f677f68171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 6 Mar 2024 18:59:29 +0100 Subject: [PATCH 157/496] i2c: nomadik: sort includes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort #include statements in i2c-nomadik driver. Signed-off-by: Théo Lebrun Reviewed-by: Linus Walleij Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-nomadik.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index 0c2b1cbfad0a..4f41a3c7824d 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -15,21 +15,21 @@ * Author: Srinidhi Kasagar * Author: Sachin Verma */ -#include -#include -#include #include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include #include -#include +#include +#include #include #include -#include +#include #include +#include #define DRIVER_NAME "nmk-i2c" From fb13b11d53875e28e7fbf0c26b288e4ea676aa9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20R=C3=B6sti?= Date: Mon, 11 Mar 2024 21:17:04 +0000 Subject: [PATCH 158/496] entry: Respect changes to system call number by trace_sys_enter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a probe is registered at the trace_sys_enter() tracepoint, and that probe changes the system call number, the old system call still gets executed. This worked correctly until commit b6ec41346103 ("core/entry: Report syscall correctly for trace and audit"), which removed the re-evaluation of the syscall number after the trace point. Restore the original semantics by re-evaluating the system call number after trace_sys_enter(). The performance impact of this re-evaluation is minimal because it only takes place when a trace point is active, and compared to the actual trace point overhead the read from a cache hot variable is negligible. 
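A toy userspace model of why the number must be re-read after the tracepoint; the names mirror the kernel functions, but this is an illustration rather than the kernel code:

#include <assert.h>

static long tracked_nr;                 /* stands in for the task's syscall number */

static long syscall_get_nr(void)        { return tracked_nr; }

/* A registered probe may rewrite the syscall number, e.g. to redirect it. */
static void trace_sys_enter(long nr)    { (void)nr; tracked_nr = 42; }

int main(void)
{
	long syscall;

	tracked_nr = 1;                 /* syscall requested by userspace */
	syscall = syscall_get_nr();

	trace_sys_enter(syscall);
	syscall = syscall_get_nr();     /* re-evaluate, as the patch restores */

	assert(syscall == 42);          /* the probe's choice is what gets executed */
	return 0;
}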
Fixes: b6ec41346103 ("core/entry: Report syscall correctly for trace and audit") Signed-off-by: André Rösti Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240311211704.7262-1-an.roesti@gmail.com --- kernel/entry/common.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 88cb3c88aaa5..90843cc38588 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -57,8 +57,14 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } syscall_enter_audit(regs, syscall); From 10eb0d3314c59dd0497282b33afabddf607b3050 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 11 Mar 2024 16:25:53 -0600 Subject: [PATCH 159/496] ASoC: dt-bindings: cirrus,cs42l43: Fix 'gpio-ranges' schema 'gpio-ranges' is a phandle-array which is really a matrix. The schema in cirrus,cs42l43 is incomplete as it doesn't define there's only a single entry. Add the outer array constraints that there is a single entry. Signed-off-by: Rob Herring Link: https://msgid.link/r/20240311222554.1940567-1-robh@kernel.org Signed-off-by: Mark Brown --- .../devicetree/bindings/sound/cirrus,cs42l43.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/sound/cirrus,cs42l43.yaml b/Documentation/devicetree/bindings/sound/cirrus,cs42l43.yaml index 7f9d8c7a635a..99a536601cc7 100644 --- a/Documentation/devicetree/bindings/sound/cirrus,cs42l43.yaml +++ b/Documentation/devicetree/bindings/sound/cirrus,cs42l43.yaml @@ -185,11 +185,12 @@ properties: gpio-ranges: items: - - description: A phandle to the CODEC pinctrl node - minimum: 0 - - const: 0 - - const: 0 - - const: 3 + - items: + - description: A phandle to the CODEC pinctrl node + minimum: 0 + - const: 0 + - const: 0 + - const: 3 patternProperties: "-state$": From e4ead3cdfd798092288f3a06b405cf98ded6fa10 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 12 Mar 2024 10:16:38 +0100 Subject: [PATCH 160/496] regulator: core: Propagate the regulator state in case of exclusive get Previously, performing an exclusive get on an already-enabled regulator resulted in inconsistent state initialization between child and parent regulators. While the child's counts were updated, its parent's counters remained unaffected. Consequently, attempting to disable an already-enabled exclusive regulator triggered unbalanced disables warnings from its parent regulator. This commit addresses the issue by propagating the enable state to the parent regulator using a regulator_enable call. This ensures consistent state management across the regulator hierarchy, preventing warnings! 
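As an illustration, a minimal consumer-side sketch of the scenario being
fixed. The supply name "vdd" and the assumption that it is already enabled
when the exclusive get happens are invented for the example; the regulator
consumer API calls are the standard ones.

  struct regulator *vdd;

  vdd = regulator_get_exclusive(dev, "vdd");	/* regulator already on */
  if (IS_ERR(vdd))
  	return PTR_ERR(vdd);

  /*
   * The exclusive get initialises use_count/enable_count to 1. Before
   * this change the parent supply's counters stayed at 0, so the
   * disable below tripped an unbalanced-disables warning on the supply.
   */
  regulator_disable(vdd);
  regulator_put(vdd);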
Signed-off-by: Kory Maincent Link: https://msgid.link/r/20240312091638.1266167-1-kory.maincent@bootlin.com Signed-off-by: Mark Brown --- drivers/regulator/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index d019ca6dee9b..dabac9772741 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2274,6 +2274,17 @@ struct regulator *_regulator_get(struct device *dev, const char *id, if (ret > 0) { rdev->use_count = 1; regulator->enable_count = 1; + + /* Propagate the regulator state to its supply */ + if (rdev->supply) { + ret = regulator_enable(rdev->supply); + if (ret < 0) { + destroy_regulator(regulator); + module_put(rdev->owner); + put_device(&rdev->dev); + return ERR_PTR(ret); + } + } } else { rdev->use_count = 0; regulator->enable_count = 0; From 2ae0ab0143fcc06190713ed81a6486ed0ad3c861 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Tue, 12 Mar 2024 12:20:48 +0100 Subject: [PATCH 161/496] spi: lpspi: Avoid potential use-after-free in probe() fsl_lpspi_probe() is allocating/disposing memory manually with spi_alloc_host()/spi_alloc_target(), but uses devm_spi_register_controller(). In case of error after the latter call the memory will be explicitly freed in the probe function by spi_controller_put() call, but used afterwards by "devm" management outside probe() (spi_unregister_controller() <- devm_spi_unregister() below). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 ... Call trace: kernfs_find_ns kernfs_find_and_get_ns sysfs_remove_group sysfs_remove_groups device_remove_attrs device_del spi_unregister_controller devm_spi_unregister release_nodes devres_release_all really_probe driver_probe_device __device_attach_driver bus_for_each_drv __device_attach device_initial_probe bus_probe_device deferred_probe_work_func process_one_work worker_thread kthread ret_from_fork Fixes: 5314987de5e5 ("spi: imx: add lpspi bus driver") Signed-off-by: Alexander Sverdlin Link: https://msgid.link/r/20240312112050.2503643-1-alexander.sverdlin@siemens.com Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-lpspi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-fsl-lpspi.c b/drivers/spi/spi-fsl-lpspi.c index 11991eb12636..079035db7dd8 100644 --- a/drivers/spi/spi-fsl-lpspi.c +++ b/drivers/spi/spi-fsl-lpspi.c @@ -830,11 +830,11 @@ static int fsl_lpspi_probe(struct platform_device *pdev) is_target = of_property_read_bool((&pdev->dev)->of_node, "spi-slave"); if (is_target) - controller = spi_alloc_target(&pdev->dev, - sizeof(struct fsl_lpspi_data)); + controller = devm_spi_alloc_target(&pdev->dev, + sizeof(struct fsl_lpspi_data)); else - controller = spi_alloc_host(&pdev->dev, - sizeof(struct fsl_lpspi_data)); + controller = devm_spi_alloc_host(&pdev->dev, + sizeof(struct fsl_lpspi_data)); if (!controller) return -ENOMEM; From aa0162dc0dd95c3bf248e3c78068760094e8f64b Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Mon, 11 Mar 2024 23:53:17 +0100 Subject: [PATCH 162/496] spi: Restore delays for non-GPIO chip select SPI controller with integrated chip select handling still need to adhere to SPI device's CS setup, hold and inactive delays. For controller without set_cs_timing spi core shall handle the delays to avoid duplicated delay handling in each controller driver. Fixes a regression for the out of tree SPI controller and SPI HID transport on Apple M1/M1 Pro/Max notebooks. 
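For context, a sketch of how a peripheral driver ends up relying on these
core-managed delays. The driver name and the 10 us figures are invented;
the spi_delay fields and spi_setup() are the standard client API.

  static int foo_spi_probe(struct spi_device *spi)
  {
  	/* Ask for 10 us of chip-select setup and hold time. */
  	spi->cs_setup = (struct spi_delay){ .value = 10, .unit = SPI_DELAY_UNIT_USECS };
  	spi->cs_hold  = (struct spi_delay){ .value = 10, .unit = SPI_DELAY_UNIT_USECS };

  	/*
  	 * With this fix the core executes these delays even when the
  	 * controller drives its native (non-GPIO) chip select and has no
  	 * set_cs_timing callback.
  	 */
  	return spi_setup(spi);
  }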
Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core") Signed-off-by: Janne Grunau Link: https://msgid.link/r/20240311-spi-cs-delays-regression-v1-1-0075020a90b2@jannau.net Signed-off-by: Mark Brown --- drivers/spi/spi.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f18738ae95f8..ff75838c1b5d 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1063,10 +1063,14 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) if (spi->mode & SPI_CS_HIGH) enable = !enable; - if (spi_is_csgpiod(spi)) { - if (!spi->controller->set_cs_timing && !activate) - spi_delay_exec(&spi->cs_hold, NULL); + /* + * Handle chip select delays for GPIO based CS or controllers without + * programmable chip select timing. + */ + if ((spi_is_csgpiod(spi) || !spi->controller->set_cs_timing) && !activate) + spi_delay_exec(&spi->cs_hold, NULL); + if (spi_is_csgpiod(spi)) { if (!(spi->mode & SPI_NO_CS)) { /* * Historically ACPI has no means of the GPIO polarity and @@ -1099,16 +1103,16 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) if ((spi->controller->flags & SPI_CONTROLLER_GPIO_SS) && spi->controller->set_cs) spi->controller->set_cs(spi, !enable); - - if (!spi->controller->set_cs_timing) { - if (activate) - spi_delay_exec(&spi->cs_setup, NULL); - else - spi_delay_exec(&spi->cs_inactive, NULL); - } } else if (spi->controller->set_cs) { spi->controller->set_cs(spi, !enable); } + + if (spi_is_csgpiod(spi) || !spi->controller->set_cs_timing) { + if (activate) + spi_delay_exec(&spi->cs_setup, NULL); + else + spi_delay_exec(&spi->cs_inactive, NULL); + } } #ifdef CONFIG_HAS_DMA From be5e8872b3fbc74b4a58a7e6a7e9fb7e8509eaf8 Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:37 +0800 Subject: [PATCH 163/496] riscv: errata: Rename defines for Andes Use "ANDES" rather than "ANDESTECH" to unify the naming convention with directory, file names, Kconfig options and other definitions. Signed-off-by: Yu Chien Peter Lin Reviewed-by: Charles Ci-Jyun Wu Reviewed-by: Leo Yu-Chi Liang Acked-by: Conor Dooley Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240222083946.3977135-2-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/errata/andes/errata.c | 10 +++++----- arch/riscv/include/asm/errata_list.h | 4 ++-- arch/riscv/include/asm/vendorid_list.h | 2 +- arch/riscv/kernel/alternative.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c index 17a904869724..f2708a9494a1 100644 --- a/arch/riscv/errata/andes/errata.c +++ b/arch/riscv/errata/andes/errata.c @@ -18,9 +18,9 @@ #include #include -#define ANDESTECH_AX45MP_MARCHID 0x8000000000008a45UL -#define ANDESTECH_AX45MP_MIMPID 0x500UL -#define ANDESTECH_SBI_EXT_ANDES 0x0900031E +#define ANDES_AX45MP_MARCHID 0x8000000000008a45UL +#define ANDES_AX45MP_MIMPID 0x500UL +#define ANDES_SBI_EXT_ANDES 0x0900031E #define ANDES_SBI_EXT_IOCP_SW_WORKAROUND 1 @@ -32,7 +32,7 @@ static long ax45mp_iocp_sw_workaround(void) * ANDES_SBI_EXT_IOCP_SW_WORKAROUND SBI EXT checks if the IOCP is missing and * cache is controllable only then CMO will be applied to the platform. */ - ret = sbi_ecall(ANDESTECH_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND, + ret = sbi_ecall(ANDES_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND, 0, 0, 0, 0, 0, 0); return ret.error ? 
0 : ret.value; @@ -50,7 +50,7 @@ static void errata_probe_iocp(unsigned int stage, unsigned long arch_id, unsigne done = true; - if (arch_id != ANDESTECH_AX45MP_MARCHID || impid != ANDESTECH_AX45MP_MIMPID) + if (arch_id != ANDES_AX45MP_MARCHID || impid != ANDES_AX45MP_MIMPID) return; if (!ax45mp_iocp_sw_workaround()) diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index ea33288f8a25..96025eec5631 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -12,8 +12,8 @@ #include #ifdef CONFIG_ERRATA_ANDES -#define ERRATA_ANDESTECH_NO_IOCP 0 -#define ERRATA_ANDESTECH_NUMBER 1 +#define ERRATA_ANDES_NO_IOCP 0 +#define ERRATA_ANDES_NUMBER 1 #endif #ifdef CONFIG_ERRATA_SIFIVE diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h index e55407ace0c3..2f2bb0c84f9a 100644 --- a/arch/riscv/include/asm/vendorid_list.h +++ b/arch/riscv/include/asm/vendorid_list.h @@ -5,7 +5,7 @@ #ifndef ASM_VENDOR_LIST_H #define ASM_VENDOR_LIST_H -#define ANDESTECH_VENDOR_ID 0x31e +#define ANDES_VENDOR_ID 0x31e #define SIFIVE_VENDOR_ID 0x489 #define THEAD_VENDOR_ID 0x5b7 diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 319a1da0358b..0128b161bfda 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -43,7 +43,7 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info switch (cpu_mfr_info->vendor_id) { #ifdef CONFIG_ERRATA_ANDES - case ANDESTECH_VENDOR_ID: + case ANDES_VENDOR_ID: cpu_mfr_info->patch_func = andes_errata_patch_func; break; #endif From b88727d554f0fb826e0608192f59542497ba19c5 Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:40 +0800 Subject: [PATCH 164/496] dt-bindings: riscv: Add Andes interrupt controller compatible string Add "andestech,cpu-intc" compatible string to indicate that Andes specific local interrupt is supported on the core, e.g. AX45MP cores have 3 types of non-standard local interrupt which can be handled in supervisor mode: - Slave port ECC error interrupt - Bus write transaction error interrupt - Performance monitor overflow interrupt These interrupts are enabled/disabled via a custom register SLIE instead of the standard interrupt enable register SIE. Signed-off-by: Yu Chien Peter Lin Acked-by: Conor Dooley Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240222083946.3977135-5-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/cpus.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml index 9d8670c00e3b..6ccd75cbbc59 100644 --- a/Documentation/devicetree/bindings/riscv/cpus.yaml +++ b/Documentation/devicetree/bindings/riscv/cpus.yaml @@ -106,7 +106,11 @@ properties: const: 1 compatible: - const: riscv,cpu-intc + oneOf: + - items: + - const: andestech,cpu-intc + - const: riscv,cpu-intc + - const: riscv,cpu-intc interrupt-controller: true From 95113bb705157f3518cec4ff0225a922507a0f8b Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:41 +0800 Subject: [PATCH 165/496] riscv: dts: renesas: r9a07g043f: Update compatible string to use Andes INTC The Andes hart-level interrupt controller (Andes INTC) allows AX45MP cores to handle custom local interrupts, such as the performance counter overflow interrupt. 
Signed-off-by: Yu Chien Peter Lin Reviewed-by: Geert Uytterhoeven Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240222083946.3977135-6-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/renesas/r9a07g043f.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi b/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi index a92cfcfc021b..099f3df75b42 100644 --- a/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi +++ b/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi @@ -39,7 +39,7 @@ cpu0_intc: interrupt-controller { #interrupt-cells = <1>; - compatible = "riscv,cpu-intc"; + compatible = "andestech,cpu-intc", "riscv,cpu-intc"; interrupt-controller; }; }; From ea0e0178e101c8d4662a0db7424df057b88e2712 Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:42 +0800 Subject: [PATCH 166/496] perf: RISC-V: Eliminate redundant interrupt enable/disable operations The interrupt enable/disable operations are already performed by the IRQ chip functions riscv_intc_irq_unmask()/riscv_intc_irq_mask() during enable_percpu_irq()/disable_percpu_irq(). It can be done only once. Signed-off-by: Yu Chien Peter Lin Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20240222083946.3977135-7-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu_sbi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 16acd4dcdb96..2edbc37abadf 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -781,7 +781,6 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) if (riscv_pmu_use_irq) { cpu_hw_evt->irq = riscv_pmu_irq; csr_clear(CSR_IP, BIT(riscv_pmu_irq_num)); - csr_set(CSR_IE, BIT(riscv_pmu_irq_num)); enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE); } @@ -792,7 +791,6 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node) { if (riscv_pmu_use_irq) { disable_percpu_irq(riscv_pmu_irq); - csr_clear(CSR_IE, BIT(riscv_pmu_irq_num)); } /* Disable all counters access for user mode now */ From bc969d6cc96a2d0539576ec639f7a2a7dcf757f8 Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:43 +0800 Subject: [PATCH 167/496] perf: RISC-V: Introduce Andes PMU to support perf event sampling Assign riscv_pmu_irq_num the value of (256 + 18) for the custome PMU and add SSCOUNTOVF and SIP alternatives to ALT_SBI_PMU_OVERFLOW() and ALT_SBI_PMU_OVF_CLEAR_PENDING() macros, respectively. To make use of Andes PMU extension, "xandespmu" needs to be appended to the riscv,isa-extensions for each cpu node in device-tree, and make sure CONFIG_ANDES_CUSTOM_PMU is enabled. 
Signed-off-by: Yu Chien Peter Lin Reviewed-by: Charles Ci-Jyun Wu Reviewed-by: Leo Yu-Chi Liang Co-developed-by: Locus Wei-Han Chen Signed-off-by: Locus Wei-Han Chen Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240222083946.3977135-8-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/errata_list.h | 9 ------- arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/kernel/cpufeature.c | 1 + drivers/perf/Kconfig | 14 +++++++++++ drivers/perf/riscv_pmu_sbi.c | 35 +++++++++++++++++++++++++--- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 96025eec5631..1f2dbfb8a8bf 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -112,15 +112,6 @@ asm volatile(ALTERNATIVE( \ #define THEAD_C9XX_RV_IRQ_PMU 17 #define THEAD_C9XX_CSR_SCOUNTEROF 0x5c5 -#define ALT_SBI_PMU_OVERFLOW(__ovl) \ -asm volatile(ALTERNATIVE( \ - "csrr %0, " __stringify(CSR_SSCOUNTOVF), \ - "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \ - THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \ - CONFIG_ERRATA_THEAD_PMU) \ - : "=r" (__ovl) : \ - : "memory") - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 5340f818746b..bae7eac76c18 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -80,6 +80,7 @@ #define RISCV_ISA_EXT_ZFA 71 #define RISCV_ISA_EXT_ZTSO 72 #define RISCV_ISA_EXT_ZACAS 73 +#define RISCV_ISA_EXT_XANDESPMU 74 #define RISCV_ISA_EXT_MAX 128 #define RISCV_ISA_EXT_INVALID U32_MAX diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 89920f84d0a3..0c7688fa8376 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -307,6 +307,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), + __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU), }; const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext); diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index ec6e0d9194a1..564e813d8c69 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -86,6 +86,20 @@ config RISCV_PMU_SBI full perf feature support i.e. counter overflow, privilege mode filtering, counter configuration. +config ANDES_CUSTOM_PMU + bool "Andes custom PMU support" + depends on ARCH_RENESAS && RISCV_ALTERNATIVE && RISCV_PMU_SBI + default y + help + The Andes cores implement the PMU overflow extension very + similar to the standard Sscofpmf and Smcntrpmf extension. + + This will patch the overflow and pending CSRs and handle the + non-standard behaviour via the regular SBI PMU driver and + interface. + + If you don't know what to do here, say "Y". 
+ config ARM_PMU_ACPI depends on ARM_PMU && ACPI def_bool y diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 2edbc37abadf..bbd6fe021b3a 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -19,11 +19,33 @@ #include #include #include +#include #include #include #include +#define ALT_SBI_PMU_OVERFLOW(__ovl) \ +asm volatile(ALTERNATIVE_2( \ + "csrr %0, " __stringify(CSR_SSCOUNTOVF), \ + "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF), \ + THEAD_VENDOR_ID, ERRATA_THEAD_PMU, \ + CONFIG_ERRATA_THEAD_PMU, \ + "csrr %0, " __stringify(ANDES_CSR_SCOUNTEROF), \ + 0, RISCV_ISA_EXT_XANDESPMU, \ + CONFIG_ANDES_CUSTOM_PMU) \ + : "=r" (__ovl) : \ + : "memory") + +#define ALT_SBI_PMU_OVF_CLEAR_PENDING(__irq_mask) \ +asm volatile(ALTERNATIVE( \ + "csrc " __stringify(CSR_IP) ", %0\n\t", \ + "csrc " __stringify(ANDES_CSR_SLIP) ", %0\n\t", \ + 0, RISCV_ISA_EXT_XANDESPMU, \ + CONFIG_ANDES_CUSTOM_PMU) \ + : : "r"(__irq_mask) \ + : "memory") + #define SYSCTL_NO_USER_ACCESS 0 #define SYSCTL_USER_ACCESS 1 #define SYSCTL_LEGACY 2 @@ -61,6 +83,7 @@ static int sysctl_perf_user_access __read_mostly = SYSCTL_USER_ACCESS; static union sbi_pmu_ctr_info *pmu_ctr_list; static bool riscv_pmu_use_irq; static unsigned int riscv_pmu_irq_num; +static unsigned int riscv_pmu_irq_mask; static unsigned int riscv_pmu_irq; /* Cache the available counters in a bitmask */ @@ -694,7 +717,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) event = cpu_hw_evt->events[fidx]; if (!event) { - csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num)); + ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask); return IRQ_NONE; } @@ -708,7 +731,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev) * Overflow interrupt pending bit should only be cleared after stopping * all the counters to avoid any race condition. */ - csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num)); + ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask); /* No overflow bit is set */ if (!overflow) @@ -780,7 +803,7 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) if (riscv_pmu_use_irq) { cpu_hw_evt->irq = riscv_pmu_irq; - csr_clear(CSR_IP, BIT(riscv_pmu_irq_num)); + ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask); enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE); } @@ -814,8 +837,14 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde riscv_cached_mimpid(0) == 0) { riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU; riscv_pmu_use_irq = true; + } else if (riscv_isa_extension_available(NULL, XANDESPMU) && + IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) { + riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI; + riscv_pmu_use_irq = true; } + riscv_pmu_irq_mask = BIT(riscv_pmu_irq_num % BITS_PER_LONG); + if (!riscv_pmu_use_irq) return -EOPNOTSUPP; From 61609bf2b29dcb07de3aaad7d6212cc3c341192b Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:44 +0800 Subject: [PATCH 168/496] dt-bindings: riscv: Add Andes PMU extension description Document the ISA string for Andes Technology performance monitor extension which provides counter overflow interrupt and mode filtering mechanisms. 
Signed-off-by: Yu Chien Peter Lin Acked-by: Conor Dooley Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20240222083946.3977135-9-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- Documentation/devicetree/bindings/riscv/extensions.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/riscv/extensions.yaml b/Documentation/devicetree/bindings/riscv/extensions.yaml index 63d81dc895e5..468c646247aa 100644 --- a/Documentation/devicetree/bindings/riscv/extensions.yaml +++ b/Documentation/devicetree/bindings/riscv/extensions.yaml @@ -477,5 +477,12 @@ properties: latency, as ratified in commit 56ed795 ("Update riscv-crypto-spec-vector.adoc") of riscv-crypto. + - const: xandespmu + description: + The Andes Technology performance monitor extension for counter overflow + and privilege mode filtering. For more details, see Counter Related + Registers in the AX45MP datasheet. + https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf + additionalProperties: true ... From 270fc77e7b0e38964635c2c5d87ad354dbd2cd34 Mon Sep 17 00:00:00 2001 From: Yu Chien Peter Lin Date: Thu, 22 Feb 2024 16:39:45 +0800 Subject: [PATCH 169/496] riscv: dts: renesas: Add Andes PMU extension for r9a07g043f xandespmu stands for Andes Performance Monitor Unit extension. Based on the added Andes PMU ISA string, the SBI PMU driver will make use of the non-standard irq source. Signed-off-by: Yu Chien Peter Lin Reviewed-by: Geert Uytterhoeven Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Acked-by: Conor Dooley Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20240222083946.3977135-10-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/renesas/r9a07g043f.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi b/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi index 099f3df75b42..d7a66043f13b 100644 --- a/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi +++ b/arch/riscv/boot/dts/renesas/r9a07g043f.dtsi @@ -27,7 +27,7 @@ riscv,isa-base = "rv64i"; riscv,isa-extensions = "i", "m", "a", "f", "d", "c", "zicntr", "zicsr", "zifencei", - "zihpm"; + "zihpm", "xandespmu"; mmu-type = "riscv,sv39"; i-cache-size = <0x8000>; i-cache-line-size = <0x40>; From f5102e31c209798cafd2d79463f5093771aadc12 Mon Sep 17 00:00:00 2001 From: Locus Wei-Han Chen Date: Thu, 22 Feb 2024 16:39:46 +0800 Subject: [PATCH 170/496] riscv: andes: Support specifying symbolic firmware and hardware raw events Add the Andes AX45 JSON files that allows specifying symbolic event names for the raw PMU events. 
Signed-off-by: Locus Wei-Han Chen Reviewed-by: Yu Chien Peter Lin Reviewed-by: Charles Ci-Jyun Wu Reviewed-by: Leo Yu-Chi Liang Tested-by: Lad Prabhakar Acked-by: Atish Patra Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20240222083946.3977135-11-peterlin@andestech.com Signed-off-by: Palmer Dabbelt --- .../arch/riscv/andes/ax45/firmware.json | 68 ++++++++++ .../arch/riscv/andes/ax45/instructions.json | 127 ++++++++++++++++++ .../arch/riscv/andes/ax45/memory.json | 57 ++++++++ .../arch/riscv/andes/ax45/microarch.json | 77 +++++++++++ tools/perf/pmu-events/arch/riscv/mapfile.csv | 1 + 5 files changed, 330 insertions(+) create mode 100644 tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json create mode 100644 tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json create mode 100644 tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json create mode 100644 tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json new file mode 100644 index 000000000000..9b4a032186a7 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json @@ -0,0 +1,68 @@ +[ + { + "ArchStdEvent": "FW_MISALIGNED_LOAD" + }, + { + "ArchStdEvent": "FW_MISALIGNED_STORE" + }, + { + "ArchStdEvent": "FW_ACCESS_LOAD" + }, + { + "ArchStdEvent": "FW_ACCESS_STORE" + }, + { + "ArchStdEvent": "FW_ILLEGAL_INSN" + }, + { + "ArchStdEvent": "FW_SET_TIMER" + }, + { + "ArchStdEvent": "FW_IPI_SENT" + }, + { + "ArchStdEvent": "FW_IPI_RECEIVED" + }, + { + "ArchStdEvent": "FW_FENCE_I_SENT" + }, + { + "ArchStdEvent": "FW_FENCE_I_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_SENT" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT" + }, + { + "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json new file mode 100644 index 000000000000..713a08c1a40f --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json @@ -0,0 +1,127 @@ +[ + { + "EventCode": "0x10", + "EventName": "cycle_count", + "BriefDescription": "Cycle count" + }, + { + "EventCode": "0x20", + "EventName": "inst_count", + "BriefDescription": "Retired instruction count" + }, + { + "EventCode": "0x30", + "EventName": "int_load_inst", + "BriefDescription": "Integer load instruction count" + }, + { + "EventCode": "0x40", + "EventName": "int_store_inst", + "BriefDescription": "Integer store instruction count" + }, + { + "EventCode": "0x50", + "EventName": "atomic_inst", + "BriefDescription": "Atomic instruction count" + }, + { + "EventCode": "0x60", + "EventName": "sys_inst", + "BriefDescription": "System instruction count" + }, + { + "EventCode": "0x70", + "EventName": "int_compute_inst", + "BriefDescription": "Integer computational instruction count" + }, + { + "EventCode": "0x80", + "EventName": "condition_br", + "BriefDescription": "Conditional branch instruction count" + }, + { + 
"EventCode": "0x90", + "EventName": "taken_condition_br", + "BriefDescription": "Taken conditional branch instruction count" + }, + { + "EventCode": "0xA0", + "EventName": "jal_inst", + "BriefDescription": "JAL instruction count" + }, + { + "EventCode": "0xB0", + "EventName": "jalr_inst", + "BriefDescription": "JALR instruction count" + }, + { + "EventCode": "0xC0", + "EventName": "ret_inst", + "BriefDescription": "Return instruction count" + }, + { + "EventCode": "0xD0", + "EventName": "control_trans_inst", + "BriefDescription": "Control transfer instruction count" + }, + { + "EventCode": "0xE0", + "EventName": "ex9_inst", + "BriefDescription": "EXEC.IT instruction count" + }, + { + "EventCode": "0xF0", + "EventName": "int_mul_inst", + "BriefDescription": "Integer multiplication instruction count" + }, + { + "EventCode": "0x100", + "EventName": "int_div_rem_inst", + "BriefDescription": "Integer division/remainder instruction count" + }, + { + "EventCode": "0x110", + "EventName": "float_load_inst", + "BriefDescription": "Floating-point load instruction count" + }, + { + "EventCode": "0x120", + "EventName": "float_store_inst", + "BriefDescription": "Floating-point store instruction count" + }, + { + "EventCode": "0x130", + "EventName": "float_add_sub_inst", + "BriefDescription": "Floating-point addition/subtraction instruction count" + }, + { + "EventCode": "0x140", + "EventName": "float_mul_inst", + "BriefDescription": "Floating-point multiplication instruction count" + }, + { + "EventCode": "0x150", + "EventName": "float_fused_muladd_inst", + "BriefDescription": "Floating-point fused multiply-add instruction count" + }, + { + "EventCode": "0x160", + "EventName": "float_div_sqrt_inst", + "BriefDescription": "Floating-point division or square-root instruction count" + }, + { + "EventCode": "0x170", + "EventName": "other_float_inst", + "BriefDescription": "Other floating-point instruction count" + }, + { + "EventCode": "0x180", + "EventName": "int_mul_add_sub_inst", + "BriefDescription": "Integer multiplication and add/sub instruction count" + }, + { + "EventCode": "0x190", + "EventName": "retired_ops", + "BriefDescription": "Retired operation count" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json new file mode 100644 index 000000000000..c7401b526c77 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json @@ -0,0 +1,57 @@ +[ + { + "EventCode": "0x01", + "EventName": "ilm_access", + "BriefDescription": "ILM access" + }, + { + "EventCode": "0x11", + "EventName": "dlm_access", + "BriefDescription": "DLM access" + }, + { + "EventCode": "0x21", + "EventName": "icache_access", + "BriefDescription": "ICACHE access" + }, + { + "EventCode": "0x31", + "EventName": "icache_miss", + "BriefDescription": "ICACHE miss" + }, + { + "EventCode": "0x41", + "EventName": "dcache_access", + "BriefDescription": "DCACHE access" + }, + { + "EventCode": "0x51", + "EventName": "dcache_miss", + "BriefDescription": "DCACHE miss" + }, + { + "EventCode": "0x61", + "EventName": "dcache_load_access", + "BriefDescription": "DCACHE load access" + }, + { + "EventCode": "0x71", + "EventName": "dcache_load_miss", + "BriefDescription": "DCACHE load miss" + }, + { + "EventCode": "0x81", + "EventName": "dcache_store_access", + "BriefDescription": "DCACHE store access" + }, + { + "EventCode": "0x91", + "EventName": "dcache_store_miss", + "BriefDescription": "DCACHE store miss" + }, + { + "EventCode": "0xA1", + "EventName": 
"dcache_wb", + "BriefDescription": "DCACHE writeback" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json new file mode 100644 index 000000000000..a6d378cbaa74 --- /dev/null +++ b/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json @@ -0,0 +1,77 @@ +[ + { + "EventCode": "0xB1", + "EventName": "cycle_wait_icache_fill", + "BriefDescription": "Cycles waiting for ICACHE fill data" + }, + { + "EventCode": "0xC1", + "EventName": "cycle_wait_dcache_fill", + "BriefDescription": "Cycles waiting for DCACHE fill data" + }, + { + "EventCode": "0xD1", + "EventName": "uncached_ifetch_from_bus", + "BriefDescription": "Uncached ifetch data access from bus" + }, + { + "EventCode": "0xE1", + "EventName": "uncached_load_from_bus", + "BriefDescription": "Uncached load data access from bus" + }, + { + "EventCode": "0xF1", + "EventName": "cycle_wait_uncached_ifetch", + "BriefDescription": "Cycles waiting for uncached ifetch data from bus" + }, + { + "EventCode": "0x101", + "EventName": "cycle_wait_uncached_load", + "BriefDescription": "Cycles waiting for uncached load data from bus" + }, + { + "EventCode": "0x111", + "EventName": "main_itlb_access", + "BriefDescription": "Main ITLB access" + }, + { + "EventCode": "0x121", + "EventName": "main_itlb_miss", + "BriefDescription": "Main ITLB miss" + }, + { + "EventCode": "0x131", + "EventName": "main_dtlb_access", + "BriefDescription": "Main DTLB access" + }, + { + "EventCode": "0x141", + "EventName": "main_dtlb_miss", + "BriefDescription": "Main DTLB miss" + }, + { + "EventCode": "0x151", + "EventName": "cycle_wait_itlb_fill", + "BriefDescription": "Cycles waiting for Main ITLB fill data" + }, + { + "EventCode": "0x161", + "EventName": "pipe_stall_cycle_dtlb_miss", + "BriefDescription": "Pipeline stall cycles caused by Main DTLB miss" + }, + { + "EventCode": "0x02", + "EventName": "mispredict_condition_br", + "BriefDescription": "Misprediction of conditional branches" + }, + { + "EventCode": "0x12", + "EventName": "mispredict_take_condition_br", + "BriefDescription": "Misprediction of taken conditional branches" + }, + { + "EventCode": "0x22", + "EventName": "mispredict_target_ret_inst", + "BriefDescription": "Misprediction of targets of Return instructions" + } +] diff --git a/tools/perf/pmu-events/arch/riscv/mapfile.csv b/tools/perf/pmu-events/arch/riscv/mapfile.csv index cfc449b19810..3d3a809a5446 100644 --- a/tools/perf/pmu-events/arch/riscv/mapfile.csv +++ b/tools/perf/pmu-events/arch/riscv/mapfile.csv @@ -17,3 +17,4 @@ 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core 0x5b7-0x0-0x0,v1,thead/c900-legacy,core 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core +0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core From 0a3737db8479b77f95f4bfda8e71b03c697eb56a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 08:29:47 -0600 Subject: [PATCH 171/496] io_uring/rw: return IOU_ISSUE_SKIP_COMPLETE for multishot retry If read multishot is being invoked from the poll retry handler, then we should return IOU_ISSUE_SKIP_COMPLETE rather than -EAGAIN. If not, then a CQE will be posted with -EAGAIN rather than triggering the retry when the file is flagged as readable again. 
Cc: stable@vger.kernel.org Reported-by: Sargun Dhillon Fixes: fc68fcda04910 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 47e097ab5d7e..0585ebcc9773 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -947,6 +947,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; return -EAGAIN; } From cef59d1ea7170ec753182302645a0191c8aa3382 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Mar 2024 14:56:27 +0000 Subject: [PATCH 172/496] io_uring: clean rings on NO_MMAP alloc fail We make a few cancellation judgements based on ctx->rings, so let's zero it afer deallocation for IORING_SETUP_NO_MMAP just like it's done with the mmap case. Likely, it's not a real problem, but zeroing is safer and better tested. Cc: stable@vger.kernel.org Fixes: 03d89a2de25bbc ("io_uring: support for user allocated memory for rings/sqes") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9ff6cdf91429b8a51699c210e1f6af6ea3f8bdcf.1710255382.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 49a124daa359..e7d7a456b489 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2788,14 +2788,15 @@ static void io_rings_free(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); - ctx->rings = NULL; - ctx->sq_sqes = NULL; } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; } + + ctx->rings = NULL; + ctx->sq_sqes = NULL; } void *io_mem_alloc(size_t size) From 9e2ab4b18ebd46813fc3459207335af4d368e323 Mon Sep 17 00:00:00 2001 From: Luca Ceresoli Date: Tue, 5 Mar 2024 15:36:28 +0100 Subject: [PATCH 173/496] ASoC: rockchip: i2s-tdm: Fix inaccurate sampling rates The sample rates set by the rockchip_i2s_tdm driver in master mode are inaccurate up to 5% in several cases, due to the driver logic to configure clocks and a nasty interaction with the Common Clock Framework. To understand what happens, here is the relevant section of the clock tree (slightly simplified), along with the names used in the driver: vpll0 _OR_ vpll1 "mclk_root" clk_i2s2_8ch_tx_src "mclk_parent" clk_i2s2_8ch_tx_mux clk_i2s2_8ch_tx "mclk" or "mclk_tx" This is what happens when playing back e.g. at 192 kHz using audio-graph-card (when recording the same applies, only s/tx/rx/): 0. at probe, rockchip_i2s_tdm_set_sysclk() stores the passed frequency in i2s_tdm->mclk_tx_freq (*) which is 50176000, and that is never modified afterwards 1. when playback is started, rockchip_i2s_tdm_hw_params() is called and does the following two calls 2. rockchip_i2s_tdm_calibrate_mclk(): 2a. selects mclk_root0 (vpll0) as a parent for mclk_parent (mclk_tx_src), which is OK because the vpll0 rate is a good for 192000 (and sumbultiple) rates 2b. sets the mclk_root frequency based on ppm calibration computations 2c. sets mclk_tx_src to 49152000 (= 256 * 192000), which is also OK as it is a multiple of the required bit clock 3. rockchip_i2s_tdm_set_mclk() 3a. calls clk_set_rate() to set the rate of mclk_tx (clk_i2s2_8ch_tx) to the value of i2s_tdm->mclk_tx_freq (*), i.e. 
50176000, which is not a multiple of the sampling frequency -- this
    is not OK
3a1. clk_set_rate() reacts by reparenting clk_i2s2_8ch_tx_src to
     vpll1 -- this is not OK because the default vpll1 rate can be
     divided to get 44.1 kHz and related rates, not 192 kHz

The result is that the driver makes a lot of ad-hoc decisions about
clocks and ends up using the wrong parent at a suboptimal rate.

Step 0 is one part of the problem: unless the card driver calls
set_sysclk at each stream start, whatever rate is set in mclk_tx_freq
during boot will be taken and used until reboot. Moreover the driver
does not care whether that value is a multiple of any audio frequency.

Another part of the problem is that the whole reparenting and clock
rate setting logic conflicts with the CCF algorithms, which pursue
largely the same goal: selecting the best parent and setting the
closest clock rate. And it turns out that calling clk_set_rate() just
once on clk_i2s2_8ch_tx picks the correct vpll and sets the correct
rate.

The fix is based on removing the custom logic in the driver to select
the parent and set the various clocks, and just letting the Clock
Framework do it all. As a side effect, the set_sysclk() op becomes
useless because we now let the CCF compute the appropriate value for
the sampling rate. It also implies that the whole calibration logic is
now dead code, so it is removed along with the "PCM Clock Compensation
in PPM" kcontrol, which has always been broken anyway. The handling of
the 4 optional clocks also becomes dead code and is removed.

The actual rates have been tested playing 30 seconds of audio at
various sampling rates before and after this change using sox:

  time play -r <rate> -n synth 30 sine 950 gain -3

The time reported in the table below is the 'real' value reported by
the 'time' command in the above command line.

     rate      before   after
  ---------    ------   ------
    8000 Hz    30.60s   30.63s
   11025 Hz    30.45s   30.51s
   16000 Hz    30.47s   30.50s
   22050 Hz    30.78s   30.41s
   32000 Hz    31.02s   30.43s
   44100 Hz    30.78s   30.41s
   48000 Hz    29.81s   30.45s
   88200 Hz    30.78s   30.41s
   96000 Hz    29.79s   30.42s
  176400 Hz    27.40s   30.41s
  192000 Hz    29.79s   30.42s

While the tests are running the clock tree confirms that:

* without the patch, vpll1 is always used and clk_i2s2_8ch_tx always
  produces 50176000 Hz, which cannot be divided for most audio rates
  except the slowest ones, generating inaccurate rates
* with the patch:
  - for 192000 Hz vpll0 is used
  - for 176400 Hz vpll1 is used
  - clk_i2s2_8ch_tx always produces (256 * <rate>) Hz

Tested on the RK3308 using the internal audio codec.
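For readers less used to the clock framework, the mechanism that makes the
single clk_set_rate() sufficient is rate propagation through the mux. A
schematic sketch follows; it is not the actual rk3308 clock driver (which
uses the Rockchip composite-clock helpers), and MUX_REG/MUX_SHIFT/MUX_WIDTH,
base and lock are placeholders.

  /*
   * Because the mux carries CLK_SET_RATE_PARENT and is not marked
   * CLK_SET_RATE_NO_REPARENT, a clk_set_rate() further down the chain
   * can both re-select the parent (vpll0 vs vpll1) and propagate the
   * rate request upwards, which is exactly what the driver now relies on.
   */
  static const char * const mclk_tx_parents[] = { "vpll0", "vpll1" };

  clk = clk_register_mux(NULL, "clk_i2s2_8ch_tx_src", mclk_tx_parents,
  		       ARRAY_SIZE(mclk_tx_parents), CLK_SET_RATE_PARENT,
  		       base + MUX_REG, MUX_SHIFT, MUX_WIDTH, 0, &lock);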
Fixes: 081068fd6414 ("ASoC: rockchip: add support for i2s-tdm controller") Signed-off-by: Luca Ceresoli Link: https://msgid.link/r/20240305-rk3308-audio-codec-v4-1-312acdbe628f@bootlin.com Signed-off-by: Mark Brown --- sound/soc/rockchip/rockchip_i2s_tdm.c | 352 +------------------------- 1 file changed, 6 insertions(+), 346 deletions(-) diff --git a/sound/soc/rockchip/rockchip_i2s_tdm.c b/sound/soc/rockchip/rockchip_i2s_tdm.c index 860e66ec85e8..9fa020ef7eab 100644 --- a/sound/soc/rockchip/rockchip_i2s_tdm.c +++ b/sound/soc/rockchip/rockchip_i2s_tdm.c @@ -25,8 +25,6 @@ #define DEFAULT_MCLK_FS 256 #define CH_GRP_MAX 4 /* The max channel 8 / 2 */ #define MULTIPLEX_CH_MAX 10 -#define CLK_PPM_MIN -1000 -#define CLK_PPM_MAX 1000 #define TRCM_TXRX 0 #define TRCM_TX 1 @@ -53,20 +51,6 @@ struct rk_i2s_tdm_dev { struct clk *hclk; struct clk *mclk_tx; struct clk *mclk_rx; - /* The mclk_tx_src is parent of mclk_tx */ - struct clk *mclk_tx_src; - /* The mclk_rx_src is parent of mclk_rx */ - struct clk *mclk_rx_src; - /* - * The mclk_root0 and mclk_root1 are root parent and supplies for - * the different FS. - * - * e.g: - * mclk_root0 is VPLL0, used for FS=48000Hz - * mclk_root1 is VPLL1, used for FS=44100Hz - */ - struct clk *mclk_root0; - struct clk *mclk_root1; struct regmap *regmap; struct regmap *grf; struct snd_dmaengine_dai_dma_data capture_dma_data; @@ -76,19 +60,11 @@ struct rk_i2s_tdm_dev { const struct rk_i2s_soc_data *soc_data; bool is_master_mode; bool io_multiplex; - bool mclk_calibrate; bool tdm_mode; - unsigned int mclk_rx_freq; - unsigned int mclk_tx_freq; - unsigned int mclk_root0_freq; - unsigned int mclk_root1_freq; - unsigned int mclk_root0_initial_freq; - unsigned int mclk_root1_initial_freq; unsigned int frame_width; unsigned int clk_trcm; unsigned int i2s_sdis[CH_GRP_MAX]; unsigned int i2s_sdos[CH_GRP_MAX]; - int clk_ppm; int refcount; spinlock_t lock; /* xfer lock */ bool has_playback; @@ -114,12 +90,6 @@ static void i2s_tdm_disable_unprepare_mclk(struct rk_i2s_tdm_dev *i2s_tdm) { clk_disable_unprepare(i2s_tdm->mclk_tx); clk_disable_unprepare(i2s_tdm->mclk_rx); - if (i2s_tdm->mclk_calibrate) { - clk_disable_unprepare(i2s_tdm->mclk_tx_src); - clk_disable_unprepare(i2s_tdm->mclk_rx_src); - clk_disable_unprepare(i2s_tdm->mclk_root0); - clk_disable_unprepare(i2s_tdm->mclk_root1); - } } /** @@ -142,29 +112,9 @@ static int i2s_tdm_prepare_enable_mclk(struct rk_i2s_tdm_dev *i2s_tdm) ret = clk_prepare_enable(i2s_tdm->mclk_rx); if (ret) goto err_mclk_rx; - if (i2s_tdm->mclk_calibrate) { - ret = clk_prepare_enable(i2s_tdm->mclk_tx_src); - if (ret) - goto err_mclk_rx; - ret = clk_prepare_enable(i2s_tdm->mclk_rx_src); - if (ret) - goto err_mclk_rx_src; - ret = clk_prepare_enable(i2s_tdm->mclk_root0); - if (ret) - goto err_mclk_root0; - ret = clk_prepare_enable(i2s_tdm->mclk_root1); - if (ret) - goto err_mclk_root1; - } return 0; -err_mclk_root1: - clk_disable_unprepare(i2s_tdm->mclk_root0); -err_mclk_root0: - clk_disable_unprepare(i2s_tdm->mclk_rx_src); -err_mclk_rx_src: - clk_disable_unprepare(i2s_tdm->mclk_tx_src); err_mclk_rx: clk_disable_unprepare(i2s_tdm->mclk_tx); err_mclk_tx: @@ -564,159 +514,6 @@ static void rockchip_i2s_tdm_xfer_resume(struct snd_pcm_substream *substream, I2S_XFER_RXS_START); } -static int rockchip_i2s_tdm_clk_set_rate(struct rk_i2s_tdm_dev *i2s_tdm, - struct clk *clk, unsigned long rate, - int ppm) -{ - unsigned long rate_target; - int delta, ret; - - if (ppm == i2s_tdm->clk_ppm) - return 0; - - if (ppm < 0) - delta = -1; - else - delta = 1; - - delta *= 
(int)div64_u64((u64)rate * (u64)abs(ppm) + 500000, - 1000000); - - rate_target = rate + delta; - - if (!rate_target) - return -EINVAL; - - ret = clk_set_rate(clk, rate_target); - if (ret) - return ret; - - i2s_tdm->clk_ppm = ppm; - - return 0; -} - -static int rockchip_i2s_tdm_calibrate_mclk(struct rk_i2s_tdm_dev *i2s_tdm, - struct snd_pcm_substream *substream, - unsigned int lrck_freq) -{ - struct clk *mclk_root; - struct clk *mclk_parent; - unsigned int mclk_root_freq; - unsigned int mclk_root_initial_freq; - unsigned int mclk_parent_freq; - unsigned int div, delta; - u64 ppm; - int ret; - - if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) - mclk_parent = i2s_tdm->mclk_tx_src; - else - mclk_parent = i2s_tdm->mclk_rx_src; - - switch (lrck_freq) { - case 8000: - case 16000: - case 24000: - case 32000: - case 48000: - case 64000: - case 96000: - case 192000: - mclk_root = i2s_tdm->mclk_root0; - mclk_root_freq = i2s_tdm->mclk_root0_freq; - mclk_root_initial_freq = i2s_tdm->mclk_root0_initial_freq; - mclk_parent_freq = DEFAULT_MCLK_FS * 192000; - break; - case 11025: - case 22050: - case 44100: - case 88200: - case 176400: - mclk_root = i2s_tdm->mclk_root1; - mclk_root_freq = i2s_tdm->mclk_root1_freq; - mclk_root_initial_freq = i2s_tdm->mclk_root1_initial_freq; - mclk_parent_freq = DEFAULT_MCLK_FS * 176400; - break; - default: - dev_err(i2s_tdm->dev, "Invalid LRCK frequency: %u Hz\n", - lrck_freq); - return -EINVAL; - } - - ret = clk_set_parent(mclk_parent, mclk_root); - if (ret) - return ret; - - ret = rockchip_i2s_tdm_clk_set_rate(i2s_tdm, mclk_root, - mclk_root_freq, 0); - if (ret) - return ret; - - delta = abs(mclk_root_freq % mclk_parent_freq - mclk_parent_freq); - ppm = div64_u64((uint64_t)delta * 1000000, (uint64_t)mclk_root_freq); - - if (ppm) { - div = DIV_ROUND_CLOSEST(mclk_root_initial_freq, mclk_parent_freq); - if (!div) - return -EINVAL; - - mclk_root_freq = mclk_parent_freq * round_up(div, 2); - - ret = clk_set_rate(mclk_root, mclk_root_freq); - if (ret) - return ret; - - i2s_tdm->mclk_root0_freq = clk_get_rate(i2s_tdm->mclk_root0); - i2s_tdm->mclk_root1_freq = clk_get_rate(i2s_tdm->mclk_root1); - } - - return clk_set_rate(mclk_parent, mclk_parent_freq); -} - -static int rockchip_i2s_tdm_set_mclk(struct rk_i2s_tdm_dev *i2s_tdm, - struct snd_pcm_substream *substream, - struct clk **mclk) -{ - unsigned int mclk_freq; - int ret; - - if (i2s_tdm->clk_trcm) { - if (i2s_tdm->mclk_tx_freq != i2s_tdm->mclk_rx_freq) { - dev_err(i2s_tdm->dev, - "clk_trcm, tx: %d and rx: %d should be the same\n", - i2s_tdm->mclk_tx_freq, - i2s_tdm->mclk_rx_freq); - return -EINVAL; - } - - ret = clk_set_rate(i2s_tdm->mclk_tx, i2s_tdm->mclk_tx_freq); - if (ret) - return ret; - - ret = clk_set_rate(i2s_tdm->mclk_rx, i2s_tdm->mclk_rx_freq); - if (ret) - return ret; - - /* mclk_rx is also ok. 
*/ - *mclk = i2s_tdm->mclk_tx; - } else { - if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { - *mclk = i2s_tdm->mclk_tx; - mclk_freq = i2s_tdm->mclk_tx_freq; - } else { - *mclk = i2s_tdm->mclk_rx; - mclk_freq = i2s_tdm->mclk_rx_freq; - } - - ret = clk_set_rate(*mclk, mclk_freq); - if (ret) - return ret; - } - - return 0; -} - static int rockchip_i2s_ch_to_io(unsigned int ch, bool substream_capture) { if (substream_capture) { @@ -853,19 +650,17 @@ static int rockchip_i2s_tdm_hw_params(struct snd_pcm_substream *substream, struct snd_soc_dai *dai) { struct rk_i2s_tdm_dev *i2s_tdm = to_info(dai); - struct clk *mclk; - int ret = 0; unsigned int val = 0; unsigned int mclk_rate, bclk_rate, div_bclk = 4, div_lrck = 64; + int err; if (i2s_tdm->is_master_mode) { - if (i2s_tdm->mclk_calibrate) - rockchip_i2s_tdm_calibrate_mclk(i2s_tdm, substream, - params_rate(params)); + struct clk *mclk = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ? + i2s_tdm->mclk_tx : i2s_tdm->mclk_rx; - ret = rockchip_i2s_tdm_set_mclk(i2s_tdm, substream, &mclk); - if (ret) - return ret; + err = clk_set_rate(mclk, DEFAULT_MCLK_FS * params_rate(params)); + if (err) + return err; mclk_rate = clk_get_rate(mclk); bclk_rate = i2s_tdm->frame_width * params_rate(params); @@ -973,96 +768,6 @@ static int rockchip_i2s_tdm_trigger(struct snd_pcm_substream *substream, return 0; } -static int rockchip_i2s_tdm_set_sysclk(struct snd_soc_dai *cpu_dai, int stream, - unsigned int freq, int dir) -{ - struct rk_i2s_tdm_dev *i2s_tdm = to_info(cpu_dai); - - /* Put set mclk rate into rockchip_i2s_tdm_set_mclk() */ - if (i2s_tdm->clk_trcm) { - i2s_tdm->mclk_tx_freq = freq; - i2s_tdm->mclk_rx_freq = freq; - } else { - if (stream == SNDRV_PCM_STREAM_PLAYBACK) - i2s_tdm->mclk_tx_freq = freq; - else - i2s_tdm->mclk_rx_freq = freq; - } - - dev_dbg(i2s_tdm->dev, "The target mclk_%s freq is: %d\n", - stream ? 
"rx" : "tx", freq); - - return 0; -} - -static int rockchip_i2s_tdm_clk_compensation_info(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_info *uinfo) -{ - uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; - uinfo->count = 1; - uinfo->value.integer.min = CLK_PPM_MIN; - uinfo->value.integer.max = CLK_PPM_MAX; - uinfo->value.integer.step = 1; - - return 0; -} - -static int rockchip_i2s_tdm_clk_compensation_get(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_dai *dai = snd_kcontrol_chip(kcontrol); - struct rk_i2s_tdm_dev *i2s_tdm = snd_soc_dai_get_drvdata(dai); - - ucontrol->value.integer.value[0] = i2s_tdm->clk_ppm; - - return 0; -} - -static int rockchip_i2s_tdm_clk_compensation_put(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_dai *dai = snd_kcontrol_chip(kcontrol); - struct rk_i2s_tdm_dev *i2s_tdm = snd_soc_dai_get_drvdata(dai); - int ret = 0, ppm = 0; - int changed = 0; - unsigned long old_rate; - - if (ucontrol->value.integer.value[0] < CLK_PPM_MIN || - ucontrol->value.integer.value[0] > CLK_PPM_MAX) - return -EINVAL; - - ppm = ucontrol->value.integer.value[0]; - - old_rate = clk_get_rate(i2s_tdm->mclk_root0); - ret = rockchip_i2s_tdm_clk_set_rate(i2s_tdm, i2s_tdm->mclk_root0, - i2s_tdm->mclk_root0_freq, ppm); - if (ret) - return ret; - if (old_rate != clk_get_rate(i2s_tdm->mclk_root0)) - changed = 1; - - if (clk_is_match(i2s_tdm->mclk_root0, i2s_tdm->mclk_root1)) - return changed; - - old_rate = clk_get_rate(i2s_tdm->mclk_root1); - ret = rockchip_i2s_tdm_clk_set_rate(i2s_tdm, i2s_tdm->mclk_root1, - i2s_tdm->mclk_root1_freq, ppm); - if (ret) - return ret; - if (old_rate != clk_get_rate(i2s_tdm->mclk_root1)) - changed = 1; - - return changed; -} - -static struct snd_kcontrol_new rockchip_i2s_tdm_compensation_control = { - .iface = SNDRV_CTL_ELEM_IFACE_PCM, - .name = "PCM Clock Compensation in PPM", - .info = rockchip_i2s_tdm_clk_compensation_info, - .get = rockchip_i2s_tdm_clk_compensation_get, - .put = rockchip_i2s_tdm_clk_compensation_put, -}; - static int rockchip_i2s_tdm_dai_probe(struct snd_soc_dai *dai) { struct rk_i2s_tdm_dev *i2s_tdm = snd_soc_dai_get_drvdata(dai); @@ -1072,9 +777,6 @@ static int rockchip_i2s_tdm_dai_probe(struct snd_soc_dai *dai) if (i2s_tdm->has_playback) snd_soc_dai_dma_data_set_playback(dai, &i2s_tdm->playback_dma_data); - if (i2s_tdm->mclk_calibrate) - snd_soc_add_dai_controls(dai, &rockchip_i2s_tdm_compensation_control, 1); - return 0; } @@ -1115,7 +817,6 @@ static const struct snd_soc_dai_ops rockchip_i2s_tdm_dai_ops = { .probe = rockchip_i2s_tdm_dai_probe, .hw_params = rockchip_i2s_tdm_hw_params, .set_bclk_ratio = rockchip_i2s_tdm_set_bclk_ratio, - .set_sysclk = rockchip_i2s_tdm_set_sysclk, .set_fmt = rockchip_i2s_tdm_set_fmt, .set_tdm_slot = rockchip_dai_tdm_slot, .trigger = rockchip_i2s_tdm_trigger, @@ -1444,35 +1145,6 @@ static void rockchip_i2s_tdm_path_config(struct rk_i2s_tdm_dev *i2s_tdm, rockchip_i2s_tdm_tx_path_config(i2s_tdm, num); } -static int rockchip_i2s_tdm_get_calibrate_mclks(struct rk_i2s_tdm_dev *i2s_tdm) -{ - int num_mclks = 0; - - i2s_tdm->mclk_tx_src = devm_clk_get(i2s_tdm->dev, "mclk_tx_src"); - if (!IS_ERR(i2s_tdm->mclk_tx_src)) - num_mclks++; - - i2s_tdm->mclk_rx_src = devm_clk_get(i2s_tdm->dev, "mclk_rx_src"); - if (!IS_ERR(i2s_tdm->mclk_rx_src)) - num_mclks++; - - i2s_tdm->mclk_root0 = devm_clk_get(i2s_tdm->dev, "mclk_root0"); - if (!IS_ERR(i2s_tdm->mclk_root0)) - num_mclks++; - - i2s_tdm->mclk_root1 = devm_clk_get(i2s_tdm->dev, "mclk_root1"); - 
if (!IS_ERR(i2s_tdm->mclk_root1)) - num_mclks++; - - if (num_mclks < 4 && num_mclks != 0) - return -ENOENT; - - if (num_mclks == 4) - i2s_tdm->mclk_calibrate = 1; - - return 0; -} - static int rockchip_i2s_tdm_path_prepare(struct rk_i2s_tdm_dev *i2s_tdm, struct device_node *np, bool is_rx_path) @@ -1610,11 +1282,6 @@ static int rockchip_i2s_tdm_probe(struct platform_device *pdev) i2s_tdm->io_multiplex = of_property_read_bool(node, "rockchip,io-multiplex"); - ret = rockchip_i2s_tdm_get_calibrate_mclks(i2s_tdm); - if (ret) - return dev_err_probe(i2s_tdm->dev, ret, - "mclk-calibrate clocks missing"); - regs = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(regs)) { return dev_err_probe(i2s_tdm->dev, PTR_ERR(regs), @@ -1667,13 +1334,6 @@ static int rockchip_i2s_tdm_probe(struct platform_device *pdev) goto err_disable_hclk; } - if (i2s_tdm->mclk_calibrate) { - i2s_tdm->mclk_root0_initial_freq = clk_get_rate(i2s_tdm->mclk_root0); - i2s_tdm->mclk_root1_initial_freq = clk_get_rate(i2s_tdm->mclk_root1); - i2s_tdm->mclk_root0_freq = i2s_tdm->mclk_root0_initial_freq; - i2s_tdm->mclk_root1_freq = i2s_tdm->mclk_root1_initial_freq; - } - pm_runtime_enable(&pdev->dev); regmap_update_bits(i2s_tdm->regmap, I2S_DMACR, I2S_DMACR_TDL_MASK, From 23fb6bc2696119391ec3a92ccaffe50e567c515e Mon Sep 17 00:00:00 2001 From: Chancel Liu Date: Tue, 5 Mar 2024 15:56:06 +0900 Subject: [PATCH 174/496] ASoC: soc-core.c: Skip dummy codec when adding platforms When pcm_runtime is adding platform components it will scan all registered components. In case of DPCM FE/BE some DAI links will configure dummy platform. However both dummy codec and dummy platform are using "snd-soc-dummy" as component->name. Dummy codec should be skipped when adding platforms otherwise there'll be overflow and UBSAN complains. Reported-by: Zhipeng Wang Signed-off-by: Chancel Liu Link: https://msgid.link/r/20240305065606.3778642-1-chancel.liu@nxp.com Signed-off-by: Mark Brown --- sound/soc/soc-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 1e94edba12eb..2ec13d1634b6 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -1219,6 +1219,9 @@ static int snd_soc_add_pcm_runtime(struct snd_soc_card *card, if (!snd_soc_is_matching_component(platform, component)) continue; + if (snd_soc_component_is_dummy(component) && component->num_dai) + continue; + snd_soc_rtd_add_component(rtd, component); } } From f35c9af45ea7a4b1115b193d84858b14d13517fc Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 11 Mar 2024 17:20:37 +1000 Subject: [PATCH 175/496] nouveau: reset the bo resource bus info after an eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Later attempts to refault the bo won't happen and the whole GPU does to lunch. I think Christian's refactoring of this code out to the driver broke this not very well tested path. 
Fixes: 141b15e59175 ("drm/nouveau: move io_reserve_lru handling into the driver v5") Cc: Christian König Signed-off-by: Dave Airlie Acked-by: Christian König Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240311072037.287905-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nouveau_bo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c index 56dcd25db1ce..db8cbf615112 100644 --- a/drivers/gpu/drm/nouveau/nouveau_bo.c +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c @@ -1256,6 +1256,8 @@ out: drm_vma_node_unmap(&nvbo->bo.base.vma_node, bdev->dev_mapping); nouveau_ttm_io_mem_free_locked(drm, nvbo->bo.resource); + nvbo->bo.resource->bus.offset = 0; + nvbo->bo.resource->bus.addr = NULL; goto retry; } From dea185b71bae61808c70263da5f9251e149f1e9e Mon Sep 17 00:00:00 2001 From: Timur Tabi Date: Fri, 9 Feb 2024 18:29:00 -0600 Subject: [PATCH 176/496] drm/nouveau: fix kerneldoc warnings kernel test robot complains about missing kerneldoc entries: drivers-gpu-drm-nouveau-nvkm-subdev-gsp-r535.c:warning: Function-parameter-or-struct-member-gsp-not-described-in-nvkm_gsp_radix3_sg Signed-off-by: Timur Tabi Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240210002900.148982-1-ttabi@nvidia.com --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c index a64c81385682..c029e5dc1948 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c @@ -1432,6 +1432,10 @@ r535_gsp_msg_post_event(void *priv, u32 fn, void *repv, u32 repc) /** * r535_gsp_msg_run_cpu_sequencer() -- process I/O commands from the GSP + * @priv: gsp pointer + * @fn: function number (ignored) + * @repv: pointer to libos print RPC + * @repc: message size * * The GSP sequencer is a list of I/O commands that the GSP can send to * the driver to perform for various purposes. The most common usage is to @@ -1783,6 +1787,7 @@ static void create_pte_array(u64 *ptes, dma_addr_t addr, size_t size) /** * r535_gsp_libos_init() -- create the libos arguments structure + * @gsp: gsp pointer * * The logging buffers are byte queues that contain encoded printf-like * messages from GSP-RM. They need to be decoded by a special application @@ -1922,6 +1927,10 @@ nvkm_gsp_radix3_dtor(struct nvkm_gsp *gsp, struct nvkm_gsp_radix3 *rx3) /** * nvkm_gsp_radix3_sg - build a radix3 table from a S/G list + * @gsp: gsp pointer + * @sgt: S/G list to traverse + * @size: size of the image, in bytes + * @rx3: radix3 array to update * * The GSP uses a three-level page table, called radix3, to map the firmware. * Each 64-bit "pointer" in the table is either the bus address of an entry in From 7af03e688792293ba33149fb8df619a8dff90e80 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:39 +0200 Subject: [PATCH 177/496] drm/probe-helper: warn about negative .get_modes() The .get_modes() callback is supposed to return the number of modes, never a negative error code. If a negative value is returned, it'll just be interpreted as a negative count, and added to previous calculations. Document the rules, but handle the negative values gracefully with an error message. 
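For anyone unfamiliar with the warning, kernel-doc requires one @-line per
parameter; a generic example of the expected shape (all names made up):

  /**
   * frob_widget() - one-line summary of what the function does
   * @widget: every parameter needs a matching @name description
   * @flags: otherwise scripts/kernel-doc emits warnings like the one above
   *
   * Return: 0 on success, a negative errno on failure.
   */
  int frob_widget(struct widget *widget, unsigned long flags);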
Cc: stable@vger.kernel.org Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/50208c866facc33226a3c77b82bb96aeef8ef310.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/drm_probe_helper.c | 7 +++++++ include/drm/drm_modeset_helper_vtables.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_probe_helper.c b/drivers/gpu/drm/drm_probe_helper.c index 19ecb749704b..75f84753f6ee 100644 --- a/drivers/gpu/drm/drm_probe_helper.c +++ b/drivers/gpu/drm/drm_probe_helper.c @@ -422,6 +422,13 @@ static int drm_helper_probe_get_modes(struct drm_connector *connector) count = connector_funcs->get_modes(connector); + /* The .get_modes() callback should not return negative values. */ + if (count < 0) { + drm_err(connector->dev, ".get_modes() returned %pe\n", + ERR_PTR(count)); + count = 0; + } + /* * Fallback for when DDC probe failed in drm_get_edid() and thus skipped * override/firmware EDID. diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h index 881b03e4dc28..9ed42469540e 100644 --- a/include/drm/drm_modeset_helper_vtables.h +++ b/include/drm/drm_modeset_helper_vtables.h @@ -898,7 +898,8 @@ struct drm_connector_helper_funcs { * * RETURNS: * - * The number of modes added by calling drm_mode_probed_add(). + * The number of modes added by calling drm_mode_probed_add(). Return 0 + * on failures (no modes) instead of negative error codes. */ int (*get_modes)(struct drm_connector *connector); From fc4e97726530241d96dd7db72eb65979217422c9 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:40 +0200 Subject: [PATCH 178/496] drm/panel: do not return negative error codes from drm_panel_get_modes() None of the callers of drm_panel_get_modes() expect it to return negative error codes. Either they propagate the return value in their struct drm_connector_helper_funcs .get_modes() hook (which is also not supposed to return negative codes), or add it to other counts leading to bogus values. On the other hand, many of the struct drm_panel_funcs .get_modes() hooks do return negative error codes, so handle them gracefully instead of propagating further. Return 0 for no modes, whatever the reason. Cc: Neil Armstrong Cc: Jessica Zhang Cc: Sam Ravnborg Cc: stable@vger.kernel.org Reviewed-by: Neil Armstrong Reviewed-by: Jessica Zhang Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/79f559b72d8c493940417304e222a4b04dfa19c4.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/drm_panel.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/drm_panel.c b/drivers/gpu/drm/drm_panel.c index e814020bbcd3..cfbe020de54e 100644 --- a/drivers/gpu/drm/drm_panel.c +++ b/drivers/gpu/drm/drm_panel.c @@ -274,19 +274,24 @@ EXPORT_SYMBOL(drm_panel_disable); * The modes probed from the panel are automatically added to the connector * that the panel is attached to. * - * Return: The number of modes available from the panel on success or a - * negative error code on failure. + * Return: The number of modes available from the panel on success, or 0 on + * failure (no modes). 
*/ int drm_panel_get_modes(struct drm_panel *panel, struct drm_connector *connector) { if (!panel) - return -EINVAL; + return 0; - if (panel->funcs && panel->funcs->get_modes) - return panel->funcs->get_modes(panel, connector); + if (panel->funcs && panel->funcs->get_modes) { + int num; - return -EOPNOTSUPP; + num = panel->funcs->get_modes(panel, connector); + if (num > 0) + return num; + } + + return 0; } EXPORT_SYMBOL(drm_panel_get_modes); From 13d5b040363c7ec0ac29c2de9cf661a24a8aa531 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:41 +0200 Subject: [PATCH 179/496] drm/exynos: do not return negative values from .get_modes() The .get_modes() hooks aren't supposed to return negative error codes. Return 0 for no modes, whatever the reason. Cc: Inki Dae Cc: Seung-Woo Kim Cc: Kyungmin Park Cc: stable@vger.kernel.org Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/d8665f620d9c252aa7d5a4811ff6b16e773903a2.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/exynos/exynos_drm_vidi.c | 4 ++-- drivers/gpu/drm/exynos/exynos_hdmi.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c index 00382f28748a..f5bbba9ad225 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c @@ -316,14 +316,14 @@ static int vidi_get_modes(struct drm_connector *connector) */ if (!ctx->raw_edid) { DRM_DEV_DEBUG_KMS(ctx->dev, "raw_edid is null.\n"); - return -EFAULT; + return 0; } edid_len = (1 + ctx->raw_edid->extensions) * EDID_LENGTH; edid = kmemdup(ctx->raw_edid, edid_len, GFP_KERNEL); if (!edid) { DRM_DEV_DEBUG_KMS(ctx->dev, "failed to allocate edid\n"); - return -ENOMEM; + return 0; } drm_connector_update_edid_property(connector, edid); diff --git a/drivers/gpu/drm/exynos/exynos_hdmi.c b/drivers/gpu/drm/exynos/exynos_hdmi.c index 43bed6cbaaea..b1d02dec3774 100644 --- a/drivers/gpu/drm/exynos/exynos_hdmi.c +++ b/drivers/gpu/drm/exynos/exynos_hdmi.c @@ -887,11 +887,11 @@ static int hdmi_get_modes(struct drm_connector *connector) int ret; if (!hdata->ddc_adpt) - return -ENODEV; + return 0; edid = drm_get_edid(connector, hdata->ddc_adpt); if (!edid) - return -ENODEV; + return 0; hdata->dvi_mode = !connector->display_info.is_hdmi; DRM_DEV_DEBUG_KMS(hdata->dev, "%s : width[%d] x height[%d]\n", From 171b711b26cce208bb628526b1b368aeec7b6fa4 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:42 +0200 Subject: [PATCH 180/496] drm/bridge: lt8912b: do not return negative values from .get_modes() The .get_modes() hooks aren't supposed to return negative error codes. Return 0 for no modes, whatever the reason. 
Cc: Adrien Grassein Cc: stable@vger.kernel.org Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/dcdddcbcb64b6f6cdc55022ee50c10dee8ddbc3d.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/bridge/lontium-lt8912b.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/bridge/lontium-lt8912b.c b/drivers/gpu/drm/bridge/lontium-lt8912b.c index e7c4bef74aa4..4b2ae27f0a57 100644 --- a/drivers/gpu/drm/bridge/lontium-lt8912b.c +++ b/drivers/gpu/drm/bridge/lontium-lt8912b.c @@ -441,23 +441,21 @@ lt8912_connector_mode_valid(struct drm_connector *connector, static int lt8912_connector_get_modes(struct drm_connector *connector) { const struct drm_edid *drm_edid; - int ret = -1; - int num = 0; struct lt8912 *lt = connector_to_lt8912(connector); u32 bus_format = MEDIA_BUS_FMT_RGB888_1X24; + int ret, num; drm_edid = drm_bridge_edid_read(lt->hdmi_port, connector); drm_edid_connector_update(connector, drm_edid); - if (drm_edid) { - num = drm_edid_connector_add_modes(connector); - } else { - return ret; - } + if (!drm_edid) + return 0; + + num = drm_edid_connector_add_modes(connector); ret = drm_display_info_set_bus_formats(&connector->display_info, &bus_format, 1); - if (ret) - num = ret; + if (ret < 0) + num = 0; drm_edid_free(drm_edid); return num; From c2da9ada64962fcd2e6395ed9987b9874ea032d3 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:43 +0200 Subject: [PATCH 181/496] drm/imx/ipuv3: do not return negative values from .get_modes() The .get_modes() hooks aren't supposed to return negative error codes. Return 0 for no modes, whatever the reason. Cc: Philipp Zabel Cc: stable@vger.kernel.org Acked-by: Philipp Zabel Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/311f6eec96d47949b16a670529f4d89fcd97aefa.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/imx/ipuv3/parallel-display.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/imx/ipuv3/parallel-display.c b/drivers/gpu/drm/imx/ipuv3/parallel-display.c index 70349739dd89..55dedd73f528 100644 --- a/drivers/gpu/drm/imx/ipuv3/parallel-display.c +++ b/drivers/gpu/drm/imx/ipuv3/parallel-display.c @@ -72,14 +72,14 @@ static int imx_pd_connector_get_modes(struct drm_connector *connector) int ret; if (!mode) - return -EINVAL; + return 0; ret = of_get_drm_display_mode(np, &imxpd->mode, &imxpd->bus_flags, OF_USE_NATIVE_MODE); if (ret) { drm_mode_destroy(connector->dev, mode); - return ret; + return 0; } drm_mode_copy(mode, &imxpd->mode); From abf493988e380f25242c1023275c68bd3579c9ce Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:44 +0200 Subject: [PATCH 182/496] drm/vc4: hdmi: do not return negative values from .get_modes() The .get_modes() hooks aren't supposed to return negative error codes. Return 0 for no modes, whatever the reason. 
Cc: Maxime Ripard Cc: stable@vger.kernel.org Acked-by: Maxime Ripard Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/dcda6d4003e2c6192987916b35c7304732800e08.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/vc4/vc4_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/vc4/vc4_hdmi.c b/drivers/gpu/drm/vc4/vc4_hdmi.c index 34f807ed1c31..d8751ea20303 100644 --- a/drivers/gpu/drm/vc4/vc4_hdmi.c +++ b/drivers/gpu/drm/vc4/vc4_hdmi.c @@ -509,7 +509,7 @@ static int vc4_hdmi_connector_get_modes(struct drm_connector *connector) edid = drm_get_edid(connector, vc4_hdmi->ddc); cec_s_phys_addr_from_edid(vc4_hdmi->cec_adap, edid); if (!edid) - return -ENODEV; + return 0; drm_connector_update_edid_property(connector, edid); ret = drm_add_edid_modes(connector, edid); From b43a72c4f3a8b858db57a83da2b64275561c4e73 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:45 +0200 Subject: [PATCH 183/496] drm/bridge: lt9611uxc: use int for holding number of modes lt9611uxc_connector_get_modes() propagates the return value of drm_edid_connector_add_modes() but stores the int temporarily in an unsigned int. Use the correct type. Cc: Andrzej Hajda Cc: Neil Armstrong Cc: Robert Foss Reviewed-by: Dmitry Baryshkov Reviewed-by: Neil Armstrong Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/ed97f4f036263cdc4f34330cef91214970f99a77.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/bridge/lontium-lt9611uxc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/lontium-lt9611uxc.c b/drivers/gpu/drm/bridge/lontium-lt9611uxc.c index bcf8bccd86d6..f4f593ad8f79 100644 --- a/drivers/gpu/drm/bridge/lontium-lt9611uxc.c +++ b/drivers/gpu/drm/bridge/lontium-lt9611uxc.c @@ -294,8 +294,8 @@ static struct mipi_dsi_device *lt9611uxc_attach_dsi(struct lt9611uxc *lt9611uxc, static int lt9611uxc_connector_get_modes(struct drm_connector *connector) { struct lt9611uxc *lt9611uxc = connector_to_lt9611uxc(connector); - unsigned int count; const struct drm_edid *drm_edid; + int count; drm_edid = drm_bridge_edid_read(<9611uxc->bridge, connector); drm_edid_connector_update(connector, drm_edid); From 9dd81b2e1ec72a3759f8d6bb6e9cbef93aab6227 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Mar 2024 18:03:46 +0200 Subject: [PATCH 184/496] drm/exynos: simplify the return value handling in exynos_dp_get_modes() Just use 0 and 1 directly, instead of the confusing local variable that's always set to 0. 
Cc: Inki Dae Cc: Seung-Woo Kim Cc: Kyungmin Park Acked-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/64cc680f14d961cedb726a6fd5c6dfd53ca9bb85.1709913674.git.jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/exynos/exynos_dp.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_dp.c b/drivers/gpu/drm/exynos/exynos_dp.c index ca31bad6c576..f48c4343f469 100644 --- a/drivers/gpu/drm/exynos/exynos_dp.c +++ b/drivers/gpu/drm/exynos/exynos_dp.c @@ -74,16 +74,15 @@ static int exynos_dp_get_modes(struct analogix_dp_plat_data *plat_data, { struct exynos_dp_device *dp = to_dp(plat_data); struct drm_display_mode *mode; - int num_modes = 0; if (dp->plat_data.panel) - return num_modes; + return 0; mode = drm_mode_create(connector->dev); if (!mode) { DRM_DEV_ERROR(dp->dev, "failed to create a new display mode.\n"); - return num_modes; + return 0; } drm_display_mode_from_videomode(&dp->vm, mode); @@ -94,7 +93,7 @@ static int exynos_dp_get_modes(struct analogix_dp_plat_data *plat_data, drm_mode_set_name(mode); drm_mode_probed_add(connector, mode); - return num_modes + 1; + return 1; } static int exynos_dp_bridge_attach(struct analogix_dp_plat_data *plat_data, From 8248ca30ef89f9cc74ace62ae1b9a22b5f16736c Mon Sep 17 00:00:00 2001 From: Ley Foon Tan Date: Thu, 7 Mar 2024 01:23:30 +0800 Subject: [PATCH 185/496] clocksource/drivers/timer-riscv: Clear timer interrupt on timer initialization In the RISC-V specification, the stimecmp register doesn't have a default value. To prevent the timer interrupt from being triggered during timer initialization, clear the timer interrupt by writing stimecmp with a maximum value. Fixes: 9f7a8ff6391f ("RISC-V: Prefer sstc extension if available") Cc: Signed-off-by: Ley Foon Tan Reviewed-by: Samuel Holland Tested-by: Samuel Holland Reviewed-by: Atish Patra Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20240306172330.255844-1-leyfoon.tan@starfivetech.com --- drivers/clocksource/timer-riscv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/clocksource/timer-riscv.c b/drivers/clocksource/timer-riscv.c index e66dcbd66566..79bb9a98baa7 100644 --- a/drivers/clocksource/timer-riscv.c +++ b/drivers/clocksource/timer-riscv.c @@ -108,6 +108,9 @@ static int riscv_timer_starting_cpu(unsigned int cpu) { struct clock_event_device *ce = per_cpu_ptr(&riscv_clock_event, cpu); + /* Clear timer interrupt */ + riscv_clock_event_stop(); + ce->cpumask = cpumask_of(cpu); ce->irq = riscv_clock_event_irq; if (riscv_timer_cannot_wake_cpu) From 5a83e7313ee115d955d4b7834d33ff4d5a46ab37 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 8 Mar 2024 10:25:55 -0800 Subject: [PATCH 186/496] riscv: lib: Introduce has_fast_unaligned_access() Create has_fast_unaligned_access to avoid needing to explicitly check the fast_misaligned_access_speed_key static key. 
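Condensed from the csum.c hunk below, a call site changes roughly like
this (sketch only; the do_csum_*() helpers and OFFSET_MASK are the
symbols already used in csum.c):

	/* before */
	if (static_branch_likely(&fast_misaligned_access_speed_key))
		return do_csum_no_alignment(buff, len);
	if (((unsigned long)buff & OFFSET_MASK) == 0)
		return do_csum_no_alignment(buff, len);
	return do_csum_with_alignment(buff, len);

	/* after */
	if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
		return do_csum_no_alignment(buff, len);
	return do_csum_with_alignment(buff, len);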
Signed-off-by: Charlie Jenkins Reviewed-by: Evan Green Reviewed-by: Conor Dooley Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-1-a388770ba0ce@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 11 ++++++++--- arch/riscv/kernel/cpufeature.c | 6 +++--- arch/riscv/lib/csum.c | 7 ++----- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 5a626ed2c47a..466e1f591919 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright 2022-2023 Rivos, Inc + * Copyright 2022-2024 Rivos, Inc */ #ifndef _ASM_CPUFEATURE_H @@ -53,6 +53,13 @@ static inline bool check_unaligned_access_emulated(int cpu) static inline void unaligned_emulation_finish(void) {} #endif +DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); + +static __always_inline bool has_fast_unaligned_accesses(void) +{ + return static_branch_likely(&fast_unaligned_access_speed_key); +} + unsigned long riscv_get_elf_hwcap(void); struct riscv_isa_ext_data { @@ -135,6 +142,4 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi return __riscv_isa_extension_available(hart_isa[cpu].isa, ext); } -DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key); - #endif diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 89920f84d0a3..7878cddccc0d 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -810,14 +810,14 @@ static void check_unaligned_access_nonboot_cpu(void *param) check_unaligned_access(pages[cpu]); } -DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key); +DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); static void modify_unaligned_access_branches(cpumask_t *mask, int weight) { if (cpumask_weight(mask) == weight) - static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key); + static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); else - static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key); + static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); } static void set_unaligned_access_static_branches_except_cpu(int cpu) diff --git a/arch/riscv/lib/csum.c b/arch/riscv/lib/csum.c index af3df5274ccb..7178e0acfa22 100644 --- a/arch/riscv/lib/csum.c +++ b/arch/riscv/lib/csum.c @@ -3,7 +3,7 @@ * Checksum library * * Influenced by arch/arm64/lib/csum.c - * Copyright (C) 2023 Rivos Inc. + * Copyright (C) 2023-2024 Rivos Inc. */ #include #include @@ -318,10 +318,7 @@ unsigned int do_csum(const unsigned char *buff, int len) * branches. The largest chunk of overlap was delegated into the * do_csum_common function. */ - if (static_branch_likely(&fast_misaligned_access_speed_key)) - return do_csum_no_alignment(buff, len); - - if (((unsigned long)buff & OFFSET_MASK) == 0) + if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0)) return do_csum_no_alignment(buff, len); return do_csum_with_alignment(buff, len); From 313130c62cf1fc410ac8730b291fd4fde582d032 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 8 Mar 2024 10:25:56 -0800 Subject: [PATCH 187/496] riscv: Only check online cpus for emulated accesses The unaligned access checker only sets valid values for online cpus. Check for these values on online cpus rather than on present cpus. 
Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Fixes: 71c54b3d169d ("riscv: report misaligned accesses emulation to hwprobe") Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-2-a388770ba0ce@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 8ded225e8c5b..c2ed4e689bf9 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -632,7 +632,7 @@ void unaligned_emulation_finish(void) * accesses emulated since tasks requesting such control can run on any * CPU. */ - for_each_present_cpu(cpu) { + for_each_online_cpu(cpu) { if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_EMULATED) { return; From 6e5ce7f2eae3c7c36dd1709efaac34820a34d538 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 8 Mar 2024 10:25:57 -0800 Subject: [PATCH 188/496] riscv: Decouple emulated unaligned accesses from access speed Detecting if a system traps into the kernel on an unaligned access can be performed separately from checking the speed of unaligned accesses. This decoupling will make it possible to selectively enable or disable each of these checks. Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-3-a388770ba0ce@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 2 +- arch/riscv/kernel/cpufeature.c | 25 +++++++++++++++++++++---- arch/riscv/kernel/traps_misaligned.c | 15 +++++++-------- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 466e1f591919..6fec91845aa0 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -37,7 +37,7 @@ void riscv_user_isa_enable(void); #ifdef CONFIG_RISCV_MISALIGNED bool unaligned_ctl_available(void); -bool check_unaligned_access_emulated(int cpu); +bool check_unaligned_access_emulated_all_cpus(void); void unaligned_emulation_finish(void); #else static inline bool unaligned_ctl_available(void) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 7878cddccc0d..abb3a2f53106 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -719,7 +719,8 @@ static int check_unaligned_access(void *param) void *src; long speed = RISCV_HWPROBE_MISALIGNED_SLOW; - if (check_unaligned_access_emulated(cpu)) + if (IS_ENABLED(CONFIG_RISCV_MISALIGNED) && + per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) return 0; /* Make an unaligned destination buffer. */ @@ -896,8 +897,8 @@ static int riscv_offline_cpu(unsigned int cpu) return 0; } -/* Measure unaligned access on all CPUs present at boot in parallel. */ -static int check_unaligned_access_all_cpus(void) +/* Measure unaligned access speed on all CPUs present at boot in parallel. 
*/ +static int check_unaligned_access_speed_all_cpus(void) { unsigned int cpu; unsigned int cpu_count = num_possible_cpus(); @@ -935,7 +936,6 @@ static int check_unaligned_access_all_cpus(void) riscv_online_cpu, riscv_offline_cpu); out: - unaligned_emulation_finish(); for_each_cpu(cpu, cpu_online_mask) { if (bufs[cpu]) __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); @@ -945,6 +945,23 @@ out: return 0; } +#ifdef CONFIG_RISCV_MISALIGNED +static int check_unaligned_access_all_cpus(void) +{ + bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); + + if (!all_cpus_emulated) + return check_unaligned_access_speed_all_cpus(); + + return 0; +} +#else +static int check_unaligned_access_all_cpus(void) +{ + return check_unaligned_access_speed_all_cpus(); +} +#endif + arch_initcall(check_unaligned_access_all_cpus); void riscv_user_isa_enable(void) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index c2ed4e689bf9..e55718179f42 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -596,7 +596,7 @@ int handle_misaligned_store(struct pt_regs *regs) return 0; } -bool check_unaligned_access_emulated(int cpu) +static bool check_unaligned_access_emulated(int cpu) { long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); unsigned long tmp_var, tmp_val; @@ -623,7 +623,7 @@ bool check_unaligned_access_emulated(int cpu) return misaligned_emu_detected; } -void unaligned_emulation_finish(void) +bool check_unaligned_access_emulated_all_cpus(void) { int cpu; @@ -632,13 +632,12 @@ void unaligned_emulation_finish(void) * accesses emulated since tasks requesting such control can run on any * CPU. */ - for_each_online_cpu(cpu) { - if (per_cpu(misaligned_access_speed, cpu) != - RISCV_HWPROBE_MISALIGNED_EMULATED) { - return; - } - } + for_each_online_cpu(cpu) + if (!check_unaligned_access_emulated(cpu)) + return false; + unaligned_ctl = true; + return true; } bool unaligned_ctl_available(void) From f413aae96cda059635910c462ede0a8f0385897c Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Fri, 8 Mar 2024 10:25:58 -0800 Subject: [PATCH 189/496] riscv: Set unaligned access speed at compile time Introduce Kconfig options to set the kernel unaligned access support. These options provide a non-portable alternative to the runtime unaligned access probe. To support this, the unaligned access probing code is moved into it's own file and gated behind a new RISCV_PROBE_UNALIGNED_ACCESS_SUPPORT option. Signed-off-by: Charlie Jenkins Reviewed-by: Conor Dooley Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-4-a388770ba0ce@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 58 ++++- arch/riscv/include/asm/cpufeature.h | 24 +- arch/riscv/kernel/Makefile | 4 +- arch/riscv/kernel/cpufeature.c | 272 -------------------- arch/riscv/kernel/sys_hwprobe.c | 13 + arch/riscv/kernel/traps_misaligned.c | 2 + arch/riscv/kernel/unaligned_access_speed.c | 282 +++++++++++++++++++++ 7 files changed, 359 insertions(+), 296 deletions(-) create mode 100644 arch/riscv/kernel/unaligned_access_speed.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bffbd869a068..51481bf9364e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -688,27 +688,61 @@ config THREAD_SIZE_ORDER affects irq stack size, which is equal to thread stack size. 
config RISCV_MISALIGNED - bool "Support misaligned load/store traps for kernel and userspace" + bool select SYSCTL_ARCH_UNALIGN_ALLOW - default y help - Say Y here if you want the kernel to embed support for misaligned - load/store for both kernel and userspace. When disable, misaligned - accesses will generate SIGBUS in userspace and panic in kernel. + Embed support for emulating misaligned loads and stores. + +choice + prompt "Unaligned Accesses Support" + default RISCV_PROBE_UNALIGNED_ACCESS + help + This determines the level of support for unaligned accesses. This + information is used by the kernel to perform optimizations. It is also + exposed to user space via the hwprobe syscall. The hardware will be + probed at boot by default. + +config RISCV_PROBE_UNALIGNED_ACCESS + bool "Probe for hardware unaligned access support" + select RISCV_MISALIGNED + help + During boot, the kernel will run a series of tests to determine the + speed of unaligned accesses. This probing will dynamically determine + the speed of unaligned accesses on the underlying system. If unaligned + memory accesses trap into the kernel as they are not supported by the + system, the kernel will emulate the unaligned accesses to preserve the + UABI. + +config RISCV_EMULATED_UNALIGNED_ACCESS + bool "Emulate unaligned access where system support is missing" + select RISCV_MISALIGNED + help + If unaligned memory accesses trap into the kernel as they are not + supported by the system, the kernel will emulate the unaligned + accesses to preserve the UABI. When the underlying system does support + unaligned accesses, the unaligned accesses are assumed to be slow. + +config RISCV_SLOW_UNALIGNED_ACCESS + bool "Assume the system supports slow unaligned memory accesses" + depends on NONPORTABLE + help + Assume that the system supports slow unaligned memory accesses. The + kernel and userspace programs may not be able to run at all on systems + that do not support unaligned memory accesses. config RISCV_EFFICIENT_UNALIGNED_ACCESS - bool "Assume the CPU supports fast unaligned memory accesses" + bool "Assume the system supports fast unaligned memory accesses" depends on NONPORTABLE select DCACHE_WORD_ACCESS if MMU select HAVE_EFFICIENT_UNALIGNED_ACCESS help - Say Y here if you want the kernel to assume that the CPU supports - efficient unaligned memory accesses. When enabled, this option - improves the performance of the kernel on such CPUs. However, the - kernel will run much more slowly, or will not be able to run at all, - on CPUs that do not support efficient unaligned memory accesses. + Assume that the system supports fast unaligned memory accesses. When + enabled, this option improves the performance of the kernel on such + systems. However, the kernel and userspace programs will run much more + slowly, or will not be able to run at all, on systems that do not + support efficient unaligned memory accesses. - If unsure what to do here, say N. +endchoice endmenu # "Platform type" diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 6fec91845aa0..46061f5e9764 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -28,37 +28,39 @@ struct riscv_isainfo { DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo); -DECLARE_PER_CPU(long, misaligned_access_speed); - /* Per-cpu ISA extensions. 
*/ extern struct riscv_isainfo hart_isa[NR_CPUS]; void riscv_user_isa_enable(void); -#ifdef CONFIG_RISCV_MISALIGNED -bool unaligned_ctl_available(void); +#if defined(CONFIG_RISCV_MISALIGNED) bool check_unaligned_access_emulated_all_cpus(void); void unaligned_emulation_finish(void); +bool unaligned_ctl_available(void); +DECLARE_PER_CPU(long, misaligned_access_speed); #else static inline bool unaligned_ctl_available(void) { return false; } - -static inline bool check_unaligned_access_emulated(int cpu) -{ - return false; -} - -static inline void unaligned_emulation_finish(void) {} #endif +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); static __always_inline bool has_fast_unaligned_accesses(void) { return static_branch_likely(&fast_unaligned_access_speed_key); } +#else +static __always_inline bool has_fast_unaligned_accesses(void) +{ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + return true; + else + return false; +} +#endif unsigned long riscv_get_elf_hwcap(void); diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f71910718053..c8085126a6f9 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -38,7 +38,6 @@ extra-y += vmlinux.lds obj-y += head.o obj-y += soc.o obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o -obj-y += copy-unaligned.o obj-y += cpu.o obj-y += cpufeature.o obj-y += entry.o @@ -62,6 +61,9 @@ obj-y += tests/ obj-$(CONFIG_MMU) += vdso.o vdso/ obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o +obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o +obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o + obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_RISCV_ISA_V) += vector.o obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index abb3a2f53106..319670af5704 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -21,20 +20,12 @@ #include #include #include -#include #include #include #include -#include "copy-unaligned.h" - #define NUM_ALPHA_EXTS ('z' - 'a' + 1) -#define MISALIGNED_ACCESS_JIFFIES_LG2 1 -#define MISALIGNED_BUFFER_SIZE 0x4000 -#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) -#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) - unsigned long elf_hwcap __read_mostly; /* Host ISA bitmap */ @@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly; /* Per-cpu ISA extensions. */ struct riscv_isainfo hart_isa[NR_CPUS]; -/* Performance information */ -DEFINE_PER_CPU(long, misaligned_access_speed); - -static cpumask_t fast_misaligned_access; - /** * riscv_isa_extension_base() - Get base extension word * @@ -706,264 +692,6 @@ unsigned long riscv_get_elf_hwcap(void) return hwcap; } -static int check_unaligned_access(void *param) -{ - int cpu = smp_processor_id(); - u64 start_cycles, end_cycles; - u64 word_cycles; - u64 byte_cycles; - int ratio; - unsigned long start_jiffies, now; - struct page *page = param; - void *dst; - void *src; - long speed = RISCV_HWPROBE_MISALIGNED_SLOW; - - if (IS_ENABLED(CONFIG_RISCV_MISALIGNED) && - per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) - return 0; - - /* Make an unaligned destination buffer. */ - dst = (void *)((unsigned long)page_address(page) | 0x1); - /* Unalign src as well, but differently (off by 1 + 2 = 3). 
*/ - src = dst + (MISALIGNED_BUFFER_SIZE / 2); - src += 2; - word_cycles = -1ULL; - /* Do a warmup. */ - __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); - preempt_disable(); - start_jiffies = jiffies; - while ((now = jiffies) == start_jiffies) - cpu_relax(); - - /* - * For a fixed amount of time, repeatedly try the function, and take - * the best time in cycles as the measurement. - */ - while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { - start_cycles = get_cycles64(); - /* Ensure the CSR read can't reorder WRT to the copy. */ - mb(); - __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); - /* Ensure the copy ends before the end time is snapped. */ - mb(); - end_cycles = get_cycles64(); - if ((end_cycles - start_cycles) < word_cycles) - word_cycles = end_cycles - start_cycles; - } - - byte_cycles = -1ULL; - __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); - start_jiffies = jiffies; - while ((now = jiffies) == start_jiffies) - cpu_relax(); - - while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { - start_cycles = get_cycles64(); - mb(); - __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); - mb(); - end_cycles = get_cycles64(); - if ((end_cycles - start_cycles) < byte_cycles) - byte_cycles = end_cycles - start_cycles; - } - - preempt_enable(); - - /* Don't divide by zero. */ - if (!word_cycles || !byte_cycles) { - pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", - cpu); - - return 0; - } - - if (word_cycles < byte_cycles) - speed = RISCV_HWPROBE_MISALIGNED_FAST; - - ratio = div_u64((byte_cycles * 100), word_cycles); - pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", - cpu, - ratio / 100, - ratio % 100, - (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); - - per_cpu(misaligned_access_speed, cpu) = speed; - - /* - * Set the value of fast_misaligned_access of a CPU. These operations - * are atomic to avoid race conditions. - */ - if (speed == RISCV_HWPROBE_MISALIGNED_FAST) - cpumask_set_cpu(cpu, &fast_misaligned_access); - else - cpumask_clear_cpu(cpu, &fast_misaligned_access); - - return 0; -} - -static void check_unaligned_access_nonboot_cpu(void *param) -{ - unsigned int cpu = smp_processor_id(); - struct page **pages = param; - - if (smp_processor_id() != 0) - check_unaligned_access(pages[cpu]); -} - -DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); - -static void modify_unaligned_access_branches(cpumask_t *mask, int weight) -{ - if (cpumask_weight(mask) == weight) - static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); - else - static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); -} - -static void set_unaligned_access_static_branches_except_cpu(int cpu) -{ - /* - * Same as set_unaligned_access_static_branches, except excludes the - * given CPU from the result. When a CPU is hotplugged into an offline - * state, this function is called before the CPU is set to offline in - * the cpumask, and thus the CPU needs to be explicitly excluded. 
- */ - - cpumask_t fast_except_me; - - cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); - cpumask_clear_cpu(cpu, &fast_except_me); - - modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); -} - -static void set_unaligned_access_static_branches(void) -{ - /* - * This will be called after check_unaligned_access_all_cpus so the - * result of unaligned access speed for all CPUs will be available. - * - * To avoid the number of online cpus changing between reading - * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be - * held before calling this function. - */ - - cpumask_t fast_and_online; - - cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); - - modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); -} - -static int lock_and_set_unaligned_access_static_branch(void) -{ - cpus_read_lock(); - set_unaligned_access_static_branches(); - cpus_read_unlock(); - - return 0; -} - -arch_initcall_sync(lock_and_set_unaligned_access_static_branch); - -static int riscv_online_cpu(unsigned int cpu) -{ - static struct page *buf; - - /* We are already set since the last check */ - if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) - goto exit; - - buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); - if (!buf) { - pr_warn("Allocation failure, not measuring misaligned performance\n"); - return -ENOMEM; - } - - check_unaligned_access(buf); - __free_pages(buf, MISALIGNED_BUFFER_ORDER); - -exit: - set_unaligned_access_static_branches(); - - return 0; -} - -static int riscv_offline_cpu(unsigned int cpu) -{ - set_unaligned_access_static_branches_except_cpu(cpu); - - return 0; -} - -/* Measure unaligned access speed on all CPUs present at boot in parallel. */ -static int check_unaligned_access_speed_all_cpus(void) -{ - unsigned int cpu; - unsigned int cpu_count = num_possible_cpus(); - struct page **bufs = kzalloc(cpu_count * sizeof(struct page *), - GFP_KERNEL); - - if (!bufs) { - pr_warn("Allocation failure, not measuring misaligned performance\n"); - return 0; - } - - /* - * Allocate separate buffers for each CPU so there's no fighting over - * cache lines. - */ - for_each_cpu(cpu, cpu_online_mask) { - bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); - if (!bufs[cpu]) { - pr_warn("Allocation failure, not measuring misaligned performance\n"); - goto out; - } - } - - /* Check everybody except 0, who stays behind to tend jiffies. */ - on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1); - - /* Check core 0. */ - smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); - - /* - * Setup hotplug callbacks for any new CPUs that come online or go - * offline. 
- */ - cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", - riscv_online_cpu, riscv_offline_cpu); - -out: - for_each_cpu(cpu, cpu_online_mask) { - if (bufs[cpu]) - __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); - } - - kfree(bufs); - return 0; -} - -#ifdef CONFIG_RISCV_MISALIGNED -static int check_unaligned_access_all_cpus(void) -{ - bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); - - if (!all_cpus_emulated) - return check_unaligned_access_speed_all_cpus(); - - return 0; -} -#else -static int check_unaligned_access_all_cpus(void) -{ - return check_unaligned_access_speed_all_cpus(); -} -#endif - -arch_initcall(check_unaligned_access_all_cpus); - void riscv_user_isa_enable(void) { if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ)) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index a7c56b41efd2..8cae41a502dd 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -147,6 +147,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext) return (pair.value & ext); } +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) static u64 hwprobe_misaligned(const struct cpumask *cpus) { int cpu; @@ -169,6 +170,18 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus) return perf; } +#else +static u64 hwprobe_misaligned(const struct cpumask *cpus) +{ + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS)) + return RISCV_HWPROBE_MISALIGNED_FAST; + + if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available()) + return RISCV_HWPROBE_MISALIGNED_EMULATED; + + return RISCV_HWPROBE_MISALIGNED_SLOW; +} +#endif static void hwprobe_one_pair(struct riscv_hwprobe *pair, const struct cpumask *cpus) diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index e55718179f42..2adb7c3e4dd5 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -413,7 +413,9 @@ int handle_misaligned_load(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED; +#endif if (!unaligned_enabled) return -1; diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c new file mode 100644 index 000000000000..52264ea4f0bd --- /dev/null +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Rivos Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "copy-unaligned.h" + +#define MISALIGNED_ACCESS_JIFFIES_LG2 1 +#define MISALIGNED_BUFFER_SIZE 0x4000 +#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE) +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80) + +DEFINE_PER_CPU(long, misaligned_access_speed); + +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS +static cpumask_t fast_misaligned_access; +static int check_unaligned_access(void *param) +{ + int cpu = smp_processor_id(); + u64 start_cycles, end_cycles; + u64 word_cycles; + u64 byte_cycles; + int ratio; + unsigned long start_jiffies, now; + struct page *page = param; + void *dst; + void *src; + long speed = RISCV_HWPROBE_MISALIGNED_SLOW; + + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + return 0; + + /* Make an unaligned destination buffer. 
*/ + dst = (void *)((unsigned long)page_address(page) | 0x1); + /* Unalign src as well, but differently (off by 1 + 2 = 3). */ + src = dst + (MISALIGNED_BUFFER_SIZE / 2); + src += 2; + word_cycles = -1ULL; + /* Do a warmup. */ + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + preempt_disable(); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + /* + * For a fixed amount of time, repeatedly try the function, and take + * the best time in cycles as the measurement. + */ + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + /* Ensure the CSR read can't reorder WRT to the copy. */ + mb(); + __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE); + /* Ensure the copy ends before the end time is snapped. */ + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < word_cycles) + word_cycles = end_cycles - start_cycles; + } + + byte_cycles = -1ULL; + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + start_jiffies = jiffies; + while ((now = jiffies) == start_jiffies) + cpu_relax(); + + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) { + start_cycles = get_cycles64(); + mb(); + __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE); + mb(); + end_cycles = get_cycles64(); + if ((end_cycles - start_cycles) < byte_cycles) + byte_cycles = end_cycles - start_cycles; + } + + preempt_enable(); + + /* Don't divide by zero. */ + if (!word_cycles || !byte_cycles) { + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n", + cpu); + + return 0; + } + + if (word_cycles < byte_cycles) + speed = RISCV_HWPROBE_MISALIGNED_FAST; + + ratio = div_u64((byte_cycles * 100), word_cycles); + pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n", + cpu, + ratio / 100, + ratio % 100, + (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow"); + + per_cpu(misaligned_access_speed, cpu) = speed; + + /* + * Set the value of fast_misaligned_access of a CPU. These operations + * are atomic to avoid race conditions. + */ + if (speed == RISCV_HWPROBE_MISALIGNED_FAST) + cpumask_set_cpu(cpu, &fast_misaligned_access); + else + cpumask_clear_cpu(cpu, &fast_misaligned_access); + + return 0; +} + +static void check_unaligned_access_nonboot_cpu(void *param) +{ + unsigned int cpu = smp_processor_id(); + struct page **pages = param; + + if (smp_processor_id() != 0) + check_unaligned_access(pages[cpu]); +} + +DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key); + +static void modify_unaligned_access_branches(cpumask_t *mask, int weight) +{ + if (cpumask_weight(mask) == weight) + static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key); + else + static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key); +} + +static void set_unaligned_access_static_branches_except_cpu(int cpu) +{ + /* + * Same as set_unaligned_access_static_branches, except excludes the + * given CPU from the result. When a CPU is hotplugged into an offline + * state, this function is called before the CPU is set to offline in + * the cpumask, and thus the CPU needs to be explicitly excluded. 
+ */ + + cpumask_t fast_except_me; + + cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask); + cpumask_clear_cpu(cpu, &fast_except_me); + + modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1); +} + +static void set_unaligned_access_static_branches(void) +{ + /* + * This will be called after check_unaligned_access_all_cpus so the + * result of unaligned access speed for all CPUs will be available. + * + * To avoid the number of online cpus changing between reading + * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be + * held before calling this function. + */ + + cpumask_t fast_and_online; + + cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask); + + modify_unaligned_access_branches(&fast_and_online, num_online_cpus()); +} + +static int lock_and_set_unaligned_access_static_branch(void) +{ + cpus_read_lock(); + set_unaligned_access_static_branches(); + cpus_read_unlock(); + + return 0; +} + +arch_initcall_sync(lock_and_set_unaligned_access_static_branch); + +static int riscv_online_cpu(unsigned int cpu) +{ + static struct page *buf; + + /* We are already set since the last check */ + if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN) + goto exit; + + buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!buf) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + return -ENOMEM; + } + + check_unaligned_access(buf); + __free_pages(buf, MISALIGNED_BUFFER_ORDER); + +exit: + set_unaligned_access_static_branches(); + + return 0; +} + +static int riscv_offline_cpu(unsigned int cpu) +{ + set_unaligned_access_static_branches_except_cpu(cpu); + + return 0; +} + +/* Measure unaligned access speed on all CPUs present at boot in parallel. */ +static int check_unaligned_access_speed_all_cpus(void) +{ + unsigned int cpu; + unsigned int cpu_count = num_possible_cpus(); + struct page **bufs = kzalloc(cpu_count * sizeof(struct page *), + GFP_KERNEL); + + if (!bufs) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + return 0; + } + + /* + * Allocate separate buffers for each CPU so there's no fighting over + * cache lines. + */ + for_each_cpu(cpu, cpu_online_mask) { + bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); + if (!bufs[cpu]) { + pr_warn("Allocation failure, not measuring misaligned performance\n"); + goto out; + } + } + + /* Check everybody except 0, who stays behind to tend jiffies. */ + on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1); + + /* Check core 0. */ + smp_call_on_cpu(0, check_unaligned_access, bufs[0], true); + + /* + * Setup hotplug callbacks for any new CPUs that come online or go + * offline. 
+ */ + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online", + riscv_online_cpu, riscv_offline_cpu); + +out: + for_each_cpu(cpu, cpu_online_mask) { + if (bufs[cpu]) + __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER); + } + + kfree(bufs); + return 0; +} + +static int check_unaligned_access_all_cpus(void) +{ + bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus(); + + if (!all_cpus_emulated) + return check_unaligned_access_speed_all_cpus(); + + return 0; +} +#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */ +static int check_unaligned_access_all_cpus(void) +{ + check_unaligned_access_emulated_all_cpus(); + + return 0; +} +#endif + +arch_initcall(check_unaligned_access_all_cpus); From 04867a7a33324c9c562ee7949dbcaab7aaad1fb4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Mar 2024 15:28:24 +0000 Subject: [PATCH 190/496] swiotlb: Fix double-allocation of slots due to broken alignment handling Commit bbb73a103fbb ("swiotlb: fix a braino in the alignment check fix"), which was a fix for commit 0eee5ae10256 ("swiotlb: fix slot alignment checks"), causes a functional regression with vsock in a virtual machine using bouncing via a restricted DMA SWIOTLB pool. When virtio allocates the virtqueues for the vsock device using dma_alloc_coherent(), the SWIOTLB search can return page-unaligned allocations if 'area->index' was left unaligned by a previous allocation from the buffer: # Final address in brackets is the SWIOTLB address returned to the caller | virtio-pci 0000:00:07.0: orig_addr 0x0 alloc_size 0x2000, iotlb_align_mask 0x800 stride 0x2: got slot 1645-1649/7168 (0x98326800) | virtio-pci 0000:00:07.0: orig_addr 0x0 alloc_size 0x2000, iotlb_align_mask 0x800 stride 0x2: got slot 1649-1653/7168 (0x98328800) | virtio-pci 0000:00:07.0: orig_addr 0x0 alloc_size 0x2000, iotlb_align_mask 0x800 stride 0x2: got slot 1653-1657/7168 (0x9832a800) This ends badly (typically buffer corruption and/or a hang) because swiotlb_alloc() is expecting a page-aligned allocation and so blindly returns a pointer to the 'struct page' corresponding to the allocation, therefore double-allocating the first half (2KiB slot) of the 4KiB page. Fix the problem by treating the allocation alignment separately to any additional alignment requirements from the device, using the maximum of the two as the stride to search the buffer slots and taking care to ensure a minimum of page-alignment for buffers larger than a page. This also resolves swiotlb allocation failures occuring due to the inclusion of ~PAGE_MASK in 'iotlb_align_mask' for large allocations and resulting in alignment requirements exceeding swiotlb_max_mapping_size(). 
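As a back-of-the-envelope illustration of the new stride selection
(values assumed: 2 KiB swiotlb slots, 4 KiB pages, the 0x800
min_align_mask from the log above, and a caller requesting page
alignment via alloc_align_mask):

	stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask));

	/*
	 * alloc_align_mask = 0xfff, iotlb_align_mask = 0x800 & ~0x7ff = 0x800,
	 * so max() = 0xfff and stride = (0xfff >> IO_TLB_SHIFT) + 1 = 2 slots.
	 * The search therefore only lands on page-aligned slot pairs, and the
	 * second 2 KiB half of a page can no longer be handed out as a
	 * separate allocation.
	 */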
Fixes: bbb73a103fbb ("swiotlb: fix a braino in the alignment check fix") Fixes: 0eee5ae10256 ("swiotlb: fix slot alignment checks") Signed-off-by: Will Deacon Reviewed-by: Michael Kelley Reviewed-by: Petr Tesarik Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 77974cea3e69..980a7ec70418 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1004,7 +1004,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = - dma_get_min_align_mask(dev) | alloc_align_mask; + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, slots_checked, count = 0, i; @@ -1015,19 +1015,18 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); + /* + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. + */ + stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask)); + /* * For allocations of PAGE_SIZE or larger only look for page aligned * allocations. */ if (alloc_size >= PAGE_SIZE) - iotlb_align_mask |= ~PAGE_MASK; - iotlb_align_mask &= ~(IO_TLB_SIZE - 1); - - /* - * For mappings with an alignment requirement don't bother looping to - * unaligned slots once we found an aligned one. - */ - stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; + stride = umax(stride, PAGE_SHIFT - IO_TLB_SHIFT + 1); spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) @@ -1037,11 +1036,14 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool index = area->index; for (slots_checked = 0; slots_checked < pool->area_nslabs; ) { - slot_index = slot_base + index; + phys_addr_t tlb_addr; - if (orig_addr && - (slot_addr(tbl_dma_addr, slot_index) & - iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { + slot_index = slot_base + index; + tlb_addr = slot_addr(tbl_dma_addr, slot_index); + + if ((tlb_addr & alloc_align_mask) || + (orig_addr && (tlb_addr & iotlb_align_mask) != + (orig_addr & iotlb_align_mask))) { index = wrap_area_index(pool, index + 1); slots_checked++; continue; From 823353b7cf0ea9dfb09f5181d5fb2825d727200b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Mar 2024 15:28:25 +0000 Subject: [PATCH 191/496] swiotlb: Enforce page alignment in swiotlb_alloc() When allocating pages from a restricted DMA pool in swiotlb_alloc(), the buffer address is blindly converted to a 'struct page *' that is returned to the caller. In the unlikely event of an allocation bug, page-unaligned addresses are not detected and slots can silently be double-allocated. Add a simple check of the buffer alignment in swiotlb_alloc() to make debugging a little easier if something has gone wonky. 
Signed-off-by: Will Deacon Reviewed-by: Michael Kelley Reviewed-by: Petr Tesarik Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 980a7ec70418..88114433f1e6 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1689,6 +1689,12 @@ struct page *swiotlb_alloc(struct device *dev, size_t size) return NULL; tlb_addr = slot_addr(pool->start, index); + if (unlikely(!PAGE_ALIGNED(tlb_addr))) { + dev_WARN_ONCE(dev, 1, "Cannot allocate pages from non page-aligned swiotlb addr 0x%pa.\n", + &tlb_addr); + swiotlb_release_slots(dev, tlb_addr); + return NULL; + } return pfn_to_page(PFN_DOWN(tlb_addr)); } From cbf53074a528191df82b4dba1e3d21191102255e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Mar 2024 15:28:26 +0000 Subject: [PATCH 192/496] swiotlb: Honour dma_alloc_coherent() alignment in swiotlb_alloc() core-api/dma-api-howto.rst states the following properties of dma_alloc_coherent(): | The CPU virtual address and the DMA address are both guaranteed to | be aligned to the smallest PAGE_SIZE order which is greater than or | equal to the requested size. However, swiotlb_alloc() passes zero for the 'alloc_align_mask' parameter of swiotlb_find_slots() and so this property is not upheld. Instead, allocations larger than a page are aligned to PAGE_SIZE, Calculate the mask corresponding to the page order suitable for holding the allocation and pass that to swiotlb_find_slots(). Fixes: e81e99bacc9f ("swiotlb: Support aligned swiotlb buffers") Signed-off-by: Will Deacon Reviewed-by: Michael Kelley Reviewed-by: Petr Tesarik Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 88114433f1e6..a3645a9ae68e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1679,12 +1679,14 @@ struct page *swiotlb_alloc(struct device *dev, size_t size) struct io_tlb_mem *mem = dev->dma_io_tlb_mem; struct io_tlb_pool *pool; phys_addr_t tlb_addr; + unsigned int align; int index; if (!mem) return NULL; - index = swiotlb_find_slots(dev, 0, size, 0, &pool); + align = (1 << (get_order(size) + PAGE_SHIFT)) - 1; + index = swiotlb_find_slots(dev, 0, size, align, &pool); if (index == -1) return NULL; From 51b30ecb73b481d5fac6ccf2ecb4a309c9ee3310 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Mar 2024 15:28:27 +0000 Subject: [PATCH 193/496] swiotlb: Fix alignment checks when both allocation and DMA masks are present Nicolin reports that swiotlb buffer allocations fail for an NVME device behind an IOMMU using 64KiB pages. This is because we end up with a minimum allocation alignment of 64KiB (for the IOMMU to map the buffer safely) but a minimum DMA alignment mask corresponding to a 4KiB NVME page (i.e. preserving the 4KiB page offset from the original allocation). If the original address is not 4KiB-aligned, the allocation will fail because swiotlb_search_pool_area() erroneously compares these unmasked bits with the 64KiB-aligned candidate allocation. Tweak swiotlb_search_pool_area() so that the DMA alignment mask is reduced based on the required alignment of the allocation. 
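For a concrete feel of the masks involved (numbers assumed from the
report: a 64 KiB IOMMU granule, so alloc_align_mask = 0xffff, and a
4 KiB NVME page, so dma_get_min_align_mask() = 0xfff), the tweak boils
down to:

	alloc_align_mask |= (IO_TLB_SIZE - 1);	/* 0xffff stays 0xffff */
	iotlb_align_mask &= ~alloc_align_mask;	/* 0xfff & ~0xffff = 0 */

With iotlb_align_mask reduced to zero, the NVMe page-offset bits of
orig_addr (which are honoured by offsetting into the 64 KiB-aligned
allocation anyway) no longer make every candidate slot fail the
comparison.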
Fixes: 82612d66d51d ("iommu: Allow the dma-iommu api to use bounce buffers") Link: https://lore.kernel.org/r/cover.1707851466.git.nicolinc@nvidia.com Reported-by: Nicolin Chen Signed-off-by: Will Deacon Reviewed-by: Michael Kelley Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a3645a9ae68e..f212943e51ca 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1003,8 +1003,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, pool->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); - unsigned int iotlb_align_mask = - dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int iotlb_align_mask = dma_get_min_align_mask(dev); unsigned int nslots = nr_slots(alloc_size), stride; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, slots_checked, count = 0, i; @@ -1015,6 +1014,14 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); + /* + * Ensure that the allocation is at least slot-aligned and update + * 'iotlb_align_mask' to ignore bits that will be preserved when + * offsetting into the allocation. + */ + alloc_align_mask |= (IO_TLB_SIZE - 1); + iotlb_align_mask &= ~alloc_align_mask; + /* * For mappings with an alignment requirement don't bother looping to * unaligned slots once we found an aligned one. From afc5aa46ed560f01ceda897c053c6a40c77ce5c4 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 8 Mar 2024 15:28:28 +0000 Subject: [PATCH 194/496] iommu/dma: Force swiotlb_max_mapping_size on an untrusted device The swiotlb does not support a mapping size > swiotlb_max_mapping_size(). On the other hand, with a 64KB PAGE_SIZE configuration, it's observed that an NVME device can map a size between 300KB~512KB, which certainly failed the swiotlb mappings, though the default pool of swiotlb has many slots: systemd[1]: Started Journal Service. 
=> nvme 0000:00:01.0: swiotlb buffer is full (sz: 327680 bytes), total 32768 (slots), used 32 (slots) note: journal-offline[392] exited with irqs disabled note: journal-offline[392] exited with preempt_count 1 Call trace: [ 3.099918] swiotlb_tbl_map_single+0x214/0x240 [ 3.099921] iommu_dma_map_page+0x218/0x328 [ 3.099928] dma_map_page_attrs+0x2e8/0x3a0 [ 3.101985] nvme_prep_rq.part.0+0x408/0x878 [nvme] [ 3.102308] nvme_queue_rqs+0xc0/0x300 [nvme] [ 3.102313] blk_mq_flush_plug_list.part.0+0x57c/0x600 [ 3.102321] blk_add_rq_to_plug+0x180/0x2a0 [ 3.102323] blk_mq_submit_bio+0x4c8/0x6b8 [ 3.103463] __submit_bio+0x44/0x220 [ 3.103468] submit_bio_noacct_nocheck+0x2b8/0x360 [ 3.103470] submit_bio_noacct+0x180/0x6c8 [ 3.103471] submit_bio+0x34/0x130 [ 3.103473] ext4_bio_write_folio+0x5a4/0x8c8 [ 3.104766] mpage_submit_folio+0xa0/0x100 [ 3.104769] mpage_map_and_submit_buffers+0x1a4/0x400 [ 3.104771] ext4_do_writepages+0x6a0/0xd78 [ 3.105615] ext4_writepages+0x80/0x118 [ 3.105616] do_writepages+0x90/0x1e8 [ 3.105619] filemap_fdatawrite_wbc+0x94/0xe0 [ 3.105622] __filemap_fdatawrite_range+0x68/0xb8 [ 3.106656] file_write_and_wait_range+0x84/0x120 [ 3.106658] ext4_sync_file+0x7c/0x4c0 [ 3.106660] vfs_fsync_range+0x3c/0xa8 [ 3.106663] do_fsync+0x44/0xc0 Since untrusted devices might go down the swiotlb pathway with dma-iommu, these devices should not map a size larger than swiotlb_max_mapping_size. To fix this bug, add iommu_dma_max_mapping_size() for untrusted devices to take into account swiotlb_max_mapping_size() v.s. iova_rcache_range() from the iommu_dma_opt_mapping_size(). Fixes: 82612d66d51d ("iommu: Allow the dma-iommu api to use bounce buffers") Link: https://lore.kernel.org/r/ee51a3a5c32cf885b18f6416171802669f4a718a.1707851466.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen [will: Drop redundant is_swiotlb_active(dev) check] Signed-off-by: Will Deacon Reviewed-by: Michael Kelley Acked-by: Robin Murphy Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- drivers/iommu/dma-iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 50ccc4f1ef81..639efa0c4072 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1706,6 +1706,14 @@ static size_t iommu_dma_opt_mapping_size(void) return iova_rcache_range(); } +static size_t iommu_dma_max_mapping_size(struct device *dev) +{ + if (dev_is_untrusted(dev)) + return swiotlb_max_mapping_size(dev); + + return SIZE_MAX; +} + static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, @@ -1728,6 +1736,7 @@ static const struct dma_map_ops iommu_dma_ops = { .unmap_resource = iommu_dma_unmap_resource, .get_merge_boundary = iommu_dma_get_merge_boundary, .opt_mapping_size = iommu_dma_opt_mapping_size, + .max_mapping_size = iommu_dma_max_mapping_size, }; /* From 14cebf689a78e8a1c041138af221ef6eac6bc7da Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 8 Mar 2024 15:28:29 +0000 Subject: [PATCH 195/496] swiotlb: Reinstate page-alignment for mappings >= PAGE_SIZE For swiotlb allocations >= PAGE_SIZE, the slab search historically adjusted the stride to avoid checking unaligned slots. This had the side-effect of aligning large mapping requests to PAGE_SIZE, but that was broken by 0eee5ae10256 ("swiotlb: fix slot alignment checks"). Since this alignment could be relied upon drivers, reinstate PAGE_SIZE alignment for swiotlb mappings >= PAGE_SIZE. 
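A minimal sketch, not the kernel code, of the slot-search alignment decision that the swiotlb changes above add up to: callers that pass no explicit alignment still get page alignment for mappings of at least a page, and every allocation is at least slot aligned (the page and slot sizes are assumed here).

#include <stddef.h>

#define EX_PAGE_SIZE   4096UL
#define EX_IO_TLB_SIZE 2048UL	/* swiotlb slot size, assumed for illustration */

static unsigned long effective_align_mask(unsigned long alloc_align_mask,
					  unsigned long iotlb_align_mask,
					  size_t alloc_size)
{
	/* Historic guarantee: streaming mappings >= PAGE_SIZE with no other
	 * alignment requirement are page aligned. */
	if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= EX_PAGE_SIZE)
		alloc_align_mask = EX_PAGE_SIZE - 1;

	/* Every allocation is at least slot aligned. */
	alloc_align_mask |= EX_IO_TLB_SIZE - 1;
	return alloc_align_mask;
}

/* e.g. effective_align_mask(0, 0, 8192) == 0xfff, so an 8KiB streaming
 * mapping is once again searched for on page-aligned slots. */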
Reported-by: Michael Kelley Signed-off-by: Will Deacon Reviewed-by: Robin Murphy Reviewed-by: Petr Tesarik Tested-by: Nicolin Chen Tested-by: Michael Kelley Signed-off-by: Christoph Hellwig --- kernel/dma/swiotlb.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f212943e51ca..86fe172b5958 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1014,6 +1014,17 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool BUG_ON(!nslots); BUG_ON(area_index >= pool->nareas); + /* + * Historically, swiotlb allocations >= PAGE_SIZE were guaranteed to be + * page-aligned in the absence of any other alignment requirements. + * 'alloc_align_mask' was later introduced to specify the alignment + * explicitly, however this is passed as zero for streaming mappings + * and so we preserve the old behaviour there in case any drivers are + * relying on it. + */ + if (!alloc_align_mask && !iotlb_align_mask && alloc_size >= PAGE_SIZE) + alloc_align_mask = PAGE_SIZE - 1; + /* * Ensure that the allocation is at least slot-aligned and update * 'iotlb_align_mask' to ignore bits that will be preserved when @@ -1028,13 +1039,6 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool */ stride = get_max_slots(max(alloc_align_mask, iotlb_align_mask)); - /* - * For allocations of PAGE_SIZE or larger only look for page aligned - * allocations. - */ - if (alloc_size >= PAGE_SIZE) - stride = umax(stride, PAGE_SHIFT - IO_TLB_SHIFT + 1); - spin_lock_irqsave(&area->lock, flags); if (unlikely(nslots > pool->area_nslabs - area->used)) goto not_found; From 99f5819bee676ca70114423b0b29f43474b5fadf Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 4 Mar 2024 20:59:23 +0500 Subject: [PATCH 196/496] selftests/exec: binfmt_script: Add the overall result line according to TAP The following line is missing from the test's execution. Add it to make it fully TAP conformant: # Totals: pass:27 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Muhammad Usama Anjum Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240304155928.1818928-1-usama.anjum@collabora.com Signed-off-by: Kees Cook --- tools/testing/selftests/exec/binfmt_script.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/exec/binfmt_script.py b/tools/testing/selftests/exec/binfmt_script.py index 05f94a741c7a..2c575a2c0eab 100755 --- a/tools/testing/selftests/exec/binfmt_script.py +++ b/tools/testing/selftests/exec/binfmt_script.py @@ -16,6 +16,8 @@ SIZE=256 NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."])) test_num=0 +pass_num=0 +fail_num=0 code='''#!/usr/bin/perl print "Executed interpreter! Args:\n"; @@ -42,7 +44,7 @@ foreach my $a (@ARGV) { # ... def test(name, size, good=True, leading="", root="./", target="/perl", fill="A", arg="", newline="\n", hashbang="#!"): - global test_num, tests, NAME_MAX + global test_num, pass_num, fail_num, tests, NAME_MAX test_num += 1 if test_num > tests: raise ValueError("more binfmt_script tests than expected! 
(want %d, expected %d)" @@ -80,16 +82,20 @@ def test(name, size, good=True, leading="", root="./", target="/perl", if good: print("ok %d - binfmt_script %s (successful good exec)" % (test_num, name)) + pass_num += 1 else: print("not ok %d - binfmt_script %s succeeded when it should have failed" % (test_num, name)) + fail_num = 1 else: if good: print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)" % (test_num, name, proc.returncode)) + fail_num = 1 else: print("ok %d - binfmt_script %s (correctly failed bad exec)" % (test_num, name)) + pass_num += 1 # Clean up crazy binaries os.unlink(script) @@ -166,6 +172,8 @@ test(name="two-under-trunc-arg", size=int(SIZE/2), arg=" ") test(name="two-under-leading", size=int(SIZE/2), leading=" ") test(name="two-under-lead-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ") +print("# Totals: pass:%d fail:%d xfail:0 xpass:0 skip:0 error:0" % (pass_num, fail_num)) + if test_num != tests: raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d" % (test_num, tests)) From c4095067736b7ed50316a2bc7c9577941e87ad45 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 4 Mar 2024 20:59:24 +0500 Subject: [PATCH 197/496] selftests/exec: load_address: conform test to TAP format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Signed-off-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240304155928.1818928-2-usama.anjum@collabora.com Signed-off-by: Kees Cook --- tools/testing/selftests/exec/load_address.c | 36 +++++++++------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c index d487c2f6a615..17e3207d34ae 100644 --- a/tools/testing/selftests/exec/load_address.c +++ b/tools/testing/selftests/exec/load_address.c @@ -5,6 +5,7 @@ #include #include #include +#include "../kselftest.h" struct Statistics { unsigned long long load_address; @@ -41,28 +42,23 @@ int main(int argc, char **argv) unsigned long long misalign; int ret; - ret = dl_iterate_phdr(ExtractStatistics, &extracted); - if (ret != 1) { - fprintf(stderr, "FAILED\n"); - return 1; - } + ksft_print_header(); + ksft_set_plan(1); - if (extracted.alignment == 0) { - fprintf(stderr, "No alignment found\n"); - return 1; - } else if (extracted.alignment & (extracted.alignment - 1)) { - fprintf(stderr, "Alignment is not a power of 2\n"); - return 1; - } + ret = dl_iterate_phdr(ExtractStatistics, &extracted); + if (ret != 1) + ksft_exit_fail_msg("FAILED: dl_iterate_phdr\n"); + + if (extracted.alignment == 0) + ksft_exit_fail_msg("FAILED: No alignment found\n"); + else if (extracted.alignment & (extracted.alignment - 1)) + ksft_exit_fail_msg("FAILED: Alignment is not a power of 2\n"); misalign = extracted.load_address & (extracted.alignment - 1); - if (misalign) { - printf("alignment = %llu, load_address = %llu\n", - extracted.alignment, extracted.load_address); - fprintf(stderr, "FAILED\n"); - return 1; - } + if (misalign) + ksft_exit_fail_msg("FAILED: alignment = %llu, load_address = %llu\n", + extracted.alignment, extracted.load_address); - fprintf(stderr, "PASS\n"); - return 0; + ksft_test_result_pass("Completed\n"); + ksft_finished(); } From 1d0e51b24c8383450e631a0110e99d7cf9c4a762 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 4 Mar 2024 20:59:25 +0500 Subject: [PATCH 198/496] selftests/exec: recursion-depth: conform test to TAP 
format output Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. While at it, do minor cleanups like move the declarations of the variables on top of the function. Signed-off-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240304155928.1818928-3-usama.anjum@collabora.com Signed-off-by: Kees Cook --- .../testing/selftests/exec/recursion-depth.c | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c index 2dbd5bc45b3e..b2f37d86a5f6 100644 --- a/tools/testing/selftests/exec/recursion-depth.c +++ b/tools/testing/selftests/exec/recursion-depth.c @@ -23,45 +23,44 @@ #include #include #include +#include "../kselftest.h" int main(void) { + int fd, rv; + + ksft_print_header(); + ksft_set_plan(1); + if (unshare(CLONE_NEWNS) == -1) { if (errno == ENOSYS || errno == EPERM) { - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 4; + ksft_test_result_skip("error: unshare, errno %d\n", errno); + ksft_finished(); } - fprintf(stderr, "error: unshare, errno %d\n", errno); - return 1; - } - if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { - fprintf(stderr, "error: mount '/', errno %d\n", errno); - return 1; + ksft_exit_fail_msg("error: unshare, errno %d\n", errno); } + + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL) == -1) + ksft_exit_fail_msg("error: mount '/', errno %d\n", errno); + /* Require "exec" filesystem. */ - if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) { - fprintf(stderr, "error: mount ramfs, errno %d\n", errno); - return 1; - } + if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) + ksft_exit_fail_msg("error: mount ramfs, errno %d\n", errno); #define FILENAME "/tmp/1" - int fd = creat(FILENAME, 0700); - if (fd == -1) { - fprintf(stderr, "error: creat, errno %d\n", errno); - return 1; - } + fd = creat(FILENAME, 0700); + if (fd == -1) + ksft_exit_fail_msg("error: creat, errno %d\n", errno); + #define S "#!" FILENAME "\n" - if (write(fd, S, strlen(S)) != strlen(S)) { - fprintf(stderr, "error: write, errno %d\n", errno); - return 1; - } + if (write(fd, S, strlen(S)) != strlen(S)) + ksft_exit_fail_msg("error: write, errno %d\n", errno); + close(fd); - int rv = execve(FILENAME, NULL, NULL); - if (rv == -1 && errno == ELOOP) { - return 0; - } - fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno); - return 1; + rv = execve(FILENAME, NULL, NULL); + ksft_test_result(rv == -1 && errno == ELOOP, + "execve failed as expected (ret %d, errno %d)\n", rv, errno); + ksft_finished(); } From 2c5c0ba1179d31b0a030b45a16df6181d1bc3ea6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 13 Mar 2024 15:52:40 +0000 Subject: [PATCH 199/496] io_uring: simplify io_pages_free We never pass a null (top-level) pointer, remove the check. 
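As a sketch of the invariant behind this cleanup (simplified, illustrative names, not the real io_uring types): every caller passes the address of a field inside the context object, so only the inner array can legitimately be NULL.

struct page;

struct ring_ctx_example {
	struct page **ring_pages;	/* the inner array may still be NULL ... */
	unsigned short n_ring_pages;	/* ... but &ctx->ring_pages never is */
};

static void pages_free_example(struct page ***pages, int npages)
{
	struct page **page_array = *pages;	/* safe: the outer pointer is never NULL */

	if (!page_array)
		return;
	(void)npages;	/* the 'npages' entries would be unpinned and freed here */
	*pages = 0;	/* then the caller's pointer is cleared */
}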
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0e1a46f9a5cd38e6876905e8030bdff9b0845e96.1710343154.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e7d7a456b489..48c8d74e86ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2696,13 +2696,9 @@ void io_mem_free(void *ptr) static void io_pages_free(struct page ***pages, int npages) { - struct page **page_array; + struct page **page_array = *pages; int i; - if (!pages) - return; - - page_array = *pages; if (!page_array) return; From 9219e4a9d4ad57323837f7c3562964e61840b17a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 13 Mar 2024 15:52:41 +0000 Subject: [PATCH 200/496] io_uring/kbuf: rename is_mapped In buffer lists we have ->is_mapped as well as ->is_mmap, it's pretty hard to stay sane double checking which one means what, and in the long run there is a high chance of an eventual bug. Rename ->is_mapped into ->is_buf_ring. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c4838f4d8ad506ad6373f1c305aee2d2c1a89786.1710343154.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 20 ++++++++++---------- io_uring/kbuf.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 9be42bff936b..693c26da4ee1 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -199,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->is_mapped) + if (bl->is_buf_ring) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -253,7 +253,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->is_mapped) { + if (bl->is_buf_ring) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { /* @@ -274,7 +274,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); - bl->is_mapped = 0; + bl->is_buf_ring = 0; return i; } @@ -361,7 +361,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->is_mapped) + if (!bl->is_buf_ring) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -519,7 +519,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->is_mapped) { + if (bl->is_buf_ring) { ret = -EINVAL; goto err; } @@ -575,7 +575,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->buf_ring = br; - bl->is_mapped = 1; + bl->is_buf_ring = 1; bl->is_mmap = 0; return 0; error_unpin: @@ -642,7 +642,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, } ibf->inuse = 1; bl->buf_ring = ibf->mem; - bl->is_mapped = 1; + bl->is_buf_ring = 1; bl->is_mmap = 1; return 0; } @@ -688,7 +688,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->is_mapped || !list_empty(&bl->buf_list)) + if (bl->is_buf_ring || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -730,7 
+730,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!bl->is_buf_ring) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); @@ -757,7 +757,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) return -ENOENT; - if (!bl->is_mapped) + if (!bl->is_buf_ring) return -EINVAL; buf_status.head = bl->head; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 5218bfd79e87..1c7b654ee726 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -26,7 +26,7 @@ struct io_buffer_list { __u16 mask; /* ring mapped provided buffers */ - __u8 is_mapped; + __u8 is_buf_ring; /* ring mapped provided buffers, but mmap'ed by application */ __u8 is_mmap; /* bl is visible from an RCU point of view for lookup */ From 67d1189d1095d471ed7fa426c7e384a7140a5dd7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 13 Mar 2024 17:39:12 -0400 Subject: [PATCH 201/496] io_uring: Fix release of pinned pages when __io_uaddr_map fails Looking at the error path of __io_uaddr_map, if we fail after pinning the pages for any reasons, ret will be set to -EINVAL and the error handler won't properly release the pinned pages. I didn't manage to trigger it without forcing a failure, but it can happen in real life when memory is heavily fragmented. Signed-off-by: Gabriel Krisman Bertazi Fixes: 223ef4743164 ("io_uring: don't allow IORING_SETUP_NO_MMAP rings on highmem pages") Link: https://lore.kernel.org/r/20240313213912.1920-1-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 48c8d74e86ab..3ae4bb988906 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2714,7 +2714,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, struct page **page_array; unsigned int nr_pages; void *page_addr; - int ret, i; + int ret, i, pinned; *npages = 0; @@ -2728,12 +2728,12 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, if (!page_array) return ERR_PTR(-ENOMEM); - ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - page_array); - if (ret != nr_pages) { -err: - io_pages_free(&page_array, ret > 0 ? ret : 0); - return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); + + pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + page_array); + if (pinned != nr_pages) { + ret = (pinned < 0) ? pinned : -EFAULT; + goto free_pages; } page_addr = page_address(page_array[0]); @@ -2747,7 +2747,7 @@ err: * didn't support this feature. */ if (PageHighMem(page_array[i])) - goto err; + goto free_pages; /* * No support for discontig pages for now, should either be a @@ -2756,13 +2756,17 @@ err: * just fail them with EINVAL. */ if (page_address(page_array[i]) != page_addr) - goto err; + goto free_pages; page_addr += PAGE_SIZE; } *pages = page_array; *npages = nr_pages; return page_to_virt(page_array[0]); + +free_pages: + io_pages_free(&page_array, pinned > 0 ? 
pinned : 0); + return ERR_PTR(ret); } static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, From cec60af1972d830dc837da76b472cf9cce7945cf Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Sat, 9 Mar 2024 08:13:48 +0800 Subject: [PATCH 202/496] wifi: rtw89: coex: fix configuration for shared antenna for 8922A WiFi 2x2 + BT combo cards can have two or three physical antennas. For the two-antenna case, one antenna is shared by WiFi and BT, and a different configuration should be applied. Fix the typo. This problem was found by Coccicheck, and it is actually a typo: rtw8922a.c:2235:2-4: WARNING: possible condition with no effect (if == else) Fixes: 652c9642eda6 ("wifi: rtw89: coex: add init_info H2C command format version 7") Closes: https://lore.kernel.org/linux-wireless/20240308074539.04512f66@kernel.org/ Cc: Ching-Te Ku Cc: Jakub Kicinski Signed-off-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://msgid.link/20240309001348.9906-1-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/rtw8922a.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw89/rtw8922a.c b/drivers/net/wireless/realtek/rtw89/rtw8922a.c index 367459bd1345..708132d5be2a 100644 --- a/drivers/net/wireless/realtek/rtw89/rtw8922a.c +++ b/drivers/net/wireless/realtek/rtw89/rtw8922a.c @@ -2233,7 +2233,7 @@ static void rtw8922a_btc_init_cfg(struct rtw89_dev *rtwdev) * Shared-Ant && BTG-path:WL mask(0x55f), others:WL THRU(0x5ff) */ if (btc->ant_type == BTC_ANT_SHARED && btc->btg_pos == path) - rtw8922a_set_trx_mask(rtwdev, path, BTC_BT_TX_GROUP, 0x5ff); + rtw8922a_set_trx_mask(rtwdev, path, BTC_BT_TX_GROUP, 0x55f); else rtw8922a_set_trx_mask(rtwdev, path, BTC_BT_TX_GROUP, 0x5ff); From 5d515eb1295151aa3e50af69fc726823aba7bac3 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Mon, 4 Mar 2024 10:12:25 +0100 Subject: [PATCH 203/496] drm/sun4i: hdmi: Fix u64 div on 32bit arch Commit 358e76fd613a ("drm/sun4i: hdmi: Consolidate atomic_check and mode_valid") changed the clock rate from an unsigned long to an unsigned long long, resulting in a 64-bit division that might not be supported on all platforms. This resulted in compilation being broken for m68k, xtensa and some arm configurations, at least.
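For reference, a hedged userspace sketch of the arithmetic and of why the helper is needed: dividing a 64-bit value on a 32-bit kernel has to go through div_u64(), since a plain '/' would make the compiler emit a libgcc division routine the kernel does not provide. The clock value below is just an example.

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's div_u64() helper. */
static uint64_t div_u64_example(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;
}

int main(void)
{
	uint64_t clock = 148500000ULL;			/* e.g. 1080p60 pixel clock, in Hz */
	uint64_t diff  = div_u64_example(clock, 200);	/* +-0.5% allowed by the HDMI spec */

	printf("allowed deviation: %llu Hz\n", (unsigned long long)diff);	/* 742500 */
	return 0;
}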
Fixes: 358e76fd613a ("drm/sun4i: hdmi: Consolidate atomic_check and mode_valid") Reported-by: Geert Uytterhoeven Reported-by: Naresh Kamboju Closes: https://lore.kernel.org/r/CA+G9fYvG9KE15PGNoLu+SBVyShe+u5HBLQ81+kK9Zop6u=ywmw@mail.gmail.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403011839.KLiXh4wC-lkp@intel.com/ Acked-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20240304091225.366325-1-mripard@kernel.org Signed-off-by: Maxime Ripard --- drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c index 69001a3dc0df..2d1880c61b50 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c @@ -166,7 +166,7 @@ sun4i_hdmi_connector_clock_valid(const struct drm_connector *connector, unsigned long long clock) { const struct sun4i_hdmi *hdmi = drm_connector_to_sun4i_hdmi(connector); - unsigned long diff = clock / 200; /* +-0.5% allowed by HDMI spec */ + unsigned long diff = div_u64(clock, 200); /* +-0.5% allowed by HDMI spec */ long rounded_rate; if (mode->flags & DRM_MODE_FLAG_DBLCLK) From 6d5501d59cf659651e100fc4c5617d444c22ba74 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 10 Mar 2024 01:38:43 +0200 Subject: [PATCH 204/496] drm/bridge: correct DRM_BRIDGE_OP_EDID documentation While the commit d807ad80d811 ("drm/bridge: add ->edid_read hook and drm_bridge_edid_read()") and the commit 27b8f91c08d9 ("drm/bridge: remove ->get_edid callback") replaced ->get_edid() callback with the ->edid_read(), they failed to update documentation. Fix the drm_bridge docs to point to edid_read(). Fixes: 27b8f91c08d9 ("drm/bridge: remove ->get_edid callback") Reviewed-by: Jani Nikula Reviewed-by: Neil Armstrong Signed-off-by: Dmitry Baryshkov Link: https://patchwork.freedesktop.org/patch/msgid/20240310-drm-bridge-fix-docs-v1-1-70d3d741cb7a@linaro.org --- include/drm/drm_bridge.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/drm/drm_bridge.h b/include/drm/drm_bridge.h index 3606e1a7f965..4baca0d9107b 100644 --- a/include/drm/drm_bridge.h +++ b/include/drm/drm_bridge.h @@ -541,7 +541,7 @@ struct drm_bridge_funcs { * The @get_modes callback is mostly intended to support non-probeable * displays such as many fixed panels. Bridges that support reading * EDID shall leave @get_modes unimplemented and implement the - * &drm_bridge_funcs->get_edid callback instead. + * &drm_bridge_funcs->edid_read callback instead. * * This callback is optional. Bridges that implement it shall set the * DRM_BRIDGE_OP_MODES flag in their &drm_bridge->ops. @@ -687,7 +687,7 @@ enum drm_bridge_ops { /** * @DRM_BRIDGE_OP_EDID: The bridge can retrieve the EDID of the display * connected to its output. Bridges that set this flag shall implement - * the &drm_bridge_funcs->get_edid callback. + * the &drm_bridge_funcs->edid_read callback. 
*/ DRM_BRIDGE_OP_EDID = BIT(1), /** From 29895ce18311ddd702973ddb3a6c687db663e0fb Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 13 Mar 2024 12:45:30 -0700 Subject: [PATCH 205/496] spi: Fix error code checking in spi_mem_exec_op() After commit cff49d58f57e ("spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP"), our SPI NOR flashes would stop probing with the following visible in the kernel log: [ 2.196300] brcmstb_qspi f0440920.qspi: using bspi-mspi mode [ 2.210295] spi-nor: probe of spi1.0 failed with error -95 It turns out that the check in spi_mem_exec_op() was changed to check for -ENOTSUPP (old error code) or -EOPNOTSUPP (new error code), but this means that for drivers that were converted, the second condition is now true, and we stop falling through like we used to. Fix the error to check for neither error being neither -ENOTSUPP *nor* -EOPNOTSUPP. Fixes: cff49d58f57e ("spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP") Reviewed-by: Michael Walle Reviewed-by: Pratyush Yadav Signed-off-by: Florian Fainelli Reviewed-by: Miquel Raynal Reviewed-by: Mika Westerberg Reviewed-by: Tudor Ambarus Link: https://msgid.link/r/20240313194530.3150446-1-florian.fainelli@broadcom.com Signed-off-by: Mark Brown --- drivers/spi/spi-mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-mem.c b/drivers/spi/spi-mem.c index c9d6d42a88f5..17b8baf749e6 100644 --- a/drivers/spi/spi-mem.c +++ b/drivers/spi/spi-mem.c @@ -382,7 +382,7 @@ int spi_mem_exec_op(struct spi_mem *mem, const struct spi_mem_op *op) * read path) and expect the core to use the regular SPI * interface in other cases. */ - if (!ret || ret != -ENOTSUPP || ret != -EOPNOTSUPP) { + if (!ret || (ret != -ENOTSUPP && ret != -EOPNOTSUPP)) { spi_mem_add_op_stats(ctlr->pcpu_statistics, op, ret); spi_mem_add_op_stats(mem->spi->pcpu_statistics, op, ret); From 861b3415e4dee06cc00cd1754808a7827b9105bf Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Wed, 13 Mar 2024 09:58:52 +0800 Subject: [PATCH 206/496] ASoC: amd: yc: Revert "Fix non-functional mic on Lenovo 21J2" This reverts commit ed00a6945dc32462c2d3744a3518d2316da66fcc, which added a quirk entry to enable the Yellow Carp (YC) driver for the Lenovo 21J2 laptop. Although the microphone functioned with the YC driver, it resulted in incorrect driver usage. The Lenovo 21J2 is not a Yellow Carp platform, but a Pink Sardine platform, which already has an upstreamed driver. The microphone on the Lenovo 21J2 operates correctly with the CONFIG_SND_SOC_AMD_PS flag enabled and does not require the quirk entry. So this patch removes the quirk entry. Thanks to Mukunda Vijendar [1] for pointing this out. 
Link: https://lore.kernel.org/linux-sound/023092e1-689c-4b00-b93f-4092c3724fb6@amd.com/ [1] Signed-off-by: Jiawei Wang Link: https://lore.kernel.org/linux-sound/023092e1-689c-4b00-b93f-4092c3724fb6@amd.com/ [1] Link: https://msgid.link/r/20240313015853.3573242-2-me@jwang.link Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 1ab69a53174e..69c68d8e7a6b 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -199,13 +199,6 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "21HY"), } }, - { - .driver_data = &acp6x_card, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_NAME, "21J2"), - } - }, { .driver_data = &acp6x_card, .matches = { From 37bee1855d0e3b6dbeb8de71895f6f68cad137be Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Wed, 13 Mar 2024 09:58:53 +0800 Subject: [PATCH 207/496] ASoC: amd: yc: Revert "add new YC platform variant (0x63) support" This reverts commit 316a784839b21b122e1761cdca54677bb19a47fa, that enabled Yellow Carp (YC) driver for PCI revision id 0x63. Mukunda Vijendar [1] points out that revision 0x63 is Pink Sardine platform, not Yellow Carp. The YC driver should not be enabled for this platform. This patch prevents the YC driver from being incorrectly enabled. Link: https://lore.kernel.org/linux-sound/023092e1-689c-4b00-b93f-4092c3724fb6@amd.com/ [1] Signed-off-by: Jiawei Wang Link: https://msgid.link/r/20240313015853.3573242-3-me@jwang.link Signed-off-by: Mark Brown --- sound/soc/amd/yc/pci-acp6x.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/amd/yc/pci-acp6x.c b/sound/soc/amd/yc/pci-acp6x.c index 694b8e313902..7af6a349b1d4 100644 --- a/sound/soc/amd/yc/pci-acp6x.c +++ b/sound/soc/amd/yc/pci-acp6x.c @@ -162,7 +162,6 @@ static int snd_acp6x_probe(struct pci_dev *pci, /* Yellow Carp device check */ switch (pci->revision) { case 0x60: - case 0x63: case 0x6f: break; default: From b5b4287accd702f562a49a60b10dbfaf7d40270f Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Tue, 30 Jan 2024 17:07:00 -0800 Subject: [PATCH 208/496] riscv: mm: Use hint address in mmap if available On riscv it is guaranteed that the address returned by mmap is less than the hint address. Allow mmap to return an address all the way up to addr, if provided, rather than just up to the lower address space. This provides a performance benefit as well, allowing mmap to exit after checking that the address is in range rather than searching for a valid address. It is possible to provide an address that uses at most the same number of bits, however it is significantly more computationally expensive to provide that number rather than setting the max to be the hint address. There is the instruction clz/clzw in Zbb that returns the highest set bit which could be used to performantly implement this, but it would still be slower than the current implementation. At worst case, half of the address would not be able to be allocated when a hint address is provided. 
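A small userspace sketch of what the guarantee above means for a caller, assuming a 64-bit RISC-V kernel with this change applied: the returned address never uses more bits than the hint.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Hint below 1 << 38: the mapping stays within an sv39-sized range. */
	void *hint = (void *)(1UL << 37);
	void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* With the change above, p is guaranteed not to use more bits than 'hint'. */
	printf("hint %p -> mapping %p\n", hint, p);
	return munmap(p, 4096);
}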
Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240130-use_mmap_hint_address-v3-1-8a655cfa8bcb@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f19f861cda54..8ece7a8f0e18 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -14,22 +14,16 @@ #include -#ifdef CONFIG_64BIT -#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1)) -#define STACK_TOP_MAX TASK_SIZE_64 - #define arch_get_mmap_end(addr, len, flags) \ ({ \ unsigned long mmap_end; \ typeof(addr) _addr = (addr); \ - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + if ((_addr) == 0 || \ + (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) || \ + ((_addr + len) > BIT(VA_BITS - 1))) \ mmap_end = STACK_TOP_MAX; \ - else if ((_addr) >= VA_USER_SV57) \ - mmap_end = STACK_TOP_MAX; \ - else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \ - mmap_end = VA_USER_SV48; \ else \ - mmap_end = VA_USER_SV39; \ + mmap_end = (_addr + len); \ mmap_end; \ }) @@ -39,17 +33,18 @@ typeof(addr) _addr = (addr); \ typeof(base) _base = (base); \ unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \ - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + if ((_addr) == 0 || \ + (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) || \ + ((_addr + len) > BIT(VA_BITS - 1))) \ mmap_base = (_base); \ - else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \ - mmap_base = VA_USER_SV57 - rnd_gap; \ - else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \ - mmap_base = VA_USER_SV48 - rnd_gap; \ else \ - mmap_base = VA_USER_SV39 - rnd_gap; \ + mmap_base = (_addr + len) - rnd_gap; \ mmap_base; \ }) +#ifdef CONFIG_64BIT +#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1)) +#define STACK_TOP_MAX TASK_SIZE_64 #else #define DEFAULT_MAP_WINDOW TASK_SIZE #define STACK_TOP_MAX TASK_SIZE From 73d05262a2ca28c72c5aec57d79b47c251fe77c5 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Tue, 30 Jan 2024 17:07:01 -0800 Subject: [PATCH 209/496] selftests: riscv: Generalize mm selftests The behavior of mmap on riscv is defined to not provide an address that uses more bits than the hint address, if provided. Make the tests reflect that. 
Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240130-use_mmap_hint_address-v3-2-8a655cfa8bcb@rivosinc.com Signed-off-by: Palmer Dabbelt --- .../selftests/riscv/mm/mmap_bottomup.c | 23 +--- .../testing/selftests/riscv/mm/mmap_default.c | 23 +--- tools/testing/selftests/riscv/mm/mmap_test.h | 107 +++++++++++------- 3 files changed, 67 insertions(+), 86 deletions(-) diff --git a/tools/testing/selftests/riscv/mm/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/mmap_bottomup.c index 1757d19ca89b..7f7d3eb8b9c9 100644 --- a/tools/testing/selftests/riscv/mm/mmap_bottomup.c +++ b/tools/testing/selftests/riscv/mm/mmap_bottomup.c @@ -6,30 +6,9 @@ TEST(infinite_rlimit) { -// Only works on 64 bit -#if __riscv_xlen == 64 - struct addresses mmap_addresses; - EXPECT_EQ(BOTTOM_UP, memory_layout()); - do_mmaps(&mmap_addresses); - - EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr); - - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr); - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr); - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr); - EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr); -#endif + TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_default.c b/tools/testing/selftests/riscv/mm/mmap_default.c index c63c60b9397e..2ba3ec990006 100644 --- a/tools/testing/selftests/riscv/mm/mmap_default.c +++ b/tools/testing/selftests/riscv/mm/mmap_default.c @@ -6,30 +6,9 @@ TEST(default_rlimit) { -// Only works on 64 bit -#if __riscv_xlen == 64 - struct addresses mmap_addresses; - EXPECT_EQ(TOP_DOWN, memory_layout()); - do_mmaps(&mmap_addresses); - - EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr); - EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr); - - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr); - EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr); - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr); - EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr); - EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr); -#endif + TEST_MMAPS; } TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/mmap_test.h b/tools/testing/selftests/riscv/mm/mmap_test.h index 9b8434f62f57..36e78d991d5e 100644 --- a/tools/testing/selftests/riscv/mm/mmap_test.h +++ b/tools/testing/selftests/riscv/mm/mmap_test.h @@ -4,60 +4,83 @@ #include #include #include +#include +#include "../../kselftest_harness.h" #define TOP_DOWN 0 #define BOTTOM_UP 1 -struct addresses { - int *no_hint; - int *on_37_addr; - int *on_38_addr; - int *on_46_addr; - int *on_47_addr; - int *on_55_addr; - int 
*on_56_addr; +#if __riscv_xlen == 64 +uint64_t random_addresses[] = { + 0x19764f0d73b3a9f0, 0x016049584cecef59, 0x3580bdd3562f4acd, + 0x1164219f20b17da0, 0x07d97fcb40ff2373, 0x76ec528921272ee7, + 0x4dd48c38a3de3f70, 0x2e11415055f6997d, 0x14b43334ac476c02, + 0x375a60795aff19f6, 0x47f3051725b8ee1a, 0x4e697cf240494a9f, + 0x456b59b5c2f9e9d1, 0x101724379d63cb96, 0x7fe9ad31619528c1, + 0x2f417247c495c2ea, 0x329a5a5b82943a5e, 0x06d7a9d6adcd3827, + 0x327b0b9ee37f62d5, 0x17c7b1851dfd9b76, 0x006ebb6456ec2cd9, + 0x00836cd14146a134, 0x00e5c4dcde7126db, 0x004c29feadf75753, + 0x00d8b20149ed930c, 0x00d71574c269387a, 0x0006ebe4a82acb7a, + 0x0016135df51f471b, 0x00758bdb55455160, 0x00d0bdd949b13b32, + 0x00ecea01e7c5f54b, 0x00e37b071b9948b1, 0x0011fdd00ff57ab3, + 0x00e407294b52f5ea, 0x00567748c200ed20, 0x000d073084651046, + 0x00ac896f4365463c, 0x00eb0d49a0b26216, 0x0066a2564a982a31, + 0x002e0d20237784ae, 0x0000554ff8a77a76, 0x00006ce07a54c012, + 0x000009570516d799, 0x00000954ca15b84d, 0x0000684f0d453379, + 0x00002ae5816302b5, 0x0000042403fb54bf, 0x00004bad7392bf30, + 0x00003e73bfa4b5e3, 0x00005442c29978e0, 0x00002803f11286b6, + 0x000073875d745fc6, 0x00007cede9cb8240, 0x000027df84cc6a4f, + 0x00006d7e0e74242a, 0x00004afd0b836e02, 0x000047d0e837cd82, + 0x00003b42405efeda, 0x00001531bafa4c95, 0x00007172cae34ac4, }; +#else +uint32_t random_addresses[] = { + 0x8dc302e0, 0x929ab1e0, 0xb47683ba, 0xea519c73, 0xa19f1c90, 0xc49ba213, + 0x8f57c625, 0xadfe5137, 0x874d4d95, 0xaa20f09d, 0xcf21ebfc, 0xda7737f1, + 0xcedf392a, 0x83026c14, 0xccedca52, 0xc6ccf826, 0xe0cd9415, 0x997472ca, + 0xa21a44c1, 0xe82196f5, 0xa23fd66b, 0xc28d5590, 0xd009cdce, 0xcf0be646, + 0x8fc8c7ff, 0xe2a85984, 0xa3d3236b, 0x89a0619d, 0xc03db924, 0xb5d4cc1b, + 0xb96ee04c, 0xd191da48, 0xb432a000, 0xaa2bebbc, 0xa2fcb289, 0xb0cca89b, + 0xb0c18d6a, 0x88f58deb, 0xa4d42d1c, 0xe4d74e86, 0x99902b09, 0x8f786d31, + 0xbec5e381, 0x9a727e65, 0xa9a65040, 0xa880d789, 0x8f1b335e, 0xfc821c1e, + 0x97e34be4, 0xbbef84ed, 0xf447d197, 0xfd7ceee2, 0xe632348d, 0xee4590f4, + 0x958992a5, 0xd57e05d6, 0xfd240970, 0xc5b0dcff, 0xd96da2c2, 0xa7ae041d, +}; +#endif -static inline void do_mmaps(struct addresses *mmap_addresses) +#define PROT (PROT_READ | PROT_WRITE) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS) + +/* mmap must return a value that doesn't use more bits than the hint address. 
*/ +static inline unsigned long get_max_value(unsigned long input) { - /* - * Place all of the hint addresses on the boundaries of mmap - * sv39, sv48, sv57 - * User addresses end at 1<<38, 1<<47, 1<<56 respectively - */ - void *on_37_bits = (void *)(1UL << 37); - void *on_38_bits = (void *)(1UL << 38); - void *on_46_bits = (void *)(1UL << 46); - void *on_47_bits = (void *)(1UL << 47); - void *on_55_bits = (void *)(1UL << 55); - void *on_56_bits = (void *)(1UL << 56); + unsigned long max_bit = (1UL << (((sizeof(unsigned long) * 8) - 1 - + __builtin_clzl(input)))); - int prot = PROT_READ | PROT_WRITE; - int flags = MAP_PRIVATE | MAP_ANONYMOUS; - - mmap_addresses->no_hint = - mmap(NULL, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_37_addr = - mmap(on_37_bits, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_38_addr = - mmap(on_38_bits, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_46_addr = - mmap(on_46_bits, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_47_addr = - mmap(on_47_bits, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_55_addr = - mmap(on_55_bits, 5 * sizeof(int), prot, flags, 0, 0); - mmap_addresses->on_56_addr = - mmap(on_56_bits, 5 * sizeof(int), prot, flags, 0, 0); + return max_bit + (max_bit - 1); } +#define TEST_MMAPS \ + ({ \ + void *mmap_addr; \ + for (int i = 0; i < ARRAY_SIZE(random_addresses); i++) { \ + mmap_addr = mmap((void *)random_addresses[i], \ + 5 * sizeof(int), PROT, FLAGS, 0, 0); \ + EXPECT_NE(MAP_FAILED, mmap_addr); \ + EXPECT_GE((void *)get_max_value(random_addresses[i]), \ + mmap_addr); \ + mmap_addr = mmap((void *)random_addresses[i], \ + 5 * sizeof(int), PROT, FLAGS, 0, 0); \ + EXPECT_NE(MAP_FAILED, mmap_addr); \ + EXPECT_GE((void *)get_max_value(random_addresses[i]), \ + mmap_addr); \ + } \ + }) + static inline int memory_layout(void) { - int prot = PROT_READ | PROT_WRITE; - int flags = MAP_PRIVATE | MAP_ANONYMOUS; - - void *value1 = mmap(NULL, sizeof(int), prot, flags, 0, 0); - void *value2 = mmap(NULL, sizeof(int), prot, flags, 0, 0); + void *value1 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0); + void *value2 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0); return value2 > value1; } From 371a3c2055dbb8a88754902fe19aa4fcc4318c29 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Tue, 30 Jan 2024 17:07:02 -0800 Subject: [PATCH 210/496] docs: riscv: Define behavior of mmap Define mmap on riscv to not provide an address that uses more bits than the hint address, if provided. Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240130-use_mmap_hint_address-v3-3-8a655cfa8bcb@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/arch/riscv/vm-layout.rst | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/Documentation/arch/riscv/vm-layout.rst b/Documentation/arch/riscv/vm-layout.rst index 69ff6da1dbf8..e476b4386bd9 100644 --- a/Documentation/arch/riscv/vm-layout.rst +++ b/Documentation/arch/riscv/vm-layout.rst @@ -144,14 +144,8 @@ passing 0 into the hint address parameter of mmap. On CPUs with an address space smaller than sv48, the CPU maximum supported address space will be the default. Software can "opt-in" to receiving VAs from another VA space by providing -a hint address to mmap. A hint address passed to mmap will cause the largest -address space that fits entirely into the hint to be used, unless there is no -space left in the address space. 
If there is no space available in the requested -address space, an address in the next smallest available address space will be -returned. - -For example, in order to obtain 48-bit VA space, a hint address greater than -:code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace -ending at :code:`1 << 47` and the addresses beyond this are reserved for the -kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater -than or equal to :code:`1 << 56` must be provided. +a hint address to mmap. When a hint address is passed to mmap, the returned +address will never use more bits than the hint address. For example, if a hint +address of `1 << 40` is passed to mmap, a valid returned address will never use +bits 41 through 63. If no mappable addresses are available in that range, mmap +will return `MAP_FAILED`. From de105068fead55ed5c07ade75e9c8e7f86a00d1d Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 11 Mar 2024 10:09:27 +0800 Subject: [PATCH 211/496] nvme: fix reconnection fail due to reserved tag allocation We found an issue in a production environment while using NVMe over RDMA: admin_q reconnection failed forever while the remote target and network were ok. After digging into it, we found it may be caused by an ABBA deadlock due to tag allocation. In my case, the tag was held by a keep alive request waiting inside admin_q; as we quiesced admin_q while resetting the ctrl, the request was marked as idle and would not be processed before the reset succeeded. As fabric_q shares its tagset with admin_q, we need a tag for the connect command while reconnecting to the remote target, but the only reserved tag was held by the keep alive command waiting inside admin_q. As a result, we failed to reconnect admin_q forever. In order to fix this issue, we should keep two reserved tags for the admin queue. Fixes: ed01fee283a0 ("nvme-fabrics: only reserve a single tag") Signed-off-by: Chunguang Xu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 ++++-- drivers/nvme/host/fabrics.h | 7 ------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0dcaf3973dc4..fb55b6ac0385 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4385,7 +4385,8 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->ops = ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; if (ctrl->ops->flags & NVME_F_FABRICS) - set->reserved_tags = NVMF_RESERVED_TAGS; + /* Reserved for fabric connect and keep alive */ + set->reserved_tags = 2; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_NO_SCHED; if (ctrl->ops->flags & NVME_F_BLOCKING) @@ -4454,7 +4455,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS) set->reserved_tags = NVME_AQ_DEPTH; else if (ctrl->ops->flags & NVME_F_FABRICS) - set->reserved_tags = NVMF_RESERVED_TAGS; + /* Reserved for fabric connect */ + set->reserved_tags = 1; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 06cc54851b1b..37c974c38dcb 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -18,13 +18,6 @@ /* default is -1: the fail fast mechanism is disabled */ #define NVMF_DEF_FAIL_FAST_TMO -1 -/* - * Reserved one command for internal usage.
This command is used for sending - * the connect command, as well as for the keep alive command on the admin - * queue once live. - */ -#define NVMF_RESERVED_TAGS 1 - /* * Define a host as seen by the target. We allocate one at boot, but also * allow the override it when creating controllers. This is both to provide From dcad6f5f4303a224ac7a5802e7deffd46cf6f6cb Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 13 Mar 2024 10:29:05 +0800 Subject: [PATCH 212/496] nvme: use nvme_disk_is_ns_head helper Use nvme_disk_is_ns_head helper instead of check fops directly, and also drop CONFIG_NVME_MULTIPATH check. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pr.c | 3 +-- drivers/nvme/host/sysfs.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index fc3eed00f9ff..e05571b2a1b0 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -97,8 +97,7 @@ static int nvme_sc_to_pr_err(int nvme_sc) static int nvme_send_pr_command(struct block_device *bdev, struct nvme_command *c, void *data, unsigned int data_len) { - if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - nvme_disk_is_ns_head(bdev->bd_disk)) + if (nvme_disk_is_ns_head(bdev->bd_disk)) return nvme_send_ns_head_pr_command(bdev, c, data, data_len); return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 6c7f1d5c056f..243ebc4d471a 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -236,8 +236,7 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, struct block_device *bdev = disk->part0; int ret; - if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - bdev->bd_disk->fops == &nvme_ns_head_ops) + if (nvme_disk_is_ns_head(bdev->bd_disk)) ret = ns_head_update_nuse(head); else ret = ns_update_nuse(bdev->bd_disk->private_data); From 8d539f755c316dd38c8c43c0c25d1b9d26a3b003 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Tue, 12 Mar 2024 15:52:43 +0800 Subject: [PATCH 213/496] nvme: parse zns command's zsa and zrasf to string Parse zone mgmt send commands's zsa and receive command's zrasf to string to make the trace log more human-readable. 
Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 1c36fcedea20..5e94b3c6d58a 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -164,12 +164,27 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) { + static const char * const zsa_strs[] = { + [0x01] = "close zone", + [0x02] = "finish zone", + [0x03] = "open zone", + [0x04] = "reset zone", + [0x05] = "offline zone", + [0x10] = "set zone descriptor extension" + }; const char *ret = trace_seq_buffer_ptr(p); u64 slba = get_unaligned_le64(cdw10); + const char *zsa_str; u8 zsa = cdw10[12]; u8 all = cdw10[13]; - trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all); + if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa]) + zsa_str = zsa_strs[zsa]; + else + zsa_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u", + slba, zsa, zsa_str, all); trace_seq_putc(p, 0); return ret; @@ -177,15 +192,32 @@ static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) { + static const char * const zrasf_strs[] = { + [0x00] = "list all zones", + [0x01] = "list the zones in the ZSE: Empty state", + [0x02] = "list the zones in the ZSIO: Implicitly Opened state", + [0x03] = "list the zones in the ZSEO: Explicitly Opened state", + [0x04] = "list the zones in the ZSC: Closed state", + [0x05] = "list the zones in the ZSF: Full state", + [0x06] = "list the zones in the ZSRO: Read Only state", + [0x07] = "list the zones in the ZSO: Offline state", + [0x09] = "list the zones that have the zone attribute" + }; const char *ret = trace_seq_buffer_ptr(p); u64 slba = get_unaligned_le64(cdw10); u32 numd = get_unaligned_le32(cdw10 + 8); u8 zra = cdw10[12]; u8 zrasf = cdw10[13]; + const char *zrasf_str; u8 pr = cdw10[14]; - trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u", - slba, numd, zra, zrasf, pr); + if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf]) + zrasf_str = zrasf_strs[zrasf]; + else + zrasf_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u", + slba, numd, zra, zrasf, zrasf_str, pr); trace_seq_putc(p, 0); return ret; From 6a0164f9f4a0b629fd7a5414d6c7d21999141b5e Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 31 Jan 2024 17:12:20 +0800 Subject: [PATCH 214/496] nvme: add tracing of reservation commands Add detailed parsing of reservation commands to make the trace log more consistent and human-readable. 
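To make the field extraction concrete, a small standalone sketch decoding one example cdw10 value the way the new trace helpers do; the sample bytes are arbitrary, not taken from a real capture.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t cdw10[16] = { 0 };

	cdw10[0] = 0x0a;	/* arbitrary example */
	cdw10[3] = 0x40;

	uint8_t rrega = cdw10[0] & 0x7;		/* 0x0a & 0x7        -> 2 */
	uint8_t iekey = (cdw10[0] >> 3) & 0x1;	/* (0x0a >> 3) & 0x1 -> 1 */
	uint8_t ptpl  = (cdw10[3] >> 6) & 0x3;	/* (0x40 >> 6) & 0x3 -> 1 */

	printf("rrega=%u, iekey=%u, ptpl=%u\n", rrega, iekey, ptpl);
	return 0;
}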
Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5e94b3c6d58a..7e6719a3b624 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -223,6 +223,60 @@ static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 rrega = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 ptpl = (cdw10[3] >> 6) & 0x3; + + trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u", + rrega, iekey, ptpl); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 racqa = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + + trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u", + racqa, iekey, rtype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 rrela = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + + trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u", + rrela, iekey, rtype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_report(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u32 numd = get_unaligned_le32(cdw10); + u8 eds = cdw10[4] & 0x1; + + trace_seq_printf(p, "numd=%u, eds=%u", numd, eds); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -275,6 +329,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_zone_mgmt_send(p, cdw10); case nvme_cmd_zone_mgmt_recv: return nvme_trace_zone_mgmt_recv(p, cdw10); + case nvme_cmd_resv_register: + return nvme_trace_resv_reg(p, cdw10); + case nvme_cmd_resv_acquire: + return nvme_trace_resv_acq(p, cdw10); + case nvme_cmd_resv_release: + return nvme_trace_resv_rel(p, cdw10); + case nvme_cmd_resv_report: + return nvme_trace_resv_report(p, cdw10); default: return nvme_trace_common(p, cdw10); } From 798edad968ace3c15c3180ba72e427630022e2d9 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 31 Jan 2024 17:12:22 +0800 Subject: [PATCH 215/496] nvme: parse format command's lbafu when tracing Add the parse of format command's lbafu to calculate lbaf. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 7e6719a3b624..0288315f0050 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -119,7 +119,10 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p, static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); - u8 lbaf = cdw10[0] & 0xF; + /* + * lbafu(bit 13:12) is already in the upper 4 bits, lbafl: bit 03:00. 
+ */ + u8 lbaf = (cdw10[1] & 0x30) | (cdw10[0] & 0xF); u8 mset = (cdw10[0] >> 4) & 0x1; u8 pi = (cdw10[0] >> 5) & 0x7; u8 pil = cdw10[1] & 0x1; From dc528770edb138e26a533f8a77de5c4db18ea7f3 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Tue, 12 Mar 2024 21:21:41 -0500 Subject: [PATCH 216/496] cifs: defer close file handles having RH lease Previously we only deferred closing file handles with an RHW lease. To enhance the performance benefits from deferred closes, we now include handles with RH leases as well. Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/file.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index ec25d3c3e1ee..e7d5e8f972a0 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1073,6 +1073,19 @@ void smb2_deferred_work_close(struct work_struct *work) _cifsFileInfo_put(cfile, true, false); } +static bool +smb2_can_defer_close(struct inode *inode, struct cifs_deferred_close *dclose) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cinode = CIFS_I(inode); + + return (cifs_sb->ctx->closetimeo && cinode->lease_granted && dclose && + (cinode->oplock == CIFS_CACHE_RHW_FLG || + cinode->oplock == CIFS_CACHE_RH_FLG) && + !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags)); + +} + int cifs_close(struct inode *inode, struct file *file) { struct cifsFileInfo *cfile; @@ -1086,10 +1099,8 @@ int cifs_close(struct inode *inode, struct file *file) cfile = file->private_data; file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); - if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG) - && cinode->lease_granted && - !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && - dclose && !(cfile->status_file_deleted)) { + if ((cfile->status_file_deleted == false) && + (smb2_can_defer_close(inode, dclose))) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); From 13c0a74747cb7fdadf58c5d3a7d52cfca2d51736 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 13 Mar 2024 10:40:41 +0000 Subject: [PATCH 217/496] cifs: make sure server interfaces are requested only for SMB3+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some code paths for querying server interfaces make the false assumption that they will only get called for SMB3+. Since this function can now get called from generic code paths, the correct thing to do is to have a specific handler for this functionality per SMB dialect, and call that handler. This change adds such a handler and implements it only for SMB 3.0 and 3.1.1.
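A minimal sketch of the design this change adopts (simplified names, not the real cifs structures): the interface query becomes an optional per-dialect operation, and shared code only invokes it when the dialect provides one.

#include <stdio.h>

struct version_ops_example {
	const char *dialect;
	/* optional: only SMB 3.0 / 3.1.1 style dialects would set this */
	int (*query_server_interfaces)(void);
};

static int smb3_query_ifaces_example(void) { return 0; }

static const struct version_ops_example smb21_ops = { .dialect = "2.1" };
static const struct version_ops_example smb311_ops = {
	.dialect = "3.1.1",
	.query_server_interfaces = smb3_query_ifaces_example,
};

static void refresh_interfaces(const struct version_ops_example *ops)
{
	if (!ops->query_server_interfaces)	/* e.g. SMB 2.1: nothing to do */
		return;
	if (ops->query_server_interfaces())
		printf("interface query failed for dialect %s\n", ops->dialect);
}

int main(void)
{
	refresh_interfaces(&smb21_ops);
	refresh_interfaces(&smb311_ops);
	return 0;
}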
Cc: stable@vger.kernel.org Cc: Jan Čermák Reported-by: Paulo Alcantara Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 3 +++ fs/smb/client/connect.c | 6 +++++- fs/smb/client/smb2ops.c | 2 ++ fs/smb/client/smb2pdu.c | 5 +++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 8be62ed053a2..3da625d53235 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -355,6 +355,9 @@ struct smb_version_operations { /* informational QFS call */ void (*qfs_tcon)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *); + /* query for server interfaces */ + int (*query_server_interfaces)(const unsigned int, struct cifs_tcon *, + bool); /* check if a path is accessible or not */ int (*is_path_accessible)(const unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const char *); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 86ae578904a2..4cbb79418e50 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -123,12 +123,16 @@ static void smb2_query_server_interfaces(struct work_struct *work) struct cifs_tcon *tcon = container_of(work, struct cifs_tcon, query_interfaces.work); + struct TCP_Server_Info *server = tcon->ses->server; /* * query server network interfaces, in case they change */ + if (!server->ops->query_server_interfaces) + return; + xid = get_xid(); - rc = SMB3_request_interfaces(xid, tcon, false); + rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); if (rc) { diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6ee22d0dbc00..2ed456948f34 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -5290,6 +5290,7 @@ struct smb_version_operations smb30_operations = { .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, .qfs_tcon = smb3_qfs_tcon, + .query_server_interfaces = SMB3_request_interfaces, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, @@ -5405,6 +5406,7 @@ struct smb_version_operations smb311_operations = { .tree_connect = SMB2_tcon, .tree_disconnect = SMB2_tdis, .qfs_tcon = smb3_qfs_tcon, + .query_server_interfaces = SMB3_request_interfaces, .is_path_accessible = smb2_is_path_accessible, .can_echo = smb2_can_echo, .echo = SMB2_echo, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index e5e6b14f8cae..3ea688558e6c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -409,14 +409,15 @@ skip_sess_setup: spin_unlock(&ses->ses_lock); if (!rc && - (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) && + server->ops->query_server_interfaces) { mutex_unlock(&ses->session_mutex); /* * query server network interfaces, in case they change */ xid = get_xid(); - rc = SMB3_request_interfaces(xid, tcon, false); + rc = server->ops->query_server_interfaces(xid, tcon, false); free_xid(xid); if (rc == -EOPNOTSUPP && ses->chan_count > 1) { From 16a57d7681110b25708c7042688412238e6f73a9 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 13 Mar 2024 10:40:40 +0000 Subject: [PATCH 218/496] cifs: reduce warning log level for server not advertising interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several users have reported this log getting dumped too regularly to kernel log. 
The likely root cause has been identified, and it suggests that this situation is expected for some configurations (for example SMB2.1). Since the function returns appropriately even for such cases, it is fairly harmless to make this a debug log. When needed, the verbosity can be increased to capture this log. Cc: stable@vger.kernel.org Reported-by: Jan Čermák Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 8f37373fd333..3216f786908f 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -230,7 +230,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) spin_lock(&ses->iface_lock); if (!ses->iface_count) { spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "server %s does not advertise interfaces\n", + cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname); break; } @@ -396,7 +396,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) spin_lock(&ses->iface_lock); if (!ses->iface_count) { spin_unlock(&ses->iface_lock); - cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname); + cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname); return; } From f1b8224b4e6ed59e7e6f5c548673c67410098d8d Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Fri, 1 Mar 2024 17:53:44 +0300 Subject: [PATCH 219/496] cifs: open_cached_dir(): add FILE_READ_EA to desired access Since smb2_query_eas() reads EA and uses cached directory, open_cached_dir() should request FILE_READ_EA access. Otherwise listxattr() and getxattr() will fail with EACCES (0xc0000022 STATUS_ACCESS_DENIED SMB status). Link: https://bugzilla.kernel.org/show_bug.cgi?id=218543 Cc: stable@vger.kernel.org Signed-off-by: Eugene Korenevsky Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 3de5047a7ff9..a0017724d523 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -239,7 +239,8 @@ replay_again: .tcon = tcon, .path = path, .create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE), - .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES, + .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES | + FILE_READ_EA, .disposition = FILE_OPEN, .fid = pfid, .replay = !!(retries), From fc20c523211a38b87fc850a959cb2149e4fd64b0 Mon Sep 17 00:00:00 2001 From: Meetakshi Setiya Date: Thu, 14 Mar 2024 08:05:49 -0400 Subject: [PATCH 220/496] cifs: fixes for get_inode_info Fix potential memory leaks, add error checking, remove unnecessary initialisation of status_file_deleted and do not use cifs_iget() to get inode in reparse_info_to_fattr since fattrs may not be fully set. 
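The memory-leak part of that list follows a standard idiom; the sketch below is a hedged stand-in (malloc/free instead of alloc_dentry_path()/free_dentry_path(), invented helper name) showing why every early return taken after the allocation has to funnel through a single exit label.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int query_info(int have_query_op)
{
        char *page = malloc(4096);      /* stands in for alloc_dentry_path() */
        int rc = 0;

        if (!page)
                return -ENOMEM;

        if (!have_query_op) {
                rc = -ENOSYS;           /* returning here directly would leak 'page' */
                goto out;
        }

        /* ... build the path and query the server ... */

out:
        free(page);                     /* the one exit path releases it in every case */
        return rc;
}

int main(void)
{
        printf("rc=%d\n", query_info(0));
        printf("rc=%d\n", query_info(1));
        return 0;
}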
Fixes: ffceb7640cbf ("smb: client: do not defer close open handles to deleted files") Reported-by: Paulo Alcantara Signed-off-by: Meetakshi Setiya Signed-off-by: Steve French --- fs/smb/client/file.c | 1 - fs/smb/client/inode.c | 24 +++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index e7d5e8f972a0..16aadce492b2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -486,7 +486,6 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->uid = current_fsuid(); cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; - cfile->status_file_deleted = false; cfile->invalidHandle = false; cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 8177ec59afee..6092729bf7f6 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -820,8 +820,10 @@ cifs_get_file_info(struct file *filp) void *page = alloc_dentry_path(); const unsigned char *path; - if (!server->ops->query_file_info) + if (!server->ops->query_file_info) { + free_dentry_path(page); return -ENOSYS; + } xid = get_xid(); rc = server->ops->query_file_info(xid, tcon, cfile, &data); @@ -835,8 +837,8 @@ cifs_get_file_info(struct file *filp) } path = build_path_from_dentry(dentry, page); if (IS_ERR(path)) { - free_dentry_path(page); - return PTR_ERR(path); + rc = PTR_ERR(path); + goto cgfi_exit; } cifs_open_info_to_fattr(&fattr, &data, inode->i_sb); if (fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) @@ -1009,7 +1011,6 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, struct kvec rsp_iov, *iov = NULL; int rsp_buftype = CIFS_NO_BUFFER; u32 tag = data->reparse.tag; - struct inode *inode = NULL; int rc = 0; if (!tag && server->ops->query_reparse_point) { @@ -1049,12 +1050,8 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, if (tcon->posix_extensions) smb311_posix_info_to_fattr(fattr, data, sb); - else { + else cifs_open_info_to_fattr(fattr, data, sb); - inode = cifs_iget(sb, fattr); - if (inode && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) - cifs_mark_open_handles_for_deleted_file(inode, full_path); - } out: fattr->cf_cifstag = data->reparse.tag; free_rsp_buf(rsp_buftype, rsp_iov.iov_base); @@ -1109,9 +1106,9 @@ static int cifs_get_fattr(struct cifs_open_info_data *data, full_path, fattr); } else { cifs_open_info_to_fattr(fattr, data, sb); - if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) - cifs_mark_open_handles_for_deleted_file(*inode, full_path); } + if (!rc && fattr->cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(*inode, full_path); break; case -EREMOTE: /* DFS link, no metadata available on this server */ @@ -1340,6 +1337,8 @@ int smb311_posix_get_inode_info(struct inode **inode, goto out; rc = update_inode_info(sb, &fattr, inode); + if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(*inode, full_path); out: kfree(fattr.cf_symlink_target); return rc; @@ -1501,6 +1500,9 @@ iget_root: goto out; } + if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING) + cifs_mark_open_handles_for_deleted_file(inode, path); + if (rc && tcon->pipe) { cifs_dbg(FYI, "ipc connection - fake read inode\n"); spin_lock(&inode->i_lock); From 2760161d149f8d60c3f767fc62a823a1ead9d367 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 14 Mar 2024 23:36:36 +0530 Subject: [PATCH 221/496] cifs: remove redundant variable assignment This 
removes an unnecessary variable assignment. The assigned value will be overwritten by cifs_fattr_to_inode before it is accessed, making the line redundant. Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 6092729bf7f6..d28ab0af6049 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -401,7 +401,6 @@ cifs_get_file_info_unix(struct file *filp) cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb); } else if (rc == -EREMOTE) { cifs_create_junction_fattr(&fattr, inode->i_sb); - rc = 0; } else goto cifs_gfiunix_out; @@ -846,7 +845,6 @@ cifs_get_file_info(struct file *filp) break; case -EREMOTE: cifs_create_junction_fattr(&fattr, inode->i_sb); - rc = 0; break; case -EOPNOTSUPP: case -EINVAL: From 44d79142ede8162fd67bf8ca4ddbda1fbcfa94f1 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 14 Mar 2024 17:49:31 +0000 Subject: [PATCH 222/496] bpf: Temporarily disable atomic operations in BPF arena Currently, the x86 JIT handling PROBE_MEM32 tagged accesses is not equipped to handle atomic accesses into PTR_TO_ARENA, as no PROBE_MEM32 tagging is performed and no handling is enabled for them. This will lead to unsafety as the offset into arena will dereferenced directly without turning it into a base + offset access into the arena region. Since the changes to the x86 JIT will be fairly involved, for now, temporarily disallow use of PTR_TO_ARENA as the destination operand for atomics until support is added to the JIT backend. Fixes: 2fe99eb0ccf2 ("bpf: Add x86-64 JIT support for PROBE_MEM32 pseudo instructions.") Reported-by: Kumar Kartikeya Dwivedi Signed-off-by: Puranjay Mohan Acked-by: Kumar Kartikeya Dwivedi Message-ID: <20240314174931.98702-1-puranjay12@gmail.com> Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 63749ad5ac6b..1dd3b99d1bb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5682,6 +5682,13 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static bool is_arena_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return reg->type == PTR_TO_ARENA; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7019,7 +7026,8 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg)) { + is_sk_reg(env, insn->dst_reg) || + is_arena_reg(env, insn->dst_reg)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); From 386021394394eccef248dc5eb9c9370240821a8c Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 12 Mar 2024 11:39:07 -0700 Subject: [PATCH 223/496] drm/xe: Invalidate userptr VMA on page pin fault MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than return an error to the user or ban the VM when userptr VMA page pin fails with -EFAULT, invalidate VMA mappings. This supports the UMD use case of freeing userptr while still having bindings. 
Now that non-faulting VMs can invalidate VMAs, drop the usm prefix for the tile_invalidated member. v2: - Fix build error (CI) v3: - Don't invalidate VMA if in fault mode, rather kill VM (Thomas) - Update commit message with tile_invalidated name chagne (Thomas) - Wait VM bookkeep slots with VM resv lock (Thomas) v4: - Move list_del_init(&userptr.repin_link) after error check (Thomas) - Assert not in fault mode (Matthew) Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Brost Reviewed-by: Thomas Hellström Link: https://patchwork.freedesktop.org/patch/msgid/20240312183907.933835-1-matthew.brost@intel.com (cherry picked from commit 521db22a1d70dbc596a07544a738416025b1b63c) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_gt_pagefault.c | 4 ++-- drivers/gpu/drm/xe/xe_trace.h | 2 +- drivers/gpu/drm/xe/xe_vm.c | 32 +++++++++++++++++++++------- drivers/gpu/drm/xe/xe_vm_types.h | 7 ++---- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c index 73c535193a98..241c294270d9 100644 --- a/drivers/gpu/drm/xe/xe_gt_pagefault.c +++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c @@ -69,7 +69,7 @@ static bool access_is_atomic(enum access_type access_type) static bool vma_is_valid(struct xe_tile *tile, struct xe_vma *vma) { return BIT(tile->id) & vma->tile_present && - !(BIT(tile->id) & vma->usm.tile_invalidated); + !(BIT(tile->id) & vma->tile_invalidated); } static bool vma_matches(struct xe_vma *vma, u64 page_addr) @@ -226,7 +226,7 @@ retry_userptr: if (xe_vma_is_userptr(vma)) ret = xe_vma_userptr_check_repin(to_userptr_vma(vma)); - vma->usm.tile_invalidated &= ~BIT(tile->id); + vma->tile_invalidated &= ~BIT(tile->id); unlock_dma_resv: drm_exec_fini(&exec); diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h index 3b97633d81d8..d82c138f1ece 100644 --- a/drivers/gpu/drm/xe/xe_trace.h +++ b/drivers/gpu/drm/xe/xe_trace.h @@ -464,7 +464,7 @@ DEFINE_EVENT(xe_vma, xe_vma_userptr_invalidate, TP_ARGS(vma) ); -DEFINE_EVENT(xe_vma, xe_vma_usm_invalidate, +DEFINE_EVENT(xe_vma, xe_vma_invalidate, TP_PROTO(struct xe_vma *vma), TP_ARGS(vma) ); diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c index e3bde897f6e8..3dd8bb64bb00 100644 --- a/drivers/gpu/drm/xe/xe_vm.c +++ b/drivers/gpu/drm/xe/xe_vm.c @@ -708,6 +708,7 @@ int xe_vm_userptr_pin(struct xe_vm *vm) int err = 0; LIST_HEAD(tmp_evict); + xe_assert(vm->xe, !xe_vm_in_fault_mode(vm)); lockdep_assert_held_write(&vm->lock); /* Collect invalidated userptrs */ @@ -724,11 +725,27 @@ int xe_vm_userptr_pin(struct xe_vm *vm) list_for_each_entry_safe(uvma, next, &vm->userptr.repin_list, userptr.repin_link) { err = xe_vma_userptr_pin_pages(uvma); - if (err < 0) - return err; + if (err == -EFAULT) { + list_del_init(&uvma->userptr.repin_link); - list_del_init(&uvma->userptr.repin_link); - list_move_tail(&uvma->vma.combined_links.rebind, &vm->rebind_list); + /* Wait for pending binds */ + xe_vm_lock(vm, false); + dma_resv_wait_timeout(xe_vm_resv(vm), + DMA_RESV_USAGE_BOOKKEEP, + false, MAX_SCHEDULE_TIMEOUT); + + err = xe_vm_invalidate_vma(&uvma->vma); + xe_vm_unlock(vm); + if (err) + return err; + } else { + if (err < 0) + return err; + + list_del_init(&uvma->userptr.repin_link); + list_move_tail(&uvma->vma.combined_links.rebind, + &vm->rebind_list); + } } return 0; @@ -1987,7 +2004,7 @@ static int xe_vm_prefetch(struct xe_vm *vm, struct xe_vma *vma, return err; } - if (vma->tile_mask != (vma->tile_present 
& ~vma->usm.tile_invalidated)) { + if (vma->tile_mask != (vma->tile_present & ~vma->tile_invalidated)) { return xe_vm_bind(vm, vma, q, xe_vma_bo(vma), syncs, num_syncs, true, first_op, last_op); } else { @@ -3185,9 +3202,8 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) u8 id; int ret; - xe_assert(xe, xe_vm_in_fault_mode(xe_vma_vm(vma))); xe_assert(xe, !xe_vma_is_null(vma)); - trace_xe_vma_usm_invalidate(vma); + trace_xe_vma_invalidate(vma); /* Check that we don't race with page-table updates */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { @@ -3225,7 +3241,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) } } - vma->usm.tile_invalidated = vma->tile_mask; + vma->tile_invalidated = vma->tile_mask; return 0; } diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index 292f8cadb40f..713996f7dc59 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -83,11 +83,8 @@ struct xe_vma { struct work_struct destroy_work; }; - /** @usm: unified shared memory state */ - struct { - /** @tile_invalidated: VMA has been invalidated */ - u8 tile_invalidated; - } usm; + /** @tile_invalidated: VMA has been invalidated */ + u8 tile_invalidated; /** @tile_mask: Tile mask of where to create binding for this VMA */ u8 tile_mask; From d58b4ef63b5024993906e74f04fda8220ad4c162 Mon Sep 17 00:00:00 2001 From: Himal Prasad Ghimiray Date: Wed, 13 Mar 2024 20:35:44 +0530 Subject: [PATCH 224/496] drm/xe: Return if kobj creation is failed Return after warning regarding kobj creation failure. Fixes: 4ae3aeab32d7 ("drm/xe: Add vram frequency sysfs attributes") Cc: Sujaritha Sundaresan Cc: Tejas Upadhyay Cc: Bommu Krishnaiah Reviewed-by: Tejas Upadhyay Signed-off-by: Himal Prasad Ghimiray Link: https://patchwork.freedesktop.org/patch/msgid/20240313150545.2830408-2-himal.prasad.ghimiray@intel.com Signed-off-by: Rodrigo Vivi (cherry picked from commit 989d07ac6bb7d269e975f85e8f683f496faa0380) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_vram_freq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_vram_freq.c b/drivers/gpu/drm/xe/xe_vram_freq.c index 079cc283a186..c5f6b5a5d117 100644 --- a/drivers/gpu/drm/xe/xe_vram_freq.c +++ b/drivers/gpu/drm/xe/xe_vram_freq.c @@ -111,8 +111,10 @@ void xe_vram_freq_sysfs_init(struct xe_tile *tile) return; kobj = kobject_create_and_add("memory", tile->sysfs); - if (!kobj) + if (!kobj) { drm_warn(&xe->drm, "failed to add memory directory, err: %d\n", -ENOMEM); + return; + } err = sysfs_create_group(kobj, &freq_group_attrs); if (err) { From dd8a07f06dfd946e0eea1a3323d52e7c28a6ed80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Roberto=20de=20Souza?= Date: Wed, 13 Mar 2024 10:13:18 -0700 Subject: [PATCH 225/496] drm/xe: Skip VMAs pin when requesting signal to the last XE_EXEC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doing a XE_EXEC with num_batch_buffer == 0 makes signals passed as argument to be signaled when the last real XE_EXEC is completed. But to do that it was first pinning all VMAs in drm_gpuvm_exec_lock(), this patch remove this pinning as it is not required. This change also help Mesa implementing memory over-commiting recovery as it needs to unbind not needed VMAs when the whole VM can't fit in GPU memory but it can only do the unbiding when the last XE_EXEC is completed. So with this change Mesa can get the signal it want without getting out-of-memory errors. 
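As a rough illustration only, and not the xe code, the sketch below models the intended semantics with invented types: an exec with no batch buffers just ties its out-sync to the queue's last real fence, so nothing on that path needs the VMA pinning or VM locks that the full submission path takes.

#include <stdio.h>

struct fence { int seqno; };

struct queue {
        struct fence last_fence;        /* fence of the last real submission */
};

static void signal_sync(struct fence *out_sync, struct fence f)
{
        *out_sync = f;
}

static void submit(struct queue *q, int num_batch_buffers, struct fence *out_sync)
{
        if (!num_batch_buffers) {
                /* early path: no VMA pinning, just reuse the last real fence */
                signal_sync(out_sync, q->last_fence);
                return;
        }
        /* real path: lock VM, pin VMAs, ring the doorbell ... */
        q->last_fence.seqno++;
        signal_sync(out_sync, q->last_fence);
}

int main(void)
{
        struct queue q = { .last_fence = { .seqno = 0 } };
        struct fence sync;

        submit(&q, 1, &sync);
        printf("after real exec: sync waits on seqno %d\n", sync.seqno);
        submit(&q, 0, &sync);
        printf("after empty exec: sync waits on seqno %d\n", sync.seqno);
        return 0;
}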
Fixes: eb9702ad2986 ("drm/xe: Allow num_batch_buffer / num_binds == 0 in IOCTLs") Cc: Thomas Hellstrom Co-developed-by: Matthew Brost Signed-off-by: José Roberto de Souza Reviewed-by: Matthew Brost Signed-off-by: Matthew Brost Link: https://patchwork.freedesktop.org/patch/msgid/20240313171318.121066-1-jose.souza@intel.com (cherry picked from commit 58480c1c912ff8146d067301a0d04cca318b4a66) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_exec.c | 41 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c index 952496c6260d..826c8b389672 100644 --- a/drivers/gpu/drm/xe/xe_exec.c +++ b/drivers/gpu/drm/xe/xe_exec.c @@ -235,6 +235,29 @@ retry: goto err_unlock_list; } + if (!args->num_batch_buffer) { + err = xe_vm_lock(vm, true); + if (err) + goto err_unlock_list; + + if (!xe_vm_in_lr_mode(vm)) { + struct dma_fence *fence; + + fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm); + if (IS_ERR(fence)) { + err = PTR_ERR(fence); + goto err_unlock_list; + } + for (i = 0; i < num_syncs; i++) + xe_sync_entry_signal(&syncs[i], NULL, fence); + xe_exec_queue_last_fence_set(q, vm, fence); + dma_fence_put(fence); + } + + xe_vm_unlock(vm); + goto err_unlock_list; + } + vm_exec.vm = &vm->gpuvm; vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT; if (xe_vm_in_lr_mode(vm)) { @@ -254,24 +277,6 @@ retry: goto err_exec; } - if (!args->num_batch_buffer) { - if (!xe_vm_in_lr_mode(vm)) { - struct dma_fence *fence; - - fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm); - if (IS_ERR(fence)) { - err = PTR_ERR(fence); - goto err_exec; - } - for (i = 0; i < num_syncs; i++) - xe_sync_entry_signal(&syncs[i], NULL, fence); - xe_exec_queue_last_fence_set(q, vm, fence); - dma_fence_put(fence); - } - - goto err_exec; - } - if (xe_exec_queue_is_lr(q) && xe_exec_queue_ring_full(q)) { err = -EWOULDBLOCK; /* Aliased to -EAGAIN */ skip_retry = true; From 74098a989b9c3370f768140b7783a7aaec2759b3 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 26 Feb 2024 16:39:13 +0100 Subject: [PATCH 226/496] btrfs: zoned: use zone aware sb location for scrub At the moment scrub_supers() doesn't grab the super block's location via the zoned device aware btrfs_sb_log_location() but via btrfs_sb_offset(). This leads to checksum errors on 'scrub' as we're not accessing the correct location of the super block. So use btrfs_sb_log_location() for getting the super blocks location on scrub. 
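A hedged sketch of the control flow that hunk introduces, using a hypothetical sb_log_location() helper rather than the btrfs API: -ENOENT from the lookup ends the mirror walk, while any other error is counted as a super block error and the next mirror is tried.

#include <errno.h>
#include <stdio.h>

#define MIRROR_MAX 3

static int sb_log_location(int mirror, unsigned long long *bytenr)
{
        /* pretend mirror 1 hits an I/O error and mirror 2 does not exist */
        if (mirror == 2)
                return -ENOENT;
        if (mirror == 1)
                return -EIO;
        *bytenr = 65536ULL << mirror;
        return 0;
}

int main(void)
{
        unsigned long long bytenr;
        int super_errors = 0;

        for (int i = 0; i < MIRROR_MAX; i++) {
                int ret = sb_log_location(i, &bytenr);

                if (ret == -ENOENT)
                        break;          /* no such mirror: stop the walk */
                if (ret) {
                        super_errors++; /* real error: record it, try the next one */
                        continue;
                }
                printf("scrub super mirror %d at %llu\n", i, bytenr);
        }
        printf("super_errors=%d\n", super_errors);
        return 0;
}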
Reported-by: WA AM Link: http://lore.kernel.org/linux-btrfs/CANU2Z0EvUzfYxczLgGUiREoMndE9WdQnbaawV5Fv5gNXptPUKw@mail.gmail.com CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Qu Wenruo Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c4bd0e60db59..fa25004ab04e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2812,7 +2812,17 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); + ret = btrfs_sb_log_location(scrub_dev, i, 0, &bytenr); + if (ret == -ENOENT) + break; + + if (ret) { + spin_lock(&sctx->stat_lock); + sctx->stat.super_errors++; + spin_unlock(&sctx->stat_lock); + continue; + } + if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->commit_total_bytes) break; From 215b2bf72a05d702287fc45d854b4f5019f9aad1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Mar 2024 15:13:52 -0800 Subject: [PATCH 227/496] xfs: fix dev_t usage in xmbuf tracepoints Fix some inconsistencies in the xmbuf tracepoints -- they should be reporting the major/minor of the filesystem that they're associated with, so that we have some clue on whose behalf the xmbuf was created. Fix the xmbuf_free tracepoint to report the same. Don't call the trace function until the xmbuf is fully initialized. Fixes: 5076a6040ca1 ("xfs: support in-memory buffer cache target") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_buf_mem.c | 4 ++-- fs/xfs/xfs_trace.h | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 8ad38c64708e..9bb2d24de709 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -81,8 +81,6 @@ xmbuf_alloc( /* ensure all writes are below EOF to avoid pagecache zeroing */ i_size_write(inode, inode->i_sb->s_maxbytes); - trace_xmbuf_create(btp); - error = xfs_buf_cache_init(btp->bt_cache); if (error) goto out_file; @@ -99,6 +97,8 @@ xmbuf_alloc( if (error) goto out_bcache; + trace_xmbuf_create(btp); + *btpp = btp; return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 56b07d8ed431..aea97fc074f8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4626,6 +4626,7 @@ TRACE_EVENT(xmbuf_create, char *path; struct file *file = btp->bt_file; + __entry->dev = btp->bt_mount->m_super->s_dev; __entry->ino = file_inode(file)->i_ino; memset(pathname, 0, sizeof(pathname)); path = file_path(file, pathname, sizeof(pathname) - 1); @@ -4633,7 +4634,8 @@ TRACE_EVENT(xmbuf_create, path = "(unknown)"; strncpy(__entry->pathname, path, sizeof(__entry->pathname)); ), - TP_printk("xmino 0x%lx path '%s'", + TP_printk("dev %d:%d xmino 0x%lx path '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pathname) ); @@ -4642,6 +4644,7 @@ TRACE_EVENT(xmbuf_free, TP_PROTO(struct xfs_buftarg *btp), TP_ARGS(btp), TP_STRUCT__entry( + __field(dev_t, dev) __field(unsigned long, ino) __field(unsigned long long, bytes) __field(loff_t, size) @@ -4650,11 +4653,13 @@ TRACE_EVENT(xmbuf_free, struct file *file = btp->bt_file; struct inode *inode = file_inode(file); + __entry->dev = btp->bt_mount->m_super->s_dev; __entry->size = i_size_read(inode); __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes; __entry->ino = 
inode->i_ino; ), - TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", + TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->bytes, __entry->size) From 0c6ca06aad84bac097f5c005d911db92dba3ae94 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 15 Mar 2024 12:16:39 +1100 Subject: [PATCH 228/496] xfs: quota radix tree allocations need to be NOFS on insert In converting the XFS code from GFP_NOFS to scoped contexts, we converted the quota radix tree to GFP_KERNEL. Unfortunately, it was not clearly documented that this set was because there is a dependency on the quotainfo->qi_tree_lock being taken in memory reclaim to remove dquots from the radix tree. In hindsight this is obvious, but the radix tree allocations on insert are not immediately obvious, and we avoid this for the inode cache radix trees by using preloading and hence completely avoiding the radix tree node allocation under tree lock constraints. Hence there are a few solutions here. The first is to reinstate GFP_NOFS for the radix tree and add a comment explaining why GFP_NOFS is used. The second is to use memalloc_nofs_save() on the radix tree insert context, which makes it obvious that the radix tree insert runs under GFP_NOFS constraints. The third option is to simply replace the radix tree and it's lock with an xarray which can do memory allocation safely in an insert context. The first is OK, but not really the direction we want to head. The second is my preferred short term solution. The third - converting XFS radix trees to xarray - is the longer term solution. Hence to fix the regression here, we take option 2 as it moves us in the direction we want to head with memory allocation and GFP_NOFS removal. Reported-by: syzbot+8fdff861a781522bda4d@syzkaller.appspotmail.com Reported-by: syzbot+d247769793ec169e4bf9@syzkaller.appspotmail.com Fixes: 94a69db2367e ("xfs: use __GFP_NOLOCKDEP instead of GFP_NOFS") Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_dquot.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 30d36596a2e4..c98cb468c357 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -811,6 +811,12 @@ restart: * caller should throw away the dquot and start over. Otherwise, the dquot * is returned locked (and held by the cache) as if there had been a cache * hit. + * + * The insert needs to be done under memalloc_nofs context because the radix + * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in + * memory reclaim when freeing unused dquots, so we cannot have the radix tree + * node allocation recursing into filesystem reclaim whilst we hold the + * qi_tree_lock. */ static int xfs_qm_dqget_cache_insert( @@ -820,25 +826,27 @@ xfs_qm_dqget_cache_insert( xfs_dqid_t id, struct xfs_dquot *dqp) { + unsigned int nofs_flags; int error; + nofs_flags = memalloc_nofs_save(); mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { /* Duplicate found! Caller must try again. */ - mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); - return error; + goto out_unlock; } /* Return a locked dquot to the caller, with a reference taken. 
*/ xfs_dqlock(dqp); dqp->q_nrefs = 1; - qi->qi_dquots++; - mutex_unlock(&qi->qi_tree_lock); - return 0; +out_unlock: + mutex_unlock(&qi->qi_tree_lock); + memalloc_nofs_restore(nofs_flags); + return error; } /* Check our input parameters. */ From 748c7ebac8dbcf54c2840a9a74fc9efeb4d76b7f Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Thu, 14 Mar 2024 17:58:13 +0800 Subject: [PATCH 229/496] fbdev: uvesafb: Convert sprintf/snprintf to sysfs_emit Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). sprintf() will be converted as weel if they have. Generally, this patch is generated by make coccicheck M= MODE=patch \ COCCI=scripts/coccinelle/api/device_attr_show.cocci No functional change intended CC: Helge Deller CC: linux-fbdev@vger.kernel.org CC: dri-devel@lists.freedesktop.org Signed-off-by: Li Zhijian Signed-off-by: Helge Deller --- drivers/video/fbdev/uvesafb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c index e1f421e91b4f..73f00c079a94 100644 --- a/drivers/video/fbdev/uvesafb.c +++ b/drivers/video/fbdev/uvesafb.c @@ -1546,7 +1546,7 @@ static ssize_t uvesafb_show_vbe_ver(struct device *dev, struct fb_info *info = dev_get_drvdata(dev); struct uvesafb_par *par = info->par; - return snprintf(buf, PAGE_SIZE, "%.4x\n", par->vbe_ib.vbe_version); + return sysfs_emit(buf, "%.4x\n", par->vbe_ib.vbe_version); } static DEVICE_ATTR(vbe_version, S_IRUGO, uvesafb_show_vbe_ver, NULL); From 974191720ae76ba3613c4aa2301c297b4de27e46 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 29 Feb 2024 22:50:10 +1100 Subject: [PATCH 230/496] fbdev: mb862xxfb: Fix defined but not used error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit socrates_gc_mode is defined at the top-level but then only used inside an #ifdef CONFIG_FB_MB862XX_LIME, leading to an error with some configs: drivers/video/fbdev/mb862xx/mb862xxfbdrv.c:36:31: error: ‘socrates_gc_mode’ defined but not used 36 | static struct mb862xx_gc_mode socrates_gc_mode = { Fix it by moving socrates_gc_mode inside that ifdef, immediately prior to the only function where it's used. 
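The same class of warning can be reproduced in a few lines of plain C; in this hypothetical sketch the config-only data sits next to its only user under the guard, whereas defining it unconditionally at the top of the file is what makes non-SOCRATES builds emit "defined but not used".

#include <stdio.h>

#ifdef CONFIG_SOCRATES
/* only referenced below, so it lives under the same guard as its user */
static const int socrates_mode = 45;

static void print_mode(void)
{
        printf("mode=%d\n", socrates_mode);
}
#endif

int main(void)
{
#ifdef CONFIG_SOCRATES
        print_mode();
#else
        puts("socrates support not built in");
#endif
        return 0;
}

Compiling with and without -DCONFIG_SOCRATES is warning-free in both variants, which is the property the move above restores.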
Signed-off-by: Michael Ellerman Signed-off-by: Helge Deller --- drivers/video/fbdev/mb862xx/mb862xxfbdrv.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c b/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c index 7c402e9fd7a9..baec312d7b33 100644 --- a/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c +++ b/drivers/video/fbdev/mb862xx/mb862xxfbdrv.c @@ -32,15 +32,6 @@ #define CARMINE_MEM_SIZE 0x8000000 #define DRV_NAME "mb862xxfb" -#if defined(CONFIG_SOCRATES) -static struct mb862xx_gc_mode socrates_gc_mode = { - /* Mode for Prime View PM070WL4 TFT LCD Panel */ - { "800x480", 45, 800, 480, 40000, 86, 42, 33, 10, 128, 2, 0, 0, 0 }, - /* 16 bits/pixel, 16MB, 133MHz, SDRAM memory mode value */ - 16, 0x1000000, GC_CCF_COT_133, 0x4157ba63 -}; -#endif - /* Helpers */ static inline int h_total(struct fb_var_screeninfo *var) { @@ -666,6 +657,15 @@ static int mb862xx_gdc_init(struct mb862xxfb_par *par) return 0; } +#if defined(CONFIG_SOCRATES) +static struct mb862xx_gc_mode socrates_gc_mode = { + /* Mode for Prime View PM070WL4 TFT LCD Panel */ + { "800x480", 45, 800, 480, 40000, 86, 42, 33, 10, 128, 2, 0, 0, 0 }, + /* 16 bits/pixel, 16MB, 133MHz, SDRAM memory mode value */ + 16, 0x1000000, GC_CCF_COT_133, 0x4157ba63 +}; +#endif + static int of_platform_mb862xx_probe(struct platform_device *ofdev) { struct device_node *np = ofdev->dev.of_node; From bc87bb342f106a0402186bcb588fcbe945dced4b Mon Sep 17 00:00:00 2001 From: Aleksandr Burakov Date: Fri, 1 Mar 2024 14:35:43 +0300 Subject: [PATCH 231/496] fbdev: viafb: fix typo in hw_bitblt_1 and hw_bitblt_2 There are some actions with value 'tmp' but 'dst_addr' is checked instead. It is obvious that a copy-paste error was made here and the value of variable 'tmp' should be checked here. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Aleksandr Burakov Signed-off-by: Helge Deller --- drivers/video/fbdev/via/accel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/video/fbdev/via/accel.c b/drivers/video/fbdev/via/accel.c index 0a1bc7a4d785..1e04026f0809 100644 --- a/drivers/video/fbdev/via/accel.c +++ b/drivers/video/fbdev/via/accel.c @@ -115,7 +115,7 @@ static int hw_bitblt_1(void __iomem *engine, u8 op, u32 width, u32 height, if (op != VIA_BITBLT_FILL) { tmp = src_mem ? 0 : src_addr; - if (dst_addr & 0xE0000007) { + if (tmp & 0xE0000007) { printk(KERN_WARNING "hw_bitblt_1: Unsupported source " "address %X\n", tmp); return -EINVAL; @@ -260,7 +260,7 @@ static int hw_bitblt_2(void __iomem *engine, u8 op, u32 width, u32 height, writel(tmp, engine + 0x18); tmp = src_mem ? 0 : src_addr; - if (dst_addr & 0xE0000007) { + if (tmp & 0xE0000007) { printk(KERN_WARNING "hw_bitblt_2: Unsupported source " "address %X\n", tmp); return -EINVAL; From f1a785101d50f5844ed29341142e7224b87f705d Mon Sep 17 00:00:00 2001 From: Karolina Stolarek Date: Wed, 13 Mar 2024 15:21:42 +0100 Subject: [PATCH 232/496] drm/tests: Build KMS helpers when DRM_KUNIT_TEST_HELPERS is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 66671944e176 ("drm/tests: helpers: Add atomic helpers") introduced a dependency on CRTC helpers in KUnit test helpers. Select the former when building KUnit test helpers to avoid linker errors. 
Fixes: 66671944e176 ("drm/tests: helpers: Add atomic helpers") Cc: Maxime Ripard Cc: Maíra Canal Signed-off-by: Karolina Stolarek Link: https://lore.kernel.org/r/20240313142142.1318718-1-karolina.stolarek@intel.com Signed-off-by: Maxime Ripard --- drivers/gpu/drm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 872edb47bb53..ba45c998b9f8 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -68,6 +68,7 @@ config DRM_USE_DYNAMIC_DEBUG config DRM_KUNIT_TEST_HELPERS tristate depends on DRM && KUNIT + select DRM_KMS_HELPER help KUnit Helpers for KMS drivers. @@ -80,7 +81,6 @@ config DRM_KUNIT_TEST select DRM_EXEC select DRM_EXPORT_FOR_TESTS if m select DRM_GEM_SHMEM_HELPER - select DRM_KMS_HELPER select DRM_KUNIT_TEST_HELPERS select DRM_LIB_RANDOM select PRIME_NUMBERS From 5384cc0d1a88c27448a6a4e65b8abe6486de8012 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Mar 2024 10:34:43 +0800 Subject: [PATCH 233/496] scripts/bpf_doc: Use silent mode when exec make cmd When getting kernel version via make, the result may be polluted by other output, like directory change info. e.g. $ export MAKEFLAGS="-w" $ make kernelversion make: Entering directory '/home/net' 6.8.0 make: Leaving directory '/home/net' This will distort the reStructuredText output and make latter rst2man failed like: [...] bpf-helpers.rst:20: (WARNING/2) Field list ends without a blank line; unexpected unindent. [...] Using silent mode would help. e.g. $ make -s --no-print-directory kernelversion 6.8.0 Fixes: fd0a38f9c37d ("scripts/bpf: Set version attribute for bpf-helpers(7) man page") Signed-off-by: Michael Hofmann Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Acked-by: Alejandro Colomar Link: https://lore.kernel.org/bpf/20240315023443.2364442-1-liuhangbin@gmail.com --- scripts/bpf_doc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 4606944984ee..c55878bddfdd 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -414,8 +414,8 @@ class PrinterRST(Printer): version = version.stdout.decode().rstrip() except: try: - version = subprocess.run(['make', 'kernelversion'], cwd=linuxRoot, - capture_output=True, check=True) + version = subprocess.run(['make', '-s', '--no-print-directory', 'kernelversion'], + cwd=linuxRoot, capture_output=True, check=True) version = version.stdout.decode().rstrip() except: return 'Linux' From aae08491b9438347e9656c44021824ad236052b4 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 15 Mar 2024 13:36:05 +0000 Subject: [PATCH 234/496] MAINTAINERS: Update email address for Quentin Monnet With Isovalent being acquired by Cisco, I expect my related email address to disappear sooner or later. Update my email entries in MAINTAINERS and .mailmap with my kernel.org address instead. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/lkml/20240315133606.65971-1-qmo@kernel.org --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index e90797de3256..a5578ae99315 100644 --- a/.mailmap +++ b/.mailmap @@ -495,7 +495,8 @@ Prasad Sodagudi Punit Agrawal Qais Yousef Qais Yousef -Quentin Monnet +Quentin Monnet +Quentin Monnet Quentin Perret Rafael J. 
Wysocki Rajeev Nandan diff --git a/MAINTAINERS b/MAINTAINERS index 54775eaaf7b3..1a16ed5df18c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3941,7 +3941,7 @@ F: kernel/bpf/bpf_lru* F: kernel/bpf/cgroup.c BPF [TOOLING] (bpftool) -M: Quentin Monnet +M: Quentin Monnet L: bpf@vger.kernel.org S: Maintained F: kernel/bpf/disasm.* From 5e3afe580a9f5ca173a6bd55ffe10948796ef7e5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 15 Mar 2024 15:29:51 +0000 Subject: [PATCH 235/496] io_uring: fix poll_remove stalled req completion Taking the ctx lock is not enough to use the deferred request completion infrastructure, it'll get queued into the list but no one would expect it there, so it will sit there until next io_submit_flush_completions(). It's hard to care about the cancellation path, so complete it via tw. Fixes: ef7dfac51d8ed ("io_uring/poll: serialize poll linked timer start with poll removal") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c446740bc16858f8a2a8dcdce899812f21d15f23.1710514702.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 5f779139cae1..6db1dcb2c797 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -996,7 +996,6 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - struct io_tw_state ts = { .locked = true }; io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); @@ -1045,7 +1044,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - io_req_task_complete(preq, &ts); + preq->io_task_work.func = io_req_task_complete; + io_req_task_work_add(preq); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { From 8076972468584d4a21dab9aa50e388b3ea9ad8c7 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 26 Feb 2024 13:07:24 +0106 Subject: [PATCH 236/496] printk: Update @console_may_schedule in console_trylock_spinning() console_trylock_spinning() may takeover the console lock from a schedulable context. Update @console_may_schedule to make sure it reflects a trylock acquire. Reported-by: Mukesh Ojha Closes: https://lore.kernel.org/lkml/20240222090538.23017-1-quic_mojha@quicinc.com Fixes: dbdda842fe96 ("printk: Add console owner and waiter logic to load balance console writes") Signed-off-by: John Ogness Reviewed-by: Mukesh Ojha Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/875xybmo2z.fsf@jogness.linutronix.de Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b06f63e276c1..612c73333848 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2009,6 +2009,12 @@ static int console_trylock_spinning(void) */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + /* + * Update @console_may_schedule for trylock because the previous + * owner may have been schedulable. + */ + console_may_schedule = 0; + return 1; } From 33c3d813330718c403a60d220f03fbece0f4fb5c Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 20 Feb 2024 22:16:03 +0200 Subject: [PATCH 237/496] ASoC: SOF: amd: Move signed_fw_image to struct acp_quirk_entry The signed_fw_image member of struct sof_amd_acp_desc is used to enable signed firmware support in the driver via the acp_sof_quirk_table. In preparation to support additional use cases of the quirk table (i.e. 
adding new flags), move signed_fw_image to a new struct acp_quirk_entry and update all references to it accordingly. No functional changes intended. Signed-off-by: Cristian Ciocaltea Link: https://msgid.link/r/20240220201623.438944-2-cristian.ciocaltea@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp-loader.c | 2 +- sound/soc/sof/amd/acp.c | 45 ++++++++++++++++++---------------- sound/soc/sof/amd/acp.h | 6 ++++- sound/soc/sof/amd/vangogh.c | 9 +++++-- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/sound/soc/sof/amd/acp-loader.c b/sound/soc/sof/amd/acp-loader.c index d2d21478399e..aad904839b81 100644 --- a/sound/soc/sof/amd/acp-loader.c +++ b/sound/soc/sof/amd/acp-loader.c @@ -173,7 +173,7 @@ int acp_dsp_pre_fw_run(struct snd_sof_dev *sdev) adata = sdev->pdata->hw_pdata; - if (adata->signed_fw_image) + if (adata->quirks && adata->quirks->signed_fw_image) size_fw = adata->fw_bin_size - ACP_FIRMWARE_SIGNATURE; else size_fw = adata->fw_bin_size; diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 9b3c26210db3..9d9197fa83ed 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -20,12 +20,14 @@ #include "acp.h" #include "acp-dsp-offset.h" -#define SECURED_FIRMWARE 1 - static bool enable_fw_debug; module_param(enable_fw_debug, bool, 0444); MODULE_PARM_DESC(enable_fw_debug, "Enable Firmware debug"); +static struct acp_quirk_entry quirk_valve_galileo = { + .signed_fw_image = true, +}; + const struct dmi_system_id acp_sof_quirk_table[] = { { /* Steam Deck OLED device */ @@ -33,7 +35,7 @@ const struct dmi_system_id acp_sof_quirk_table[] = { DMI_MATCH(DMI_SYS_VENDOR, "Valve"), DMI_MATCH(DMI_PRODUCT_NAME, "Galileo"), }, - .driver_data = (void *)SECURED_FIRMWARE, + .driver_data = &quirk_valve_galileo, }, {} }; @@ -254,7 +256,7 @@ int configure_and_run_sha_dma(struct acp_dev_data *adata, void *image_addr, } } - if (adata->signed_fw_image) + if (adata->quirks && adata->quirks->signed_fw_image) snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_DMA_INCLUDE_HDR, ACP_SHA_HEADER); snd_sof_dsp_write(sdev, ACP_DSP_BAR, ACP_SHA_DMA_STRT_ADDR, start_addr); @@ -738,26 +740,27 @@ skip_soundwire: sdev->debug_box.offset = sdev->host_box.offset + sdev->host_box.size; sdev->debug_box.size = BOX_SIZE_1024; - adata->signed_fw_image = false; dmi_id = dmi_first_match(acp_sof_quirk_table); - if (dmi_id && dmi_id->driver_data) { - adata->fw_code_bin = devm_kasprintf(sdev->dev, GFP_KERNEL, - "sof-%s-code.bin", - chip->name); - if (!adata->fw_code_bin) { - ret = -ENOMEM; - goto free_ipc_irq; - } + if (dmi_id) { + adata->quirks = dmi_id->driver_data; - adata->fw_data_bin = devm_kasprintf(sdev->dev, GFP_KERNEL, - "sof-%s-data.bin", - chip->name); - if (!adata->fw_data_bin) { - ret = -ENOMEM; - goto free_ipc_irq; - } + if (adata->quirks->signed_fw_image) { + adata->fw_code_bin = devm_kasprintf(sdev->dev, GFP_KERNEL, + "sof-%s-code.bin", + chip->name); + if (!adata->fw_code_bin) { + ret = -ENOMEM; + goto free_ipc_irq; + } - adata->signed_fw_image = dmi_id->driver_data; + adata->fw_data_bin = devm_kasprintf(sdev->dev, GFP_KERNEL, + "sof-%s-data.bin", + chip->name); + if (!adata->fw_data_bin) { + ret = -ENOMEM; + goto free_ipc_irq; + } + } } adata->enable_fw_debug = enable_fw_debug; diff --git a/sound/soc/sof/amd/acp.h b/sound/soc/sof/amd/acp.h index 947068da39b5..b648ed194b9f 100644 --- a/sound/soc/sof/amd/acp.h +++ b/sound/soc/sof/amd/acp.h @@ -207,6 +207,10 @@ struct sof_amd_acp_desc { u64 sdw_acpi_dev_addr; }; +struct acp_quirk_entry { + bool signed_fw_image; +}; + /* 
Common device data struct for ACP devices */ struct acp_dev_data { struct snd_sof_dev *dev; @@ -236,7 +240,7 @@ struct acp_dev_data { u8 *data_buf; dma_addr_t sram_dma_addr; u8 *sram_data_buf; - bool signed_fw_image; + struct acp_quirk_entry *quirks; struct dma_descriptor dscr_info[ACP_MAX_DESC]; struct acp_dsp_stream stream_buf[ACP_MAX_STREAM]; struct acp_dsp_stream *dtrace_stream; diff --git a/sound/soc/sof/amd/vangogh.c b/sound/soc/sof/amd/vangogh.c index de15d21aa6d9..bc6ffdb5471a 100644 --- a/sound/soc/sof/amd/vangogh.c +++ b/sound/soc/sof/amd/vangogh.c @@ -143,6 +143,7 @@ EXPORT_SYMBOL_NS(sof_vangogh_ops, SND_SOC_SOF_AMD_COMMON); int sof_vangogh_ops_init(struct snd_sof_dev *sdev) { const struct dmi_system_id *dmi_id; + struct acp_quirk_entry *quirks; /* common defaults */ memcpy(&sof_vangogh_ops, &sof_acp_common_ops, sizeof(struct snd_sof_dsp_ops)); @@ -151,8 +152,12 @@ int sof_vangogh_ops_init(struct snd_sof_dev *sdev) sof_vangogh_ops.num_drv = ARRAY_SIZE(vangogh_sof_dai); dmi_id = dmi_first_match(acp_sof_quirk_table); - if (dmi_id && dmi_id->driver_data) - sof_vangogh_ops.load_firmware = acp_sof_load_signed_firmware; + if (dmi_id) { + quirks = dmi_id->driver_data; + + if (quirks->signed_fw_image) + sof_vangogh_ops.load_firmware = acp_sof_load_signed_firmware; + } return 0; } From 094d11768f740f11483dad4efcd9bbcffa4ce146 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 20 Feb 2024 22:16:04 +0200 Subject: [PATCH 238/496] ASoC: SOF: amd: Skip IRAM/DRAM size modification for Steam Deck OLED The recent introduction of the ACP/PSP communication for IRAM/DRAM fence register modification breaks the audio support on Valve's Steam Deck OLED device. It causes IPC timeout errors when trying to load DSP topology during probing: 1707255557.688176 kernel: snd_sof_amd_vangogh 0000:04:00.5: ipc tx timed out for 0x30100000 (msg/reply size: 48/0) 1707255557.689035 kernel: snd_sof_amd_vangogh 0000:04:00.5: ------------[ IPC dump start ]------------ 1707255557.689421 kernel: snd_sof_amd_vangogh 0000:04:00.5: dsp_msg = 0x0 dsp_ack = 0x91d14f6f host_msg = 0x1 host_ack = 0xead0f1a4 irq_stat > 1707255557.689730 kernel: snd_sof_amd_vangogh 0000:04:00.5: ------------[ IPC dump end ]------------ 1707255557.690074 kernel: snd_sof_amd_vangogh 0000:04:00.5: ------------[ DSP dump start ]------------ 1707255557.690376 kernel: snd_sof_amd_vangogh 0000:04:00.5: IPC timeout 1707255557.690744 kernel: snd_sof_amd_vangogh 0000:04:00.5: fw_state: SOF_FW_BOOT_COMPLETE (7) 1707255557.691037 kernel: snd_sof_amd_vangogh 0000:04:00.5: invalid header size 0xdb43fe7. 
FW oops is bogus 1707255557.694824 kernel: snd_sof_amd_vangogh 0000:04:00.5: unexpected fault 0x6942d3b3 trace 0x6942d3b3 1707255557.695392 kernel: snd_sof_amd_vangogh 0000:04:00.5: ------------[ DSP dump end ]------------ 1707255557.695755 kernel: snd_sof_amd_vangogh 0000:04:00.5: Failed to setup widget PIPELINE.6.ACPHS1.IN 1707255557.696069 kernel: snd_sof_amd_vangogh 0000:04:00.5: error: tplg component load failed -110 1707255557.696374 kernel: snd_sof_amd_vangogh 0000:04:00.5: error: failed to load DSP topology -22 1707255557.697904 kernel: snd_sof_amd_vangogh 0000:04:00.5: ASoC: error at snd_soc_component_probe on 0000:04:00.5: -22 1707255557.698405 kernel: sof_mach nau8821-max: ASoC: failed to instantiate card -22 1707255557.701061 kernel: sof_mach nau8821-max: error -EINVAL: Failed to register card(sof-nau8821-max) 1707255557.701624 kernel: sof_mach: probe of nau8821-max failed with error -22 Introduce a new member skip_iram_dram_size_mod to struct acp_quirk_entry and use it to skip IRAM/DRAM size modification for Vangogh Galileo device. Fixes: 55d7bbe43346 ("ASoC: SOF: amd: Add acp-psp mailbox interface for iram-dram fence register modification") Signed-off-by: Cristian Ciocaltea Link: https://msgid.link/r/20240220201623.438944-3-cristian.ciocaltea@collabora.com Signed-off-by: Mark Brown --- sound/soc/sof/amd/acp.c | 3 ++- sound/soc/sof/amd/acp.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/sof/amd/acp.c b/sound/soc/sof/amd/acp.c index 9d9197fa83ed..be7dc1e02284 100644 --- a/sound/soc/sof/amd/acp.c +++ b/sound/soc/sof/amd/acp.c @@ -26,6 +26,7 @@ MODULE_PARM_DESC(enable_fw_debug, "Enable Firmware debug"); static struct acp_quirk_entry quirk_valve_galileo = { .signed_fw_image = true, + .skip_iram_dram_size_mod = true, }; const struct dmi_system_id acp_sof_quirk_table[] = { @@ -280,7 +281,7 @@ int configure_and_run_sha_dma(struct acp_dev_data *adata, void *image_addr, } /* psp_send_cmd only required for vangogh platform (rev - 5) */ - if (desc->rev == 5) { + if (desc->rev == 5 && !(adata->quirks && adata->quirks->skip_iram_dram_size_mod)) { /* Modify IRAM and DRAM size */ ret = psp_send_cmd(adata, MBOX_ACP_IRAM_DRAM_FENCE_COMMAND | IRAM_DRAM_FENCE_2); if (ret) diff --git a/sound/soc/sof/amd/acp.h b/sound/soc/sof/amd/acp.h index b648ed194b9f..e229bb6b849d 100644 --- a/sound/soc/sof/amd/acp.h +++ b/sound/soc/sof/amd/acp.h @@ -209,6 +209,7 @@ struct sof_amd_acp_desc { struct acp_quirk_entry { bool signed_fw_image; + bool skip_iram_dram_size_mod; }; /* Common device data struct for ACP devices */ From 2bb7e0c49302feec1c2f777bbfe8726169986ed8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 4 Mar 2024 09:02:47 +0100 Subject: [PATCH 239/496] riscv: Fix compilation error with FAST_GUP and rv32 By surrounding the definition of pte_leaf_size() with a ifdef napot as it should have been. 
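The pattern behind the fix, sketched as standalone C with stand-in values: the arch-specific override is defined only when the feature is configured, and a generic #ifndef fallback (the common pgtable headers provide one that expands to PAGE_SIZE) covers every other build, so configurations without NAPOT never reference helpers that do not exist.

#include <stdio.h>

#define PAGE_SIZE 4096UL

#ifdef CONFIG_RISCV_ISA_SVNAPOT
/* stand-in for napot_cont_size(napot_cont_order(pte)) */
#define napot_cont_size()       (16UL * PAGE_SIZE)
#define pte_leaf_size(napot)    ((napot) ? napot_cont_size() : PAGE_SIZE)
#endif

/* generic fallback used when the arch did not provide an override */
#ifndef pte_leaf_size
#define pte_leaf_size(napot)    PAGE_SIZE
#endif

int main(void)
{
        printf("leaf size: %lu bytes\n", pte_leaf_size(1));
        return 0;
}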
Fixes: e0fe5ab4192c ("riscv: Fix pte_leaf_size() for NAPOT") Signed-off-by: Alexandre Ghiti Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20240304080247.387710-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 89f5f1bd6e46..2fdf7c85066f 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -439,9 +439,11 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } +#ifdef CONFIG_RISCV_ISA_SVNAPOT #define pte_leaf_size(pte) (pte_napot(pte) ? \ napot_cont_size(napot_cont_order(pte)) :\ PAGE_SIZE) +#endif #ifdef CONFIG_NUMA_BALANCING /* From 700c2d9b1b179e723a1b6201d3307c37ede90f99 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Wed, 21 Feb 2024 18:02:52 +0800 Subject: [PATCH 240/496] riscv: vector: Fix a typo of preempt_v The term "preempt_v" represents the RISCV_PREEMPT_V field of riscv_v_flags and is used in lots of comments. Here corrects the miss-spelling "prempt_v". And s/acheived/achieved/. Reviewed-by: Andy Chiu Signed-off-by: Song Shuai Link: https://lore.kernel.org/r/20240221100252.3990445-1-songshuaishuai@tinylab.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/simd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/simd.h b/arch/riscv/include/asm/simd.h index 54efbf523d49..adb50f3ec205 100644 --- a/arch/riscv/include/asm/simd.h +++ b/arch/riscv/include/asm/simd.h @@ -34,9 +34,9 @@ static __must_check inline bool may_use_simd(void) return false; /* - * Nesting is acheived in preempt_v by spreading the control for + * Nesting is achieved in preempt_v by spreading the control for * preemptible and non-preemptible kernel-mode Vector into two fields. - * Always try to match with prempt_v if kernel V-context exists. Then, + * Always try to match with preempt_v if kernel V-context exists. Then, * fallback to check non preempt_v if nesting happens, or if the config * is not set. */ From ee498a38f3177d9ee0213839d3a05b94272aa48c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:31 -0700 Subject: [PATCH 241/496] bpf: Clarify bpf_arena comments. Clarify two bpf_arena comments, use existing SZ_4G #define, improve page_cnt check. 
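That page_cnt change is the usual overflow-safe form of a range test; as a standalone sketch with invented values: once page_cnt is known to be at most page_cnt_max, pgoff > page_cnt_max - page_cnt cannot wrap, whereas pgoff + page_cnt > page_cnt_max can for large untrusted inputs.

#include <limits.h>
#include <stdio.h>

static int range_fits(unsigned long pgoff, unsigned long page_cnt,
                      unsigned long page_cnt_max)
{
        if (page_cnt > page_cnt_max)
                return 0;
        /* the subtraction cannot underflow here, and no addition can overflow */
        return pgoff <= page_cnt_max - page_cnt;
}

int main(void)
{
        unsigned long max = 1UL << 20;

        printf("%d\n", range_fits(10, 100, max));       /* 1: inside the range */
        printf("%d\n", range_fits(max, 1, max));        /* 0: would end past the range */
        printf("%d\n", range_fits(1, ULONG_MAX, max));  /* 0: rejected without wrapping */
        return 0;
}

The additive form would have accepted the last case on a 32-bit-style wraparound, which is the kind of hole the rewritten check closes.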
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-2-alexei.starovoitov@gmail.com --- kernel/bpf/arena.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 86571e760dd6..343c3456c8dd 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -38,7 +38,7 @@ /* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */ #define GUARD_SZ (1ull << sizeof(((struct bpf_insn *)0)->off) * 8) -#define KERN_VM_SZ ((1ull << 32) + GUARD_SZ) +#define KERN_VM_SZ (SZ_4G + GUARD_SZ) struct bpf_arena { struct bpf_map map; @@ -110,7 +110,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr) return ERR_PTR(-EINVAL); vm_range = (u64)attr->max_entries * PAGE_SIZE; - if (vm_range > (1ull << 32)) + if (vm_range > SZ_4G) return ERR_PTR(-E2BIG); if ((attr->map_extra >> 32) != ((attr->map_extra + vm_range - 1) >> 32)) @@ -301,7 +301,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad if (pgoff) return -EINVAL; - if (len > (1ull << 32)) + if (len > SZ_4G) return -E2BIG; /* if user_vm_start was specified at arena creation time */ @@ -322,7 +322,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad if (WARN_ON_ONCE(arena->user_vm_start)) /* checks at map creation time should prevent this */ return -EFAULT; - return round_up(ret, 1ull << 32); + return round_up(ret, SZ_4G); } static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) @@ -346,7 +346,7 @@ static int arena_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) return -EBUSY; /* Earlier checks should prevent this */ - if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > (1ull << 32) || vma->vm_pgoff)) + if (WARN_ON_ONCE(vma->vm_end - vma->vm_start > SZ_4G || vma->vm_pgoff)) return -EFAULT; if (remember_vma(arena, vma)) @@ -420,7 +420,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (uaddr & ~PAGE_MASK) return 0; pgoff = compute_pgoff(arena, uaddr); - if (pgoff + page_cnt > page_cnt_max) + if (pgoff > page_cnt_max - page_cnt) /* requested address will be outside of user VMA */ return 0; } @@ -447,7 +447,13 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt goto out; uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE); - /* Earlier checks make sure that uaddr32 + page_cnt * PAGE_SIZE will not overflow 32-bit */ + /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1 + * will not overflow 32-bit. Lower 32-bit need to represent + * contiguous user address range. + * Map these pages at kern_vm_start base. + * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow + * lower 32-bit and it's ok. + */ ret = vm_area_map_pages(arena->kern_vm, kern_vm_start + uaddr32, kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE, pages); if (ret) { @@ -510,6 +516,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) if (!page) continue; if (page_cnt == 1 && page_mapped(page)) /* mapped by some user process */ + /* Optimization for the common case of page_cnt==1: + * If page wasn't mapped into some user vma there + * is no need to call zap_pages which is slow. When + * page_cnt is big it's faster to do the batched zap. 
+ */ zap_pages(arena, full_uaddr, 1); vm_area_unmap_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE); __free_page(page); From 10ebe835c937a11870690aa44c7c970fe906ff54 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:32 -0700 Subject: [PATCH 242/496] libbpf, selftests/bpf: Adjust libbpf, bpftool, selftests to match LLVM The selftests use to tell LLVM about special pointers. For LLVM there is nothing "arena" about them. They are simply pointers in a different address space. Hence LLVM diff https://github.com/llvm/llvm-project/pull/85161 renamed: . macro __BPF_FEATURE_ARENA_CAST -> __BPF_FEATURE_ADDR_SPACE_CAST . global variables in __attribute__((address_space(N))) are now placed in section named ".addr_space.N" instead of ".arena.N". Adjust libbpf, bpftool, and selftests to match LLVM. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-3-alexei.starovoitov@gmail.com --- tools/bpf/bpftool/gen.c | 2 +- tools/lib/bpf/libbpf.c | 2 +- tools/testing/selftests/bpf/bpf_arena_common.h | 2 +- tools/testing/selftests/bpf/progs/arena_htab.c | 2 +- tools/testing/selftests/bpf/progs/arena_list.c | 10 +++++----- tools/testing/selftests/bpf/progs/verifier_arena.c | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 4fa4ade1ce74..540c0f2c4fda 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -121,7 +121,7 @@ static bool get_datasec_ident(const char *sec_name, char *buf, size_t buf_sz) int i, n; /* recognize hard coded LLVM section name */ - if (strcmp(sec_name, ".arena.1") == 0) { + if (strcmp(sec_name, ".addr_space.1") == 0) { /* this is the name to use in skeleton */ snprintf(buf, buf_sz, "arena"); return true; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index efab29b8935b..36e26f4f5997 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -498,7 +498,7 @@ struct bpf_struct_ops { #define KSYMS_SEC ".ksyms" #define STRUCT_OPS_SEC ".struct_ops" #define STRUCT_OPS_LINK_SEC ".struct_ops.link" -#define ARENA_SEC ".arena.1" +#define ARENA_SEC ".addr_space.1" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, diff --git a/tools/testing/selftests/bpf/bpf_arena_common.h b/tools/testing/selftests/bpf/bpf_arena_common.h index bcf195c64a45..567491f3e1b5 100644 --- a/tools/testing/selftests/bpf/bpf_arena_common.h +++ b/tools/testing/selftests/bpf/bpf_arena_common.h @@ -32,7 +32,7 @@ */ #endif -#if defined(__BPF_FEATURE_ARENA_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) #define __arena __attribute__((address_space(1))) #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ #define cast_user(ptr) /* nop for bpf prog. 
emitted by LLVM */ diff --git a/tools/testing/selftests/bpf/progs/arena_htab.c b/tools/testing/selftests/bpf/progs/arena_htab.c index b7bb712cacfd..1e6ac187a6a0 100644 --- a/tools/testing/selftests/bpf/progs/arena_htab.c +++ b/tools/testing/selftests/bpf/progs/arena_htab.c @@ -22,7 +22,7 @@ int zero = 0; SEC("syscall") int arena_htab_llvm(void *ctx) { -#if defined(__BPF_FEATURE_ARENA_CAST) || defined(BPF_ARENA_FORCE_ASM) +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) || defined(BPF_ARENA_FORCE_ASM) struct htab __arena *htab; __u64 i; diff --git a/tools/testing/selftests/bpf/progs/arena_list.c b/tools/testing/selftests/bpf/progs/arena_list.c index cd35b8448435..c0422c58cee2 100644 --- a/tools/testing/selftests/bpf/progs/arena_list.c +++ b/tools/testing/selftests/bpf/progs/arena_list.c @@ -30,13 +30,13 @@ int list_sum; int cnt; bool skip = false; -#ifdef __BPF_FEATURE_ARENA_CAST +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST long __arena arena_sum; int __arena test_val = 1; struct arena_list_head __arena global_head; #else -long arena_sum SEC(".arena.1"); -int test_val SEC(".arena.1"); +long arena_sum SEC(".addr_space.1"); +int test_val SEC(".addr_space.1"); #endif int zero; @@ -44,7 +44,7 @@ int zero; SEC("syscall") int arena_list_add(void *ctx) { -#ifdef __BPF_FEATURE_ARENA_CAST +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST __u64 i; list_head = &global_head; @@ -66,7 +66,7 @@ int arena_list_add(void *ctx) SEC("syscall") int arena_list_del(void *ctx) { -#ifdef __BPF_FEATURE_ARENA_CAST +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST struct elem __arena *n; int sum = 0; diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 5540b05ff9ee..969bc091060b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -19,7 +19,7 @@ SEC("syscall") __success __retval(0) int basic_alloc1(void *ctx) { -#if defined(__BPF_FEATURE_ARENA_CAST) +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile int __arena *page1, *page2, *no_page, *page3; page1 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); @@ -58,7 +58,7 @@ SEC("syscall") __success __retval(0) int basic_alloc2(void *ctx) { -#if defined(__BPF_FEATURE_ARENA_CAST) +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) volatile char __arena *page1, *page2, *page3, *page4; page1 = bpf_arena_alloc_pages(&arena, NULL, 2, NUMA_NO_NODE, 0); From 9a2d5a966b47e5657b22dfa257365b7ef2abc3c0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:33 -0700 Subject: [PATCH 243/496] selftests/bpf: Remove hard coded PAGE_SIZE macro. Remove hard coded PAGE_SIZE. Add #include instead (that works on x86-64 and s390) and fallback to slow getpagesize() for aarch64. 
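A minimal sketch of the portable pattern applied in the hunks below; the header names here (sys/user.h for the arch-provided definition, unistd.h for getpagesize()) are assumptions spelled out for clarity, since the include targets are elided in the diff context:

  /* Prefer the arch-provided PAGE_SIZE, else ask libc at runtime. */
  #include <sys/user.h>           /* assumed: provides PAGE_SIZE on x86-64/s390 */
  #ifndef PAGE_SIZE               /* e.g. on aarch64 */
  #include <unistd.h>             /* assumed: getpagesize() */
  #define PAGE_SIZE getpagesize()
  #endif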
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-4-alexei.starovoitov@gmail.com --- tools/testing/selftests/bpf/prog_tests/arena_htab.c | 8 +++++--- tools/testing/selftests/bpf/prog_tests/arena_list.c | 7 +++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/arena_htab.c b/tools/testing/selftests/bpf/prog_tests/arena_htab.c index 0766702de846..d69fd2465f53 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_htab.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_htab.c @@ -3,12 +3,14 @@ #include #include #include - +#include +#ifndef PAGE_SIZE /* on some archs it comes in sys/user.h */ +#include +#define PAGE_SIZE getpagesize() +#endif #include "arena_htab_asm.skel.h" #include "arena_htab.skel.h" -#define PAGE_SIZE 4096 - #include "bpf_arena_htab.h" static void test_arena_htab_common(struct htab *htab) diff --git a/tools/testing/selftests/bpf/prog_tests/arena_list.c b/tools/testing/selftests/bpf/prog_tests/arena_list.c index e61886debab1..d15867cddde0 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_list.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_list.c @@ -3,8 +3,11 @@ #include #include #include - -#define PAGE_SIZE 4096 +#include +#ifndef PAGE_SIZE /* on some archs it comes in sys/user.h */ +#include +#define PAGE_SIZE getpagesize() +#endif #include "bpf_arena_list.h" #include "arena_list.skel.h" From a90c5845db958701ddc7659bc4f6db6fa647e449 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Mar 2024 19:18:34 -0700 Subject: [PATCH 244/496] selftests/bpf: Add arena test case for 4Gbyte corner case Check that 4Gbyte arena can be allocated and overflow/underflow access in the first and the last page behaves as expected. 
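A quick cross-check of the constants involved, as illustration only: with 4 KiB pages the 4 GiB arena amounts to 1048576 map entries, the last valid page starts at base + ARENA_SIZE - PAGE_SIZE, and base + ARENA_SIZE is one past the end, which is why the third allocation in the test must fail.

  /* Illustrative arithmetic only; the 4 KiB page size is an assumption. */
  #define DEMO_ARENA_SZ (1ull << 32)     /* 4 GiB */
  #define DEMO_PAGE_SZ  4096ull          /* assumed page size */
  _Static_assert(DEMO_ARENA_SZ / DEMO_PAGE_SZ == 1048576, "map entries");
  _Static_assert(DEMO_ARENA_SZ - DEMO_PAGE_SZ == 0xfffff000ull, "last page offset");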
Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20240315021834.62988-5-alexei.starovoitov@gmail.com --- .../selftests/bpf/prog_tests/verifier.c | 2 + .../bpf/progs/verifier_arena_large.c | 68 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/verifier_arena_large.c diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 985273832f89..c4f9f306646e 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -5,6 +5,7 @@ #include "cap_helpers.h" #include "verifier_and.skel.h" #include "verifier_arena.skel.h" +#include "verifier_arena_large.skel.h" #include "verifier_array_access.skel.h" #include "verifier_basic_stack.skel.h" #include "verifier_bitfield_write.skel.h" @@ -120,6 +121,7 @@ static void run_tests_aux(const char *skel_name, void test_verifier_and(void) { RUN(verifier_and); } void test_verifier_arena(void) { RUN(verifier_arena); } +void test_verifier_arena_large(void) { RUN(verifier_arena_large); } void test_verifier_basic_stack(void) { RUN(verifier_basic_stack); } void test_verifier_bitfield_write(void) { RUN(verifier_bitfield_write); } void test_verifier_bounds(void) { RUN(verifier_bounds); } diff --git a/tools/testing/selftests/bpf/progs/verifier_arena_large.c b/tools/testing/selftests/bpf/progs/verifier_arena_large.c new file mode 100644 index 000000000000..ef66ea460264 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_arena_large.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_experimental.h" +#include "bpf_arena_common.h" + +#define ARENA_SIZE (1ull << 32) + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, ARENA_SIZE / PAGE_SIZE); +} arena SEC(".maps"); + +SEC("syscall") +__success __retval(0) +int big_alloc1(void *ctx) +{ +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) + volatile char __arena *page1, *page2, *no_page, *page3; + void __arena *base; + + page1 = base = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page1) + return 1; + *page1 = 1; + page2 = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE - PAGE_SIZE, + 1, NUMA_NO_NODE, 0); + if (!page2) + return 2; + *page2 = 2; + no_page = bpf_arena_alloc_pages(&arena, base + ARENA_SIZE, + 1, NUMA_NO_NODE, 0); + if (no_page) + return 3; + if (*page1 != 1) + return 4; + if (*page2 != 2) + return 5; + bpf_arena_free_pages(&arena, (void __arena *)page1, 1); + if (*page2 != 2) + return 6; + if (*page1 != 0) /* use-after-free should return 0 */ + return 7; + page3 = bpf_arena_alloc_pages(&arena, NULL, 1, NUMA_NO_NODE, 0); + if (!page3) + return 8; + *page3 = 3; + if (page1 != page3) + return 9; + if (*page2 != 2) + return 10; + if (*(page1 + PAGE_SIZE) != 0) + return 11; + if (*(page1 - PAGE_SIZE) != 0) + return 12; + if (*(page2 + PAGE_SIZE) != 0) + return 13; + if (*(page2 - PAGE_SIZE) != 0) + return 14; +#endif + return 0; +} +char _license[] SEC("license") = "GPL"; From 30dab608c3cb99c2a05b76289fd05551703979ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Mar 2024 15:37:15 -0600 Subject: [PATCH 245/496] io_uring/futex: always remove futex entry for cancel all We know the request is either being removed, or already in the process of being 
removed through task_work, so we can delete it from our futex list upfront. This is important for remove all conditions, as we otherwise will find it multiple times and prevent cancelation progress. Cc: stable@vger.kernel.org Fixes: 194bb58c6090 ("io_uring: add support for futex wake and wait") Fixes: 8f350194d5cf ("io_uring: add support for vectored futex waits") Signed-off-by: Jens Axboe --- io_uring/futex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/futex.c b/io_uring/futex.c index 3c3575303c3d..792a03df58de 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -159,6 +159,7 @@ bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; + hlist_del_init(&req->hash_node); __io_futex_cancel(ctx, req); found = true; } From 2b35b8b43e07b1a6f06fdd84cf4b9eb24785896d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Mar 2024 15:42:49 -0600 Subject: [PATCH 246/496] io_uring/waitid: always remove waitid entry for cancel all We know the request is either being removed, or already in the process of being removed through task_work, so we can delete it from our waitid list upfront. This is important for remove all conditions, as we otherwise will find it multiple times and prevent cancelation progress. Remove the dead check in cancelation as well for the hash_node being empty or not. We already have a waitid reference check for ownership, so we don't need to check the list too. Cc: stable@vger.kernel.org Fixes: f31ecf671ddc ("io_uring: add IORING_OP_WAITID support") Signed-off-by: Jens Axboe --- io_uring/waitid.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 6f851978606d..77d340666cb9 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -125,12 +125,6 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) lockdep_assert_held(&req->ctx->uring_lock); - /* - * Did cancel find it meanwhile? - */ - if (hlist_unhashed(&req->hash_node)) - return; - hlist_del_init(&req->hash_node); ret = io_waitid_finish(req, ret); @@ -202,6 +196,7 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { if (!io_match_task_safe(req, task, cancel_all)) continue; + hlist_del_init(&req->hash_node); __io_waitid_cancel(ctx, req); found = true; } From 152609795dbf02f004c86049b75c23f4e68071d8 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Sat, 16 Mar 2024 01:10:21 +0100 Subject: [PATCH 247/496] fbcon: Increase maximum font width x height to 64 x 128 By using bitmaps we actually support whatever size we would want, but the console currently limits fonts to 64x128 (which gives 60x16 text on 4k screens), so we don't need more for now, and we can easily increase later. 
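For illustration (a standalone userspace-style sketch with made-up names; the kernel code below uses DECLARE_BITMAP(), set_bit() and test_bit() instead): replacing the u32 mask with a bitmap keeps the "bit w - 1 set means width w is supported" encoding while lifting the 32-pixel ceiling.

  /* Demo of the bit-per-dimension encoding without the 32-bit limit. */
  #include <limits.h>
  #include <stdbool.h>

  #define DEMO_MAX_WIDTH  64
  #define LONG_BITS       (sizeof(unsigned long) * CHAR_BIT)

  static unsigned long blit_x[(DEMO_MAX_WIDTH + LONG_BITS - 1) / LONG_BITS];

  static void allow_width(unsigned int w)         /* 1 <= w <= DEMO_MAX_WIDTH */
  {
          blit_x[(w - 1) / LONG_BITS] |= 1UL << ((w - 1) % LONG_BITS);
  }

  static bool width_supported(unsigned int w)
  {
          return blit_x[(w - 1) / LONG_BITS] & (1UL << ((w - 1) % LONG_BITS));
  }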
Signed-off-by: Samuel Thibault Signed-off-by: Helge Deller --- drivers/firmware/efi/earlycon.c | 2 +- drivers/video/fbdev/arkfb.c | 15 +++++++++++---- drivers/video/fbdev/core/fbcon.c | 16 +++++++++------- drivers/video/fbdev/core/fbmem.c | 12 ++++++------ drivers/video/fbdev/core/svgalib.c | 15 +++++++++++---- drivers/video/fbdev/s3fb.c | 15 +++++++++++---- drivers/video/fbdev/vga16fb.c | 6 +++++- drivers/video/fbdev/vt8623fb.c | 15 +++++++++++---- drivers/video/sticore.c | 2 +- include/linux/fb.h | 18 ++++++++++++------ include/linux/font.h | 3 ++- lib/fonts/fonts.c | 15 +++++++++------ 12 files changed, 89 insertions(+), 45 deletions(-) diff --git a/drivers/firmware/efi/earlycon.c b/drivers/firmware/efi/earlycon.c index f80a9af3d16e..d18a1a5de144 100644 --- a/drivers/firmware/efi/earlycon.c +++ b/drivers/firmware/efi/earlycon.c @@ -252,7 +252,7 @@ static int __init efi_earlycon_setup(struct earlycon_device *device, if (si->lfb_depth != 32) return -ENODEV; - font = get_default_font(xres, yres, -1, -1); + font = get_default_font(xres, yres, NULL, NULL); if (!font) return -ENODEV; diff --git a/drivers/video/fbdev/arkfb.c b/drivers/video/fbdev/arkfb.c index dca9c0325b3f..082501feceb9 100644 --- a/drivers/video/fbdev/arkfb.c +++ b/drivers/video/fbdev/arkfb.c @@ -622,8 +622,13 @@ static int arkfb_set_par(struct fb_info *info) info->tileops = NULL; /* in 4bpp supports 8p wide tiles only, any tiles otherwise */ - info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0); - info->pixmap.blit_y = ~(u32)0; + if (bpp == 4) { + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + } else { + bitmap_fill(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + } + bitmap_fill(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); offset_value = (info->var.xres_virtual * bpp) / 64; screen_size = info->var.yres_virtual * info->fix.line_length; @@ -635,8 +640,10 @@ static int arkfb_set_par(struct fb_info *info) info->tileops = &arkfb_tile_ops; /* supports 8x16 tiles only */ - info->pixmap.blit_x = 1 << (8 - 1); - info->pixmap.blit_y = 1 << (16 - 1); + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + bitmap_zero(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); + set_bit(16 - 1, info->pixmap.blit_y); offset_value = info->var.xres_virtual / 16; screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 1183e7a871f8..4b67b32fdbc7 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2485,12 +2485,12 @@ static int fbcon_set_font(struct vc_data *vc, struct console_font *font, h > FBCON_SWAP(info->var.rotate, info->var.yres, info->var.xres)) return -EINVAL; - if (font->width > 32 || font->height > 32) + if (font->width > FB_MAX_BLIT_WIDTH || font->height > FB_MAX_BLIT_HEIGHT) return -EINVAL; /* Make sure drawing engine can handle the font */ - if (!(info->pixmap.blit_x & BIT(font->width - 1)) || - !(info->pixmap.blit_y & BIT(font->height - 1))) + if (!test_bit(font->width - 1, info->pixmap.blit_x) || + !test_bit(font->height - 1, info->pixmap.blit_y)) return -EINVAL; /* Make sure driver can handle the font length */ @@ -3084,8 +3084,8 @@ void fbcon_get_requirement(struct fb_info *info, vc = vc_cons[i].d; if (vc && vc->vc_mode == KD_TEXT && info->node == con2fb_map[i]) { - caps->x |= 1 << (vc->vc_font.width - 1); - caps->y |= 1 << (vc->vc_font.height - 1); + set_bit(vc->vc_font.width - 1, caps->x); + set_bit(vc->vc_font.height 
- 1, caps->y); charcnt = vc->vc_font.charcount; if (caps->len < charcnt) caps->len = charcnt; @@ -3096,8 +3096,10 @@ void fbcon_get_requirement(struct fb_info *info, if (vc && vc->vc_mode == KD_TEXT && info->node == con2fb_map[fg_console]) { - caps->x = 1 << (vc->vc_font.width - 1); - caps->y = 1 << (vc->vc_font.height - 1); + bitmap_zero(caps->x, FB_MAX_BLIT_WIDTH); + set_bit(vc->vc_font.width - 1, caps->x); + bitmap_zero(caps->y, FB_MAX_BLIT_HEIGHT); + set_bit(vc->vc_font.height - 1, caps->y); caps->len = vc->vc_font.charcount; } } diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index fc206755f5f6..5ca18bfe11f6 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -212,8 +212,8 @@ static int fb_check_caps(struct fb_info *info, struct fb_var_screeninfo *var, fbcon_get_requirement(info, &caps); info->fbops->fb_get_caps(info, &fbcaps, var); - if (((fbcaps.x ^ caps.x) & caps.x) || - ((fbcaps.y ^ caps.y) & caps.y) || + if (!bitmap_subset(caps.x, fbcaps.x, FB_MAX_BLIT_WIDTH) || + !bitmap_subset(caps.y, fbcaps.y, FB_MAX_BLIT_HEIGHT) || (fbcaps.len < caps.len)) err = -EINVAL; @@ -420,11 +420,11 @@ static int do_register_framebuffer(struct fb_info *fb_info) } fb_info->pixmap.offset = 0; - if (!fb_info->pixmap.blit_x) - fb_info->pixmap.blit_x = ~(u32)0; + if (bitmap_empty(fb_info->pixmap.blit_x, FB_MAX_BLIT_WIDTH)) + bitmap_fill(fb_info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); - if (!fb_info->pixmap.blit_y) - fb_info->pixmap.blit_y = ~(u32)0; + if (bitmap_empty(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT)) + bitmap_fill(fb_info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); if (!fb_info->modelist.prev || !fb_info->modelist.next) INIT_LIST_HEAD(&fb_info->modelist); diff --git a/drivers/video/fbdev/core/svgalib.c b/drivers/video/fbdev/core/svgalib.c index 2cba158888ea..821b89a0a645 100644 --- a/drivers/video/fbdev/core/svgalib.c +++ b/drivers/video/fbdev/core/svgalib.c @@ -354,12 +354,19 @@ void svga_get_caps(struct fb_info *info, struct fb_blit_caps *caps, { if (var->bits_per_pixel == 0) { /* can only support 256 8x16 bitmap */ - caps->x = 1 << (8 - 1); - caps->y = 1 << (16 - 1); + bitmap_zero(caps->x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, caps->x); + bitmap_zero(caps->y, FB_MAX_BLIT_HEIGHT); + set_bit(16 - 1, caps->y); caps->len = 256; } else { - caps->x = (var->bits_per_pixel == 4) ? 1 << (8 - 1) : ~(u32)0; - caps->y = ~(u32)0; + if (var->bits_per_pixel == 4) { + bitmap_zero(caps->x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, caps->x); + } else { + bitmap_fill(caps->x, FB_MAX_BLIT_WIDTH); + } + bitmap_fill(caps->y, FB_MAX_BLIT_HEIGHT); caps->len = ~(u32)0; } } diff --git a/drivers/video/fbdev/s3fb.c b/drivers/video/fbdev/s3fb.c index 07722a5ea8ef..ff84106ecf1c 100644 --- a/drivers/video/fbdev/s3fb.c +++ b/drivers/video/fbdev/s3fb.c @@ -617,8 +617,13 @@ static int s3fb_set_par(struct fb_info *info) info->tileops = NULL; /* in 4bpp supports 8p wide tiles only, any tiles otherwise */ - info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0); - info->pixmap.blit_y = ~(u32)0; + if (bpp == 4) { + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + } else { + bitmap_fill(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + } + bitmap_fill(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); offset_value = (info->var.xres_virtual * bpp) / 64; screen_size = info->var.yres_virtual * info->fix.line_length; @@ -630,8 +635,10 @@ static int s3fb_set_par(struct fb_info *info) info->tileops = fasttext ? 
&s3fb_fast_tile_ops : &s3fb_tile_ops; /* supports 8x16 tiles only */ - info->pixmap.blit_x = 1 << (8 - 1); - info->pixmap.blit_y = 1 << (16 - 1); + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + bitmap_zero(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); + set_bit(16 - 1, info->pixmap.blit_y); offset_value = info->var.xres_virtual / 16; screen_size = (info->var.xres_virtual * info->var.yres_virtual) / 64; diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c index b485e9198201..a87bafbb119c 100644 --- a/drivers/video/fbdev/vga16fb.c +++ b/drivers/video/fbdev/vga16fb.c @@ -1353,7 +1353,11 @@ static int vga16fb_probe(struct platform_device *dev) info->var = vga16fb_defined; info->fix = vga16fb_fix; /* supports rectangles with widths of multiples of 8 */ - info->pixmap.blit_x = 1 << 7 | 1 << 15 | 1 << 23 | 1 << 31; + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + set_bit(16 - 1, info->pixmap.blit_x); + set_bit(24 - 1, info->pixmap.blit_x); + set_bit(32 - 1, info->pixmap.blit_x); info->flags = FBINFO_HWACCEL_YPAN; i = (info->var.bits_per_pixel == 8) ? 256 : 16; diff --git a/drivers/video/fbdev/vt8623fb.c b/drivers/video/fbdev/vt8623fb.c index f8d022cb61e8..df984f3a7ff6 100644 --- a/drivers/video/fbdev/vt8623fb.c +++ b/drivers/video/fbdev/vt8623fb.c @@ -390,8 +390,13 @@ static int vt8623fb_set_par(struct fb_info *info) info->tileops = NULL; /* in 4bpp supports 8p wide tiles only, any tiles otherwise */ - info->pixmap.blit_x = (bpp == 4) ? (1 << (8 - 1)) : (~(u32)0); - info->pixmap.blit_y = ~(u32)0; + if (bpp == 4) { + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + } else { + bitmap_fill(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + } + bitmap_fill(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); offset_value = (info->var.xres_virtual * bpp) / 64; fetch_value = ((info->var.xres * bpp) / 128) + 4; @@ -408,8 +413,10 @@ static int vt8623fb_set_par(struct fb_info *info) info->tileops = &vt8623fb_tile_ops; /* supports 8x16 tiles only */ - info->pixmap.blit_x = 1 << (8 - 1); - info->pixmap.blit_y = 1 << (16 - 1); + bitmap_zero(info->pixmap.blit_x, FB_MAX_BLIT_WIDTH); + set_bit(8 - 1, info->pixmap.blit_x); + bitmap_zero(info->pixmap.blit_y, FB_MAX_BLIT_HEIGHT); + set_bit(16 - 1, info->pixmap.blit_y); offset_value = info->var.xres_virtual / 16; fetch_value = (info->var.xres / 8) + 8; diff --git a/drivers/video/sticore.c b/drivers/video/sticore.c index 7115b325817f..88a1758616e0 100644 --- a/drivers/video/sticore.c +++ b/drivers/video/sticore.c @@ -529,7 +529,7 @@ sti_select_fbfont(struct sti_cooked_rom *cooked_rom, const char *fbfont_name) if (fbfont_name && strlen(fbfont_name)) fbfont = find_font(fbfont_name); if (!fbfont) - fbfont = get_default_font(1024,768, ~(u32)0, ~(u32)0); + fbfont = get_default_font(1024, 768, NULL, NULL); if (!fbfont) return NULL; diff --git a/include/linux/fb.h b/include/linux/fb.h index 05dc9624897d..7d7c7791fd26 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -143,9 +143,13 @@ struct fb_event { void *data; }; +/* Enough for the VT console needs, see its max_font_width/height */ +#define FB_MAX_BLIT_WIDTH 64 +#define FB_MAX_BLIT_HEIGHT 128 + struct fb_blit_caps { - u32 x; - u32 y; + DECLARE_BITMAP(x, FB_MAX_BLIT_WIDTH); + DECLARE_BITMAP(y, FB_MAX_BLIT_HEIGHT); u32 len; u32 flags; }; @@ -192,10 +196,12 @@ struct fb_pixmap { u32 scan_align; /* alignment per scanline */ u32 access_align; /* alignment per read/write (bits) 
*/ u32 flags; /* see FB_PIXMAP_* */ - u32 blit_x; /* supported bit block dimensions (1-32)*/ - u32 blit_y; /* Format: blit_x = 1 << (width - 1) */ - /* blit_y = 1 << (height - 1) */ - /* if 0, will be set to 0xffffffff (all)*/ + /* supported bit block dimensions */ + /* Format: test_bit(width - 1, blit_x) */ + /* test_bit(height - 1, blit_y) */ + /* if zero, will be set to full (all) */ + DECLARE_BITMAP(blit_x, FB_MAX_BLIT_WIDTH); + DECLARE_BITMAP(blit_y, FB_MAX_BLIT_HEIGHT); /* access methods */ void (*writeio)(struct fb_info *info, void __iomem *dst, void *src, unsigned int size); void (*readio) (struct fb_info *info, void *dst, void __iomem *src, unsigned int size); diff --git a/include/linux/font.h b/include/linux/font.h index abf1442ce719..81caffd51bb4 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -57,7 +57,8 @@ extern const struct font_desc *find_font(const char *name); /* Get the default font for a specific screen size */ extern const struct font_desc *get_default_font(int xres, int yres, - u32 font_w, u32 font_h); + unsigned long *font_w, + unsigned long *font_h); /* Max. length for the name of a predefined font */ #define MAX_FONT_NAME 32 diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 973866438608..47e34950b665 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -96,18 +96,21 @@ EXPORT_SYMBOL(find_font); * get_default_font - get default font * @xres: screen size of X * @yres: screen size of Y - * @font_w: bit array of supported widths (1 - 32) - * @font_h: bit array of supported heights (1 - 32) + * @font_w: bit array of supported widths (1 - FB_MAX_BLIT_WIDTH) + * @font_h: bit array of supported heights (1 - FB_MAX_BLIT_HEIGHT) * * Get the default font for a specified screen size. * Dimensions are in pixels. * + * font_w or font_h being NULL means all values are supported. + * * Returns %NULL if no font is found, or a pointer to the * chosen font. * */ -const struct font_desc *get_default_font(int xres, int yres, u32 font_w, - u32 font_h) +const struct font_desc *get_default_font(int xres, int yres, + unsigned long *font_w, + unsigned long *font_h) { int i, c, cc, res; const struct font_desc *f, *g; @@ -135,8 +138,8 @@ const struct font_desc *get_default_font(int xres, int yres, u32 font_w, if (res > 20) c += 20 - res; - if ((font_w & (1U << (f->width - 1))) && - (font_h & (1U << (f->height - 1)))) + if ((!font_w || test_bit(f->width - 1, font_w)) && + (!font_h || test_bit(f->height - 1, font_h))) c += 1000; if (c > cc) { From f3a640cca951ef9715597e68f5363afc0f452a88 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Mar 2024 16:36:23 -0600 Subject: [PATCH 248/496] io_uring/net: ensure async prep handlers always initialize ->done_io If we get a request with IOSQE_ASYNC set, then we first run the prep async handlers. But if we then fail setting it up and want to post a CQE with -EINVAL, we use ->done_io. This was previously guarded with REQ_F_PARTIAL_IO, and the normal setup handlers do set it up before any potential errors, but we need to cover the async setup too. 
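The rule the fix follows, shown as a hypothetical sketch with made-up names rather than the actual io_uring structures: any field that a later completion or error path may read must be initialized before the first early return of the prep handler.

  /* Hypothetical illustration of "initialize before any early return". */
  #include <stdbool.h>

  struct xfer_state {
          int done_io;                    /* bytes already transferred */
  };

  static int xfer_prep_async(struct xfer_state *st, bool have_addr)
  {
          st->done_io = 0;                /* set before every return path */
          if (!have_addr)
                  return 0;               /* early exit no longer leaks stale state */
          /* ... allocate async data, copy headers, etc. ... */
          return 0;
  }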
Fixes: 9817ad85899f ("io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io") Signed-off-by: Jens Axboe --- io_uring/net.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 19451f0dbf81..1e7665ff6ef7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -326,7 +326,10 @@ int io_send_prep_async(struct io_kiocb *req) struct io_async_msghdr *io; int ret; - if (!zc->addr || req_has_async_data(req)) + if (req_has_async_data(req)) + return 0; + zc->done_io = 0; + if (!zc->addr) return 0; io = io_msg_alloc_async_prep(req); if (!io) @@ -353,8 +356,10 @@ static int io_setup_async_addr(struct io_kiocb *req, int io_sendmsg_prep_async(struct io_kiocb *req) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int ret; + sr->done_io = 0; if (!io_msg_alloc_async_prep(req)) return -ENOMEM; ret = io_sendmsg_copy_hdr(req, req->async_data); @@ -608,9 +613,11 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, int io_recvmsg_prep_async(struct io_kiocb *req) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *iomsg; int ret; + sr->done_io = 0; if (!io_msg_alloc_async_prep(req)) return -ENOMEM; iomsg = req->async_data; From e21e1c45e1fe2e31732f40256b49c04e76a17cee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 09:51:40 -0600 Subject: [PATCH 249/496] io_uring: clear opcode specific data for an early failure If failure happens before the opcode prep handler is called, ensure that we clear the opcode specific area of the request, which holds data specific to that request type. This prevents errors where opcode handlers either don't get to clear per-request private data since prep isn't even called. Reported-and-tested-by: syzbot+f8e9a371388aa62ecab4@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3ae4bb988906..5d4b448fdc50 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2181,6 +2181,13 @@ static void io_init_req_drain(struct io_kiocb *req) } } +static __cold int io_init_fail_req(struct io_kiocb *req, int err) +{ + /* ensure per-opcode data is cleared if we fail before prep */ + memset(&req->cmd.data, 0, sizeof(req->cmd.data)); + return err; +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) @@ -2202,29 +2209,29 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; - return -EINVAL; + return io_init_fail_req(req, -EINVAL); } def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (sqe_flags & IOSQE_BUFFER_SELECT) { if (!def->buffer_select) - return -EOPNOTSUPP; + return io_init_fail_req(req, -EOPNOTSUPP); req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) ctx->drain_disabled = true; if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) - return -EOPNOTSUPP; + return io_init_fail_req(req, -EOPNOTSUPP); io_init_req_drain(req); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + return io_init_fail_req(req, -EACCES); /* 
knock it to the slow queue path, will be drained there */ if (ctx->drain_active) req->flags |= REQ_F_FORCE_ASYNC; @@ -2237,9 +2244,9 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (!def->ioprio && sqe->ioprio) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); if (def->needs_file) { struct io_submit_state *state = &ctx->submit_state; @@ -2263,12 +2270,12 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) - return -EINVAL; + return io_init_fail_req(req, -EINVAL); get_cred(req->creds); ret = security_uring_override_creds(req->creds); if (ret) { put_cred(req->creds); - return ret; + return io_init_fail_req(req, ret); } req->flags |= REQ_F_CREDS; } From 78cb0945f7141961781f815168f6873ad2b7ed29 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 16 Mar 2024 12:18:21 +0100 Subject: [PATCH 250/496] powerpc: Handle error in mark_rodata_ro() and mark_initmem_nx() mark_rodata_ro() and mark_initmem_nx() use functions that can fail like set_memory_nx() and set_memory_ro(), leading to a not protected kernel. In case of failure, panic. Link: https://github.com/KSPP/linux/issues/7 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/b16329611deb89e1af505d43f0e2a91310584d26.1710587887.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/book3s32/mmu.c | 7 +++++-- arch/powerpc/mm/mmu_decl.h | 8 +++---- arch/powerpc/mm/nohash/8xx.c | 33 ++++++++++++++++++----------- arch/powerpc/mm/nohash/e500.c | 10 ++++++--- arch/powerpc/mm/pgtable_32.c | 38 +++++++++++++++++++++++++--------- 5 files changed, 65 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 5445587bfe84..100f999871bc 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -193,7 +193,7 @@ static bool is_module_segment(unsigned long addr) return true; } -void mmu_mark_initmem_nx(void) +int mmu_mark_initmem_nx(void) { int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; int i; @@ -230,9 +230,10 @@ void mmu_mark_initmem_nx(void) mtsr(mfsr(i << 28) | 0x10000000, i << 28); } + return 0; } -void mmu_mark_rodata_ro(void) +int mmu_mark_rodata_ro(void) { int nb = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 
8 : 4; int i; @@ -245,6 +246,8 @@ void mmu_mark_rodata_ro(void) } update_bats(); + + return 0; } /* diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 8e84bc214d13..6949c2c937e7 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -160,11 +160,11 @@ static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; } #endif #if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || defined(CONFIG_PPC_E500) -void mmu_mark_initmem_nx(void); -void mmu_mark_rodata_ro(void); +int mmu_mark_initmem_nx(void); +int mmu_mark_rodata_ro(void); #else -static inline void mmu_mark_initmem_nx(void) { } -static inline void mmu_mark_rodata_ro(void) { } +static inline int mmu_mark_initmem_nx(void) { return 0; } +static inline int mmu_mark_rodata_ro(void) { return 0; } #endif #ifdef CONFIG_PPC_8xx diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 6be6421086ed..43d4842bb1c7 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -119,23 +119,26 @@ void __init mmu_mapin_immr(void) PAGE_KERNEL_NCG, MMU_PAGE_512K, true); } -static void mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, - pgprot_t prot, bool new) +static int mmu_mapin_ram_chunk(unsigned long offset, unsigned long top, + pgprot_t prot, bool new) { unsigned long v = PAGE_OFFSET + offset; unsigned long p = offset; + int err = 0; WARN_ON(!IS_ALIGNED(offset, SZ_512K) || !IS_ALIGNED(top, SZ_512K)); - for (; p < ALIGN(p, SZ_8M) && p < top; p += SZ_512K, v += SZ_512K) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); - for (; p < ALIGN_DOWN(top, SZ_8M) && p < top; p += SZ_8M, v += SZ_8M) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new); - for (; p < ALIGN_DOWN(top, SZ_512K) && p < top; p += SZ_512K, v += SZ_512K) - __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN(p, SZ_8M) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); + for (; p < ALIGN_DOWN(top, SZ_8M) && p < top && !err; p += SZ_8M, v += SZ_8M) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_8M, new); + for (; p < ALIGN_DOWN(top, SZ_512K) && p < top && !err; p += SZ_512K, v += SZ_512K) + err = __early_map_kernel_hugepage(v, p, prot, MMU_PAGE_512K, new); if (!new) flush_tlb_kernel_range(PAGE_OFFSET + v, PAGE_OFFSET + top); + + return err; } unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) @@ -166,27 +169,33 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return top; } -void mmu_mark_initmem_nx(void) +int mmu_mark_initmem_nx(void) { unsigned long etext8 = ALIGN(__pa(_etext), SZ_8M); unsigned long sinittext = __pa(_sinittext); unsigned long boundary = strict_kernel_rwx_enabled() ? 
sinittext : etext8; unsigned long einittext8 = ALIGN(__pa(_einittext), SZ_8M); + int err = 0; if (!debug_pagealloc_enabled_or_kfence()) - mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); + err = mmu_mapin_ram_chunk(boundary, einittext8, PAGE_KERNEL, false); mmu_pin_tlb(block_mapped_ram, false); + + return err; } #ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) +int mmu_mark_rodata_ro(void) { unsigned long sinittext = __pa(_sinittext); + int err; - mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false); + err = mmu_mapin_ram_chunk(0, sinittext, PAGE_KERNEL_ROX, false); if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) mmu_pin_tlb(block_mapped_ram, true); + + return err; } #endif diff --git a/arch/powerpc/mm/nohash/e500.c b/arch/powerpc/mm/nohash/e500.c index 921c3521ec11..266fb22131fc 100644 --- a/arch/powerpc/mm/nohash/e500.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -285,19 +285,23 @@ void __init adjust_total_lowmem(void) } #ifdef CONFIG_STRICT_KERNEL_RWX -void mmu_mark_rodata_ro(void) +int mmu_mark_rodata_ro(void) { unsigned long remapped; remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, false, false); - WARN_ON(__max_low_memory != remapped); + if (WARN_ON(__max_low_memory != remapped)) + return -EINVAL; + + return 0; } #endif -void mmu_mark_initmem_nx(void) +int mmu_mark_initmem_nx(void) { /* Everything is done in mmu_mark_rodata_ro() */ + return 0; } void setup_initial_memory_limit(phys_addr_t first_memblock_base, diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index face94977cb2..cfd622ebf774 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -130,31 +130,41 @@ void __init mapin_ram(void) } } -void mark_initmem_nx(void) +static int __mark_initmem_nx(void) { unsigned long numpages = PFN_UP((unsigned long)_einittext) - PFN_DOWN((unsigned long)_sinittext); + int err; - mmu_mark_initmem_nx(); + err = mmu_mark_initmem_nx(); if (!v_block_mapped((unsigned long)_sinittext)) { - set_memory_nx((unsigned long)_sinittext, numpages); - set_memory_rw((unsigned long)_sinittext, numpages); + err = set_memory_nx((unsigned long)_sinittext, numpages); + if (err) + return err; + err = set_memory_rw((unsigned long)_sinittext, numpages); } + return err; +} + +void mark_initmem_nx(void) +{ + int err = __mark_initmem_nx(); + + if (err) + panic("%s() failed, err = %d\n", __func__, err); } #ifdef CONFIG_STRICT_KERNEL_RWX -void mark_rodata_ro(void) +static int __mark_rodata_ro(void) { unsigned long numpages; if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX) && mmu_has_feature(MMU_FTR_HPTE_TABLE)) pr_warn("This platform has HASH MMU, STRICT_MODULE_RWX won't work\n"); - if (v_block_mapped((unsigned long)_stext + 1)) { - mmu_mark_rodata_ro(); - return; - } + if (v_block_mapped((unsigned long)_stext + 1)) + return mmu_mark_rodata_ro(); /* * mark text and rodata as read only. 
__end_rodata is set by @@ -164,6 +174,14 @@ void mark_rodata_ro(void) numpages = PFN_UP((unsigned long)__end_rodata) - PFN_DOWN((unsigned long)_stext); - set_memory_ro((unsigned long)_stext, numpages); + return set_memory_ro((unsigned long)_stext, numpages); +} + +void mark_rodata_ro(void) +{ + int err = __mark_rodata_ro(); + + if (err) + panic("%s() failed, err = %d\n", __func__, err); } #endif From 56a34d799bfa53064e7b8bd354aacd176aeaecc8 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 26 Feb 2024 16:00:08 +0530 Subject: [PATCH 251/496] kexec/kdump: make struct crash_mem available without CONFIG_CRASH_DUMP struct crash_mem defined under include/linux/crash_core.h represents a list of memory ranges. While it is used to represent memory ranges for kdump kernel, it can also be used for other kind of memory ranges. In fact, KEXEC_FILE_LOAD syscall in powerpc uses this structure to represent reserved memory ranges and exclude memory ranges needed to find the right memory regions to load kexec kernel. So, make the definition of crash_mem structure available for !CONFIG_CRASH_DUMP case too. Signed-off-by: Hari Bathini Acked-by: Baoquan He Signed-off-by: Michael Ellerman Link: https://msgid.link/20240226103010.589537-2-hbathini@linux.ibm.com --- include/linux/crash_core.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 23270b16e1db..d33352c2e386 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -8,6 +8,12 @@ struct kimage; +struct crash_mem { + unsigned int max_nr_ranges; + unsigned int nr_ranges; + struct range ranges[] __counted_by(max_nr_ranges); +}; + #ifdef CONFIG_CRASH_DUMP int crash_shrink_memory(unsigned long new_size); @@ -51,12 +57,6 @@ static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } /* Alignment required for elf header segment */ #define ELF_CORE_HEADER_ALIGN 4096 -struct crash_mem { - unsigned int max_nr_ranges; - unsigned int nr_ranges; - struct range ranges[] __counted_by(max_nr_ranges); -}; - extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend); From 33f2cc0a2e90f7177c49559b434191b02efd0cd5 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 26 Feb 2024 16:00:09 +0530 Subject: [PATCH 252/496] powerpc/kexec: split CONFIG_KEXEC_FILE and CONFIG_CRASH_DUMP CONFIG_KEXEC_FILE does not have to select CONFIG_CRASH_DUMP. Move some code under CONFIG_CRASH_DUMP to support CONFIG_KEXEC_FILE and !CONFIG_CRASH_DUMP case. 
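As an aside, the idiom used throughout this split is IS_ENABLED(CONFIG_CRASH_DUMP) combined with a runtime check rather than a bare #ifdef, so the code stays type-checked in both configurations and the compiler folds it away when the option is off. A hypothetical helper (not from this patch, and assuming the usual kexec headers for struct kexec_buf and KEXEC_TYPE_CRASH) capturing the check that appears in the hunks below:

  /* Hypothetical wrapper around the check used in the diff below. */
  static bool kbuf_is_for_kdump(const struct kexec_buf *kbuf)
  {
          /*
           * Still compiled and type-checked when CONFIG_CRASH_DUMP=n;
           * the expression then folds to false at build time.
           */
          return IS_ENABLED(CONFIG_CRASH_DUMP) &&
                 kbuf->image->type == KEXEC_TYPE_CRASH;
  }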
Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240226103010.589537-3-hbathini@linux.ibm.com --- arch/powerpc/kexec/elf_64.c | 4 +- arch/powerpc/kexec/file_load_64.c | 269 ++++++++++++++++-------------- 2 files changed, 142 insertions(+), 131 deletions(-) diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c index 904016cf89ea..6d8951e8e966 100644 --- a/arch/powerpc/kexec/elf_64.c +++ b/arch/powerpc/kexec/elf_64.c @@ -47,7 +47,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, if (ret) return ERR_PTR(ret); - if (image->type == KEXEC_TYPE_CRASH) { + if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) { /* min & max buffer values for kdump case */ kbuf.buf_min = pbuf.buf_min = crashk_res.start; kbuf.buf_max = pbuf.buf_max = @@ -70,7 +70,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem); /* Load additional segments needed for panic kernel */ - if (image->type == KEXEC_TYPE_CRASH) { + if (IS_ENABLED(CONFIG_CRASH_DUMP) && image->type == KEXEC_TYPE_CRASH) { ret = load_crashdump_segments_ppc64(image, &kbuf); if (ret) { pr_err("Failed to load kdump kernel segments\n"); diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 5b4c5cb23354..1bc65de6174f 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -96,119 +96,6 @@ out: return ret; } -/** - * get_usable_memory_ranges - Get usable memory ranges. This list includes - * regions like crashkernel, opal/rtas & tce-table, - * that kdump kernel could use. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. - */ -static int get_usable_memory_ranges(struct crash_mem **mem_ranges) -{ - int ret; - - /* - * Early boot failure observed on guests when low memory (first memory - * block?) is not added to usable memory. So, add [0, crashk_res.end] - * instead of [crashk_res.start, crashk_res.end] to workaround it. - * Also, crashed kernel's memory must be added to reserve map to - * avoid kdump kernel from using it. - */ - ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1); - if (ret) - goto out; - - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_opal_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_tce_mem_ranges(mem_ranges); -out: - if (ret) - pr_err("Failed to setup usable memory ranges\n"); - return ret; -} - -/** - * get_crash_memory_ranges - Get crash memory ranges. This list includes - * first/crashing kernel's memory regions that - * would be exported via an elfcore. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. 
- */ -static int get_crash_memory_ranges(struct crash_mem **mem_ranges) -{ - phys_addr_t base, end; - struct crash_mem *tmem; - u64 i; - int ret; - - for_each_mem_range(i, &base, &end) { - u64 size = end - base; - - /* Skip backup memory region, which needs a separate entry */ - if (base == BACKUP_SRC_START) { - if (size > BACKUP_SRC_SIZE) { - base = BACKUP_SRC_END + 1; - size -= BACKUP_SRC_SIZE; - } else - continue; - } - - ret = add_mem_range(mem_ranges, base, size); - if (ret) - goto out; - - /* Try merging adjacent ranges before reallocation attempt */ - if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges) - sort_memory_ranges(*mem_ranges, true); - } - - /* Reallocate memory ranges if there is no space to split ranges */ - tmem = *mem_ranges; - if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { - tmem = realloc_mem_ranges(mem_ranges); - if (!tmem) - goto out; - } - - /* Exclude crashkernel region */ - ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); - if (ret) - goto out; - - /* - * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL - * regions are exported to save their context at the time of - * crash, they should actually be backed up just like the - * first 64K bytes of memory. - */ - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_opal_mem_range(mem_ranges); - if (ret) - goto out; - - /* create a separate program header for the backup region */ - ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE); - if (ret) - goto out; - - sort_memory_ranges(*mem_ranges, false); -out: - if (ret) - pr_err("Failed to setup crash memory ranges\n"); - return ret; -} - /** * get_reserved_memory_ranges - Get reserve memory ranges. This list includes * memory regions that should be added to the @@ -434,6 +321,120 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf, return ret; } +#ifdef CONFIG_CRASH_DUMP +/** + * get_usable_memory_ranges - Get usable memory ranges. This list includes + * regions like crashkernel, opal/rtas & tce-table, + * that kdump kernel could use. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. + */ +static int get_usable_memory_ranges(struct crash_mem **mem_ranges) +{ + int ret; + + /* + * Early boot failure observed on guests when low memory (first memory + * block?) is not added to usable memory. So, add [0, crashk_res.end] + * instead of [crashk_res.start, crashk_res.end] to workaround it. + * Also, crashed kernel's memory must be added to reserve map to + * avoid kdump kernel from using it. + */ + ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1); + if (ret) + goto out; + + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_tce_mem_ranges(mem_ranges); +out: + if (ret) + pr_err("Failed to setup usable memory ranges\n"); + return ret; +} + +/** + * get_crash_memory_ranges - Get crash memory ranges. This list includes + * first/crashing kernel's memory regions that + * would be exported via an elfcore. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. 
+ */ +static int get_crash_memory_ranges(struct crash_mem **mem_ranges) +{ + phys_addr_t base, end; + struct crash_mem *tmem; + u64 i; + int ret; + + for_each_mem_range(i, &base, &end) { + u64 size = end - base; + + /* Skip backup memory region, which needs a separate entry */ + if (base == BACKUP_SRC_START) { + if (size > BACKUP_SRC_SIZE) { + base = BACKUP_SRC_END + 1; + size -= BACKUP_SRC_SIZE; + } else + continue; + } + + ret = add_mem_range(mem_ranges, base, size); + if (ret) + goto out; + + /* Try merging adjacent ranges before reallocation attempt */ + if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges) + sort_memory_ranges(*mem_ranges, true); + } + + /* Reallocate memory ranges if there is no space to split ranges */ + tmem = *mem_ranges; + if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { + tmem = realloc_mem_ranges(mem_ranges); + if (!tmem) + goto out; + } + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + /* + * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL + * regions are exported to save their context at the time of + * crash, they should actually be backed up just like the + * first 64K bytes of memory. + */ + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + /* create a separate program header for the backup region */ + ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE); + if (ret) + goto out; + + sort_memory_ranges(*mem_ranges, false); +out: + if (ret) + pr_err("Failed to setup crash memory ranges\n"); + return ret; +} + /** * check_realloc_usable_mem - Reallocate buffer if it can't accommodate entries * @um_info: Usable memory buffer and ranges info. @@ -863,6 +864,7 @@ int load_crashdump_segments_ppc64(struct kimage *image, return 0; } +#endif /** * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global @@ -972,26 +974,14 @@ static unsigned int cpu_node_size(void) return size; } -/** - * kexec_extra_fdt_size_ppc64 - Return the estimated additional size needed to - * setup FDT for kexec/kdump kernel. - * @image: kexec image being loaded. - * - * Returns the estimated extra size needed for kexec/kdump kernel FDT. - */ -unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) +static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) { unsigned int cpu_nodes, extra_size = 0; struct device_node *dn; u64 usm_entries; - // Budget some space for the password blob. There's already extra space - // for the key name - if (plpks_is_available()) - extra_size += (unsigned int)plpks_get_passwordlen(); - - if (image->type != KEXEC_TYPE_CRASH) - return extra_size; + if (!IS_ENABLED(CONFIG_CRASH_DUMP) || image->type != KEXEC_TYPE_CRASH) + return 0; /* * For kdump kernel, account for linux,usable-memory and @@ -1019,6 +1009,25 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) return extra_size; } +/** + * kexec_extra_fdt_size_ppc64 - Return the estimated additional size needed to + * setup FDT for kexec/kdump kernel. + * @image: kexec image being loaded. + * + * Returns the estimated extra size needed for kexec/kdump kernel FDT. + */ +unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) +{ + unsigned int extra_size = 0; + + // Budget some space for the password blob. 
There's already extra space + // for the key name + if (plpks_is_available()) + extra_size += (unsigned int)plpks_get_passwordlen(); + + return extra_size + kdump_extra_fdt_size_ppc64(image); +} + /** * add_node_props - Reads node properties from device node structure and add * them to fdt. @@ -1171,6 +1180,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, struct crash_mem *umem = NULL, *rmem = NULL; int i, nr_ranges, ret; +#ifdef CONFIG_CRASH_DUMP /* * Restrict memory usage for kdump kernel by setting up * usable memory ranges and memory reserve map. @@ -1207,6 +1217,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, goto out; } } +#endif /* Update cpus nodes information to account hotplug CPUs. */ ret = update_cpus_node(fdt); @@ -1278,7 +1289,7 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) buf_min = kbuf->buf_min; buf_max = kbuf->buf_max; /* Segments for kdump kernel should be within crashkernel region */ - if (kbuf->image->type == KEXEC_TYPE_CRASH) { + if (IS_ENABLED(CONFIG_CRASH_DUMP) && kbuf->image->type == KEXEC_TYPE_CRASH) { buf_min = (buf_min < crashk_res.start ? crashk_res.start : buf_min); buf_max = (buf_max > crashk_res.end ? From 5c4233cc0920cc90787aafe950b90f6c57a35b88 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 26 Feb 2024 16:00:10 +0530 Subject: [PATCH 253/496] powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency Remove CONFIG_CRASH_DUMP dependency on CONFIG_KEXEC. CONFIG_KEXEC_CORE was used at places where CONFIG_CRASH_DUMP or CONFIG_CRASH_RESERVE was appropriate. Replace with appropriate #ifdefs to support CONFIG_KEXEC and !CONFIG_CRASH_DUMP configuration option. Also, make CONFIG_FA_DUMP dependent on CONFIG_CRASH_DUMP to avoid unmet dependencies for FA_DUMP with !CONFIG_KEXEC_CORE configuration option. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240226103010.589537-4-hbathini@linux.ibm.com --- arch/powerpc/Kconfig | 9 +-- arch/powerpc/include/asm/kexec.h | 98 ++++++++++++++-------------- arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/kernel/setup-common.c | 2 +- arch/powerpc/kernel/smp.c | 4 +- arch/powerpc/kexec/Makefile | 3 +- arch/powerpc/kexec/core.c | 4 ++ arch/powerpc/platforms/powernv/smp.c | 2 +- 8 files changed, 61 insertions(+), 63 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a68b9e637eda..1c4be3373686 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -607,11 +607,6 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config ARCH_SUPPORTS_KEXEC def_bool PPC_BOOK3S || PPC_E500 || (44x && !SMP) -config ARCH_SELECTS_KEXEC - def_bool y - depends on KEXEC - select CRASH_DUMP - config ARCH_SUPPORTS_KEXEC_FILE def_bool PPC64 @@ -622,7 +617,6 @@ config ARCH_SELECTS_KEXEC_FILE def_bool y depends on KEXEC_FILE select KEXEC_ELF - select CRASH_DUMP select HAVE_IMA_KEXEC if IMA config PPC64_BIG_ENDIAN_ELF_ABI_V2 @@ -694,8 +688,7 @@ config ARCH_SELECTS_CRASH_DUMP config FA_DUMP bool "Firmware-assisted dump" - depends on PPC64 && (PPC_RTAS || PPC_POWERNV) - select CRASH_DUMP + depends on CRASH_DUMP && PPC64 && (PPC_RTAS || PPC_POWERNV) help A robust mechanism to get reliable kernel crash dump with assistance from firmware. 
This approach does not use kexec, diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index e1b43aa12175..fdb90e24dc74 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -55,59 +55,18 @@ typedef void (*crash_shutdown_t)(void); #ifdef CONFIG_KEXEC_CORE - -/* - * This function is responsible for capturing register states if coming - * via panic or invoking dump using sysrq-trigger. - */ -static inline void crash_setup_regs(struct pt_regs *newregs, - struct pt_regs *oldregs) -{ - if (oldregs) - memcpy(newregs, oldregs, sizeof(*newregs)); - else - ppc_save_regs(newregs); -} +struct kimage; +struct pt_regs; extern void kexec_smp_wait(void); /* get and clear naca physid, wait for master to copy new code to 0 */ -extern int crashing_cpu; -extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); -extern void crash_ipi_callback(struct pt_regs *); -extern int crash_wake_offline; - -struct kimage; -struct pt_regs; extern void default_machine_kexec(struct kimage *image); -extern void default_machine_crash_shutdown(struct pt_regs *regs); -extern int crash_shutdown_register(crash_shutdown_t handler); -extern int crash_shutdown_unregister(crash_shutdown_t handler); - -extern void crash_kexec_prepare(void); -extern void crash_kexec_secondary(struct pt_regs *regs); -int __init overlaps_crashkernel(unsigned long start, unsigned long size); -extern void reserve_crashkernel(void); extern void machine_kexec_mask_interrupts(void); -static inline bool kdump_in_progress(void) -{ - return crashing_cpu >= 0; -} - void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer, unsigned long start_address) __noreturn; - void kexec_copy_flush(struct kimage *image); -#if defined(CONFIG_CRASH_DUMP) -bool is_kdump_kernel(void); -#define is_kdump_kernel is_kdump_kernel -#if defined(CONFIG_PPC_RTAS) -void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); -#define crash_free_reserved_phys_range crash_free_reserved_phys_range -#endif /* CONFIG_PPC_RTAS */ -#endif /* CONFIG_CRASH_DUMP */ - #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops kexec_elf64_ops; @@ -152,15 +111,56 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, #endif /* CONFIG_KEXEC_FILE */ -#else /* !CONFIG_KEXEC_CORE */ -static inline void crash_kexec_secondary(struct pt_regs *regs) { } +#endif /* CONFIG_KEXEC_CORE */ -static inline int overlaps_crashkernel(unsigned long start, unsigned long size) +#ifdef CONFIG_CRASH_RESERVE +int __init overlaps_crashkernel(unsigned long start, unsigned long size); +extern void reserve_crashkernel(void); +#else +static inline void reserve_crashkernel(void) {} +static inline int overlaps_crashkernel(unsigned long start, unsigned long size) { return 0; } +#endif + +#if defined(CONFIG_CRASH_DUMP) +/* + * This function is responsible for capturing register states if coming + * via panic or invoking dump using sysrq-trigger. 
+ */ +static inline void crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) { - return 0; + if (oldregs) + memcpy(newregs, oldregs, sizeof(*newregs)); + else + ppc_save_regs(newregs); } -static inline void reserve_crashkernel(void) { ; } +extern int crashing_cpu; +extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); +extern void crash_ipi_callback(struct pt_regs *regs); +extern int crash_wake_offline; + +extern int crash_shutdown_register(crash_shutdown_t handler); +extern int crash_shutdown_unregister(crash_shutdown_t handler); +extern void default_machine_crash_shutdown(struct pt_regs *regs); + +extern void crash_kexec_prepare(void); +extern void crash_kexec_secondary(struct pt_regs *regs); + +static inline bool kdump_in_progress(void) +{ + return crashing_cpu >= 0; +} + +bool is_kdump_kernel(void); +#define is_kdump_kernel is_kdump_kernel +#if defined(CONFIG_PPC_RTAS) +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); +#define crash_free_reserved_phys_range crash_free_reserved_phys_range +#endif /* CONFIG_PPC_RTAS */ + +#else /* !CONFIG_CRASH_DUMP */ +static inline void crash_kexec_secondary(struct pt_regs *regs) { } static inline int crash_shutdown_register(crash_shutdown_t handler) { @@ -183,7 +183,7 @@ static inline void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) { } -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_DUMP */ #ifdef CONFIG_PPC_BOOK3S_64 #include diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1dc32a058156..cd8d8883de90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -475,7 +475,7 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node, tce_alloc_end = *lprop; #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_RESERVE lprop = of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL); if (lprop) crashk_res.start = *lprop; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 2add292da494..01ed1263e1a9 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -110,7 +110,7 @@ int ppc_do_canonicalize_irqs; EXPORT_SYMBOL(ppc_do_canonicalize_irqs); #endif -#ifdef CONFIG_VMCORE_INFO +#ifdef CONFIG_CRASH_DUMP /* This keeps a track of which one is the crashing cpu. */ int crashing_cpu = -1; #endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a60e4139214b..12e53b3d7923 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -588,7 +588,7 @@ void smp_send_debugger_break(void) } #endif -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) { int cpu; @@ -631,7 +631,7 @@ void crash_smp_send_stop(void) stopped = true; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP if (kexec_crash_image) { crash_kexec_prepare(); return; diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 91e96f5168b7..8e469c4da3f8 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -3,12 +3,13 @@ # Makefile for the linux kernel. 
# -obj-y += core.o crash.o core_$(BITS).o +obj-y += core.o core_$(BITS).o obj-$(CONFIG_PPC32) += relocate_32.o obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o +obj-$(CONFIG_CRASH_DUMP) += crash.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_core_$(BITS).o := n diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index 3ff4411ed496..b8333a49ea5d 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -44,10 +44,12 @@ void machine_kexec_mask_interrupts(void) { } } +#ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { default_machine_crash_shutdown(regs); } +#endif void machine_kexec_cleanup(struct kimage *image) { @@ -77,6 +79,7 @@ void machine_kexec(struct kimage *image) for(;;); } +#ifdef CONFIG_CRASH_RESERVE void __init reserve_crashkernel(void) { unsigned long long crash_size, crash_base, total_mem_sz; @@ -251,3 +254,4 @@ static int __init kexec_setup(void) return 0; } late_initcall(kexec_setup); +#endif /* CONFIG_CRASH_RESERVE */ diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 9e1a25398f98..8f14f0581a21 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -434,7 +434,7 @@ void __init pnv_smp_init(void) smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP crash_wake_offline = 1; #endif #endif From 9eec61df55c51415409c7cc47e9a1c8de94a0522 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 5 Mar 2024 18:39:18 +0000 Subject: [PATCH 254/496] irqchip/renesas-rzg2l: Flush posted write in irq_eoi() The irq_eoi() callback of the RZ/G2L interrupt chip clears the relevant interrupt cause bit in the TSCR register by writing to it. This write is not sufficient because the write is posted and therefore not guaranteed to immediately clear the bit. Due to that delay the CPU can raise the just handled interrupt again. Prevent this by reading the register back which causes the posted write to be flushed to the hardware before the read completes. Fixes: 3fed09559cd8 ("irqchip: Add RZ/G2L IA55 Interrupt Controller driver") Signed-off-by: Biju Das Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-renesas-rzg2l.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 9494fc26259c..5285bc817dd0 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -99,8 +99,14 @@ static void rzg2l_irq_eoi(struct irq_data *d) * ISCR can only be cleared if the type is falling-edge, rising-edge or * falling/rising-edge. */ - if ((iscr & bit) && (iitsr & IITSR_IITSEL_MASK(hw_irq))) + if ((iscr & bit) && (iitsr & IITSR_IITSEL_MASK(hw_irq))) { writel_relaxed(iscr & ~bit, priv->base + ISCR); + /* + * Enforce that the posted write is flushed to prevent that the + * just handled interrupt is raised again. + */ + readl_relaxed(priv->base + ISCR); + } } static void rzg2l_tint_eoi(struct irq_data *d) @@ -111,8 +117,14 @@ static void rzg2l_tint_eoi(struct irq_data *d) u32 reg; reg = readl_relaxed(priv->base + TSCR); - if (reg & bit) + if (reg & bit) { writel_relaxed(reg & ~bit, priv->base + TSCR); + /* + * Enforce that the posted write is flushed to prevent that the + * just handled interrupt is raised again. 
+ */ + readl_relaxed(priv->base + TSCR); + } } static void rzg2l_irqc_eoi(struct irq_data *d) From 7cb6362c63df233172eaecddaf9ce2ce2f769112 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 5 Mar 2024 18:39:19 +0000 Subject: [PATCH 255/496] irqchip/renesas-rzg2l: Rename rzg2l_tint_eoi() Rename rzg2l_tint_eoi()->rzg2l_clear_tint_int() and simplify the code by removing redundant priv and hw_irq local variables. Signed-off-by: Biju Das Signed-off-by: Thomas Gleixner Reviewed-by: Geert Uytterhoeven --- drivers/irqchip/irq-renesas-rzg2l.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 5285bc817dd0..599e0aba5cc0 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -109,11 +109,9 @@ static void rzg2l_irq_eoi(struct irq_data *d) } } -static void rzg2l_tint_eoi(struct irq_data *d) +static void rzg2l_clear_tint_int(struct rzg2l_irqc_priv *priv, unsigned int hwirq) { - unsigned int hw_irq = irqd_to_hwirq(d) - IRQC_TINT_START; - struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); - u32 bit = BIT(hw_irq); + u32 bit = BIT(hwirq - IRQC_TINT_START); u32 reg; reg = readl_relaxed(priv->base + TSCR); @@ -136,7 +134,7 @@ static void rzg2l_irqc_eoi(struct irq_data *d) if (hw_irq >= IRQC_IRQ_START && hw_irq <= IRQC_IRQ_COUNT) rzg2l_irq_eoi(d); else if (hw_irq >= IRQC_TINT_START && hw_irq < IRQC_NUM_IRQ) - rzg2l_tint_eoi(d); + rzg2l_clear_tint_int(priv, hw_irq); raw_spin_unlock(&priv->lock); irq_chip_eoi_parent(d); } From b4b5cd61a6fdd92ede0dc39f0850a182affd1323 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 5 Mar 2024 18:39:20 +0000 Subject: [PATCH 256/496] irqchip/renesas-rzg2l: Rename rzg2l_irq_eoi() Rename rzg2l_irq_eoi()->rzg2l_clear_irq_int() and simplify the code by removing redundant priv local variable. Suggested-by: Geert Uytterhoeven Signed-off-by: Biju Das Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-renesas-rzg2l.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 599e0aba5cc0..8133f05590b6 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -85,10 +85,9 @@ static struct rzg2l_irqc_priv *irq_data_to_priv(struct irq_data *data) return data->domain->host_data; } -static void rzg2l_irq_eoi(struct irq_data *d) +static void rzg2l_clear_irq_int(struct rzg2l_irqc_priv *priv, unsigned int hwirq) { - unsigned int hw_irq = irqd_to_hwirq(d) - IRQC_IRQ_START; - struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); + unsigned int hw_irq = hwirq - IRQC_IRQ_START; u32 bit = BIT(hw_irq); u32 iitsr, iscr; @@ -132,7 +131,7 @@ static void rzg2l_irqc_eoi(struct irq_data *d) raw_spin_lock(&priv->lock); if (hw_irq >= IRQC_IRQ_START && hw_irq <= IRQC_IRQ_COUNT) - rzg2l_irq_eoi(d); + rzg2l_clear_irq_int(priv, hw_irq); else if (hw_irq >= IRQC_TINT_START && hw_irq < IRQC_NUM_IRQ) rzg2l_clear_tint_int(priv, hw_irq); raw_spin_unlock(&priv->lock); From 853a6030303f8a8fa54929b68e5665d9b21aa405 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 5 Mar 2024 18:39:21 +0000 Subject: [PATCH 257/496] irqchip/renesas-rzg2l: Prevent spurious interrupts when setting trigger type RZ/G2L interrupt chips require that the interrupt is masked before changing the NMI, IRQ, TINT interrupt settings. Aside of that, after setting an edge trigger type it is required to clear the interrupt status register in order to avoid spurious interrupts. 
The current implementation fails to do either of that and therefore is prone to generate spurious interrupts when setting the trigger type. Address this by: - Ensuring that the interrupt is masked at the chip level across the update for the TINT chip - Clearing the interrupt status register after updating the trigger mode for edge type interrupts [ tglx: Massaged changelog and reverted the spin_lock_irqsave() change as the set_type() callback is always called with interrupts disabled. ] Fixes: 3fed09559cd8 ("irqchip: Add RZ/G2L IA55 Interrupt Controller driver") Signed-off-by: Biju Das Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-renesas-rzg2l.c | 36 +++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 8133f05590b6..8803facbb3a2 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -181,8 +181,10 @@ static void rzg2l_irqc_irq_enable(struct irq_data *d) static int rzg2l_irq_set_type(struct irq_data *d, unsigned int type) { - unsigned int hw_irq = irqd_to_hwirq(d) - IRQC_IRQ_START; struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); + unsigned int hwirq = irqd_to_hwirq(d); + u32 iitseln = hwirq - IRQC_IRQ_START; + bool clear_irq_int = false; u16 sense, tmp; switch (type & IRQ_TYPE_SENSE_MASK) { @@ -192,14 +194,17 @@ static int rzg2l_irq_set_type(struct irq_data *d, unsigned int type) case IRQ_TYPE_EDGE_FALLING: sense = IITSR_IITSEL_EDGE_FALLING; + clear_irq_int = true; break; case IRQ_TYPE_EDGE_RISING: sense = IITSR_IITSEL_EDGE_RISING; + clear_irq_int = true; break; case IRQ_TYPE_EDGE_BOTH: sense = IITSR_IITSEL_EDGE_BOTH; + clear_irq_int = true; break; default: @@ -208,21 +213,40 @@ static int rzg2l_irq_set_type(struct irq_data *d, unsigned int type) raw_spin_lock(&priv->lock); tmp = readl_relaxed(priv->base + IITSR); - tmp &= ~IITSR_IITSEL_MASK(hw_irq); - tmp |= IITSR_IITSEL(hw_irq, sense); + tmp &= ~IITSR_IITSEL_MASK(iitseln); + tmp |= IITSR_IITSEL(iitseln, sense); + if (clear_irq_int) + rzg2l_clear_irq_int(priv, hwirq); writel_relaxed(tmp, priv->base + IITSR); raw_spin_unlock(&priv->lock); return 0; } +static u32 rzg2l_disable_tint_and_set_tint_source(struct irq_data *d, struct rzg2l_irqc_priv *priv, + u32 reg, u32 tssr_offset, u8 tssr_index) +{ + u32 tint = (u32)(uintptr_t)irq_data_get_irq_chip_data(d); + u32 tien = reg & (TIEN << TSSEL_SHIFT(tssr_offset)); + + /* Clear the relevant byte in reg */ + reg &= ~(TSSEL_MASK << TSSEL_SHIFT(tssr_offset)); + /* Set TINT and leave TIEN clear */ + reg |= tint << TSSEL_SHIFT(tssr_offset); + writel_relaxed(reg, priv->base + TSSR(tssr_index)); + + return reg | tien; +} + static int rzg2l_tint_set_edge(struct irq_data *d, unsigned int type) { struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); unsigned int hwirq = irqd_to_hwirq(d); u32 titseln = hwirq - IRQC_TINT_START; + u32 tssr_offset = TSSR_OFFSET(titseln); + u8 tssr_index = TSSR_INDEX(titseln); u8 index, sense; - u32 reg; + u32 reg, tssr; switch (type & IRQ_TYPE_SENSE_MASK) { case IRQ_TYPE_EDGE_RISING: @@ -244,10 +268,14 @@ static int rzg2l_tint_set_edge(struct irq_data *d, unsigned int type) } raw_spin_lock(&priv->lock); + tssr = readl_relaxed(priv->base + TSSR(tssr_index)); + tssr = rzg2l_disable_tint_and_set_tint_source(d, priv, tssr, tssr_offset, tssr_index); reg = readl_relaxed(priv->base + TITSR(index)); reg &= ~(IRQ_MASK << (titseln * TITSEL_WIDTH)); reg |= sense << (titseln * TITSEL_WIDTH); writel_relaxed(reg, priv->base + 
TITSR(index)); + rzg2l_clear_tint_int(priv, hwirq); + writel_relaxed(tssr, priv->base + TSSR(tssr_index)); raw_spin_unlock(&priv->lock); return 0; From 9a8b202f8cb7ebebc71f1f2a353a21c76d3063a8 Mon Sep 17 00:00:00 2001 From: Shalini Manjunatha Date: Wed, 6 Mar 2024 16:23:20 +0530 Subject: [PATCH 258/496] ASoC: soc-compress: Fix and add DPCM locking We find mising DPCM locking inside soc_compr_set_params_fe before calling dpcm_be_dai_hw_params() and dpcm_be_dai_prepare() which cause lockdep assert for DPCM lock not held in __soc_pcm_hw_params() and __soc_pcm_prepare() Signed-off-by: Shalini Manjunatha Link: https://msgid.link/r/d985beeafdd32316eb45f20811eb7926da7a796e.1709720380.git.quic_c_shalma@quicinc.com Signed-off-by: Mark Brown --- sound/soc/soc-compress.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c index a38fee48ee00..e692aa3b8b22 100644 --- a/sound/soc/soc-compress.c +++ b/sound/soc/soc-compress.c @@ -385,11 +385,15 @@ static int soc_compr_set_params_fe(struct snd_compr_stream *cstream, fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_FE; + snd_soc_dpcm_mutex_lock(fe); ret = dpcm_be_dai_hw_params(fe, stream); + snd_soc_dpcm_mutex_unlock(fe); if (ret < 0) goto out; + snd_soc_dpcm_mutex_lock(fe); ret = dpcm_be_dai_prepare(fe, stream); + snd_soc_dpcm_mutex_unlock(fe); if (ret < 0) goto out; From 1e5dc3989a20ccd703d5fe1a269d2bcedf29142c Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Mon, 18 Mar 2024 09:11:28 +0800 Subject: [PATCH 259/496] ALSA: hda/realtek: fix the hp playback volume issue for LG machines Recently we tested the headphone playback on 2 LG machines, if we set the volume to the max value or near to the max value, the sound is too loud, it could even bring harm to listeners. A workaround is to decrease the max volume to a reasonable value for the headphone's amplifier, then the users couldn't set the volume bigger than that value from the userspace. 
Signed-off-by: Hui Wang Message-ID: <20240318011128.156023-1-hui.wang@canonical.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e904f62e1952..d463d416fc23 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6964,6 +6964,25 @@ static void alc256_fixup_mic_no_presence_and_resume(struct hda_codec *codec, } } +static void alc256_decrease_headphone_amp_val(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + u32 caps; + u8 nsteps, offs; + + if (action != HDA_FIXUP_ACT_PRE_PROBE) + return; + + caps = query_amp_caps(codec, 0x3, HDA_OUTPUT); + nsteps = ((caps & AC_AMPCAP_NUM_STEPS) >> AC_AMPCAP_NUM_STEPS_SHIFT) - 10; + offs = ((caps & AC_AMPCAP_OFFSET) >> AC_AMPCAP_OFFSET_SHIFT) - 10; + caps &= ~AC_AMPCAP_NUM_STEPS & ~AC_AMPCAP_OFFSET; + caps |= (nsteps << AC_AMPCAP_NUM_STEPS_SHIFT) | (offs << AC_AMPCAP_OFFSET_SHIFT); + + if (snd_hda_override_amp_caps(codec, 0x3, HDA_OUTPUT, caps)) + codec_warn(codec, "failed to override amp caps for NID 0x3\n"); +} + static void alc_fixup_dell4_mic_no_presence_quiet(struct hda_codec *codec, const struct hda_fixup *fix, int action) @@ -7382,6 +7401,7 @@ enum { ALC294_FIXUP_CS35L41_I2C_2, ALC245_FIXUP_CS35L56_SPI_4_HP_GPIO_LED, ALC256_FIXUP_ACER_SFG16_MICMUTE_LED, + ALC256_FIXUP_HEADPHONE_AMP_VOL, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9581,6 +9601,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc256_fixup_acer_sfg16_micmute_led, }, + [ALC256_FIXUP_HEADPHONE_AMP_VOL] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc256_decrease_headphone_amp_val, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -10319,6 +10343,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x9e56, "Lenovo ZhaoYang CF4620Z", ALC286_FIXUP_SONY_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1849, 0x1233, "ASRock NUC Box 1100", ALC233_FIXUP_NO_AUDIO_JACK), SND_PCI_QUIRK(0x1849, 0xa233, "Positivo Master C6300", ALC269_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1854, 0x0440, "LG CQ6", ALC256_FIXUP_HEADPHONE_AMP_VOL), + SND_PCI_QUIRK(0x1854, 0x0441, "LG CQ6 AIO", ALC256_FIXUP_HEADPHONE_AMP_VOL), SND_PCI_QUIRK(0x19e5, 0x3204, "Huawei MACH-WX9", ALC256_FIXUP_HUAWEI_MACH_WX9_PINS), SND_PCI_QUIRK(0x19e5, 0x320f, "Huawei WRT-WX9 ", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1b35, 0x1235, "CZC B20", ALC269_FIXUP_CZC_B20), From 7397175cb7b48f7a3fc699083aa46f1234904c7e Mon Sep 17 00:00:00 2001 From: Kousik Sanagavarapu Date: Mon, 18 Mar 2024 21:08:34 +0530 Subject: [PATCH 260/496] spi: lm70llp: fix links in doc and comments Update links in the documentation and in-code comments which point to the datasheet and schematic. The current links don't work because National Semiconductor (which is the manufacturer of this board and lm70) has been a part of Texas Instruments since 2011 and hence http://www.national.com/ doesn't work anymore. 
Fixes: 78961a574037 ("spi_lm70llp parport adapter driver") Fixes: 2b7300513b98 ("hwmon: (lm70) Code streamlining and cleanup") Signed-off-by: Kousik Sanagavarapu Link: https://msgid.link/r/20240318154540.90613-2-five231003@gmail.com Signed-off-by: Mark Brown --- Documentation/spi/spi-lm70llp.rst | 4 ++-- drivers/spi/spi-lm70llp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/spi/spi-lm70llp.rst b/Documentation/spi/spi-lm70llp.rst index 2f20e5b405e6..ff98ddc76a74 100644 --- a/Documentation/spi/spi-lm70llp.rst +++ b/Documentation/spi/spi-lm70llp.rst @@ -6,7 +6,7 @@ Supported board/chip: * National Semiconductor LM70 LLP evaluation board - Datasheet: http://www.national.com/pf/LM/LM70.html + Datasheet: https://www.ti.com/lit/gpn/lm70 Author: Kaiwan N Billimoria @@ -28,7 +28,7 @@ Hardware Interfacing The schematic for this particular board (the LM70EVAL-LLP) is available (on page 4) here: - http://www.national.com/appinfo/tempsensors/files/LM70LLPEVALmanual.pdf + https://download.datasheets.com/pdfs/documentation/nat/kit&board/lm70llpevalmanual.pdf The hardware interfacing on the LM70 LLP eval board is as follows: diff --git a/drivers/spi/spi-lm70llp.c b/drivers/spi/spi-lm70llp.c index f982bdebd028..3c0c24ed1f3d 100644 --- a/drivers/spi/spi-lm70llp.c +++ b/drivers/spi/spi-lm70llp.c @@ -29,10 +29,10 @@ * * Datasheet and Schematic: * The LM70 is a temperature sensor chip from National Semiconductor; its - * datasheet is available at http://www.national.com/pf/LM/LM70.html + * datasheet is available at https://www.ti.com/lit/gpn/lm70 * The schematic for this particular board (the LM70EVAL-LLP) is * available (on page 4) here: - * http://www.national.com/appinfo/tempsensors/files/LM70LLPEVALmanual.pdf + * https://download.datasheets.com/pdfs/documentation/nat/kit&board/lm70llpevalmanual.pdf * * Also see Documentation/spi/spi-lm70llp.rst. The SPI<->parport code here is * (heavily) based on spi-butterfly by David Brownell. From 3fbd56f0e7c14e7c7a7597fd4a368753fe70d76f Mon Sep 17 00:00:00 2001 From: "Christoph Lameter (Ampere)" Date: Wed, 6 Mar 2024 17:45:04 -0800 Subject: [PATCH 261/496] ARM64: Dynamically allocate cpumasks and increase supported CPUs to 512 [ a.k.a. Revert "Revert "ARM64: Dynamically allocate cpumasks and increase supported CPUs to 512""; originally reverted because of a bug in the cpufreq-dt code not using zalloc_cpumask_var() ] Currently defconfig selects NR_CPUS=256, but some vendors (e.g. Ampere Computing) are planning to ship systems with 512 CPUs. So that all CPUs on these systems can be used with defconfig, we'd like to bump NR_CPUS to 512. Therefore this patch increases the default NR_CPUS from 256 to 512. As increasing NR_CPUS will increase the size of cpumasks, there's a fear that this might have a significant impact on stack usage due to code which places cpumasks on the stack. To mitigate that concern, we can select CPUMASK_OFFSTACK. As that doesn't seem to be a problem today with NR_CPUS=256, we only select this when NR_CPUS > 256. CPUMASK_OFFSTACK configures the cpumasks in the kernel to be dynamically allocated. This was used in the X86 architecture in the past to enable support for larger CPU configurations up to 8k cpus. With that is becomes possible to dynamically size the allocation of the cpu bitmaps depending on the quantity of processors detected on bootup. Memory used for cpumasks will increase if the kernel is run on a machine with more cores. 
Further increases may be needed if ARM processor vendors start supporting more processors. Given the current inflationary trends in core counts from multiple processor manufacturers this may occur. There are minor regressions for hackbench. The kernel data size for 512 cpus is smaller with offstack than with onstack. Benchmark results using hackbench average over 10 runs of hackbench -s 512 -l 2000 -g 15 -f 25 -P on Altra 80 Core Support for 256 CPUs on stack. Baseline 7.8564 sec Support for 512 CUs on stack. 7.8713 sec + 0.18% 512 CPUS offstack 7.8916 sec + 0.44% Kernel size comparison: text data filename Difference to onstack256 baseline 25755648 9589248 vmlinuz-6.8.0-rc4-onstack256 25755648 9607680 vmlinuz-6.8.0-rc4-onstack512 +0.19% 25755648 9603584 vmlinuz-6.8.0-rc4-offstack512 +0.14% Tested-by: Eric Mackay Reviewed-by: Russell King (Oracle) Signed-off-by: Christoph Lameter (Ampere) Acked-by: Mark Rutland Link: https://lore.kernel.org/r/37099a57-b655-3b3a-56d0-5f7fbd49d7db@gentwo.org Link: https://lore.kernel.org/r/20240314125457.186678-1-m.szyprowski@samsung.com [catalin.marinas@arm.com: use 'select' instead of duplicating 'config CPUMASK_OFFSTACK'] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4869265ace2d..a03de40bd4cd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -120,6 +120,7 @@ config ARM64 select CLONE_BACKWARDS select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) + select CPUMASK_OFFSTACK if NR_CPUS > 256 select CRC32 select DCACHE_WORD_ACCESS select DYNAMIC_FTRACE if FUNCTION_TRACER @@ -1430,7 +1431,7 @@ config SCHED_SMT config NR_CPUS int "Maximum number of CPUs (2-4096)" range 2 4096 - default "256" + default "512" config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" From 0ef58ccb6178b1a40edfd027d8a11a52fa629215 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 Mar 2024 11:56:10 -0700 Subject: [PATCH 262/496] selftests/exec: execveat: Improve debug reporting Children processes were reporting their status, duplicating the parent's. Remove that, and add some additional details about the test execution. Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240313185606.work.073-kees@kernel.org Signed-off-by: Kees Cook --- tools/testing/selftests/exec/execveat.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c index 0546ca24f2b2..6418ded40bdd 100644 --- a/tools/testing/selftests/exec/execveat.c +++ b/tools/testing/selftests/exec/execveat.c @@ -98,10 +98,9 @@ static int check_execveat_invoked_rc(int fd, const char *path, int flags, if (child == 0) { /* Child: do execveat(). */ rc = execveat_(fd, path, argv, envp, flags); - ksft_print_msg("execveat() failed, rc=%d errno=%d (%s)\n", + ksft_print_msg("child execveat() failed, rc=%d errno=%d (%s)\n", rc, errno, strerror(errno)); - ksft_test_result_fail("%s\n", test_name); - exit(1); /* should not reach here */ + exit(errno); } /* Parent: wait for & check child's exit status. */ rc = waitpid(child, &status, 0); @@ -226,11 +225,14 @@ static int check_execveat_pathmax(int root_dfd, const char *src, int is_script) * "If the command name is found, but it is not an executable utility, * the exit status shall be 126."), so allow either. 
*/ - if (is_script) + if (is_script) { + ksft_print_msg("Invoke script via root_dfd and relative filename\n"); fail += check_execveat_invoked_rc(root_dfd, longpath + 1, 0, 127, 126); - else + } else { + ksft_print_msg("Invoke exec via root_dfd and relative filename\n"); fail += check_execveat(root_dfd, longpath + 1, 0); + } return fail; } From 472874cf7bb34895ae69483338359df84e76f3e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 15 Mar 2024 11:26:35 -0700 Subject: [PATCH 263/496] selftests/exec: Convert remaining /bin/sh to /bin/bash As was intended with commit 17107429947b ("selftests/exec: Perform script checks with /bin/bash"), convert the other instance of /bin/sh to /bin/bash. It appears that at least Debian Bookworm's /bin/sh (dash) does not conform to POSIX's "return 127 when script not found" requirement. Fixes: 17107429947b ("selftests/exec: Perform script checks with /bin/bash") Reported-by: Muhammad Usama Anjum Closes: https://lore.kernel.org/lkml/02c8bf8e-1934-44ab-a886-e065b37366a7@collabora.com/ Signed-off-by: Kees Cook --- tools/testing/selftests/exec/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index a0b8688b0836..fb4472ddffd8 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -19,8 +19,8 @@ include ../lib.mk $(OUTPUT)/subdir: mkdir -p $@ -$(OUTPUT)/script: - echo '#!/bin/sh' > $@ +$(OUTPUT)/script: Makefile + echo '#!/bin/bash' > $@ echo 'exit $$*' >> $@ chmod +x $@ $(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat From 77fcc34769c8a0a228af32c52ba7d3ef64690c0d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 12 Mar 2024 20:45:52 -0700 Subject: [PATCH 264/496] ubsan: Disable signed integer overflow sanitizer on GCC < 8 For opting functions out of sanitizer coverage, the "no_sanitize" attribute is used, but in GCC this wasn't introduced until GCC 8. Disable the sanitizer unless we're not using GCC, or it is GCC version 8 or higher. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403110643.27JXEVCI-lkp@intel.com/ Reviewed-by: Marco Elver Signed-off-by: Kees Cook --- lib/Kconfig.ubsan | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 48a67058f84e..e81e1ac4a919 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -119,6 +119,8 @@ config UBSAN_SIGNED_WRAP bool "Perform checking for signed arithmetic wrap-around" default UBSAN depends on !COMPILE_TEST + # The no_sanitize attribute was introduced in GCC with version 8. + depends on !CC_IS_GCC || GCC_VERSION >= 80000 depends on $(cc-option,-fsanitize=signed-integer-overflow) help This option enables -fsanitize=signed-integer-overflow which checks From c4ca2276f18ee638e4bb156126e6e1bf5e09f28e Mon Sep 17 00:00:00 2001 From: Liu Song Date: Wed, 13 Mar 2024 11:10:11 -0700 Subject: [PATCH 265/496] arch/Kconfig: eliminate needless UTF-8 character in Kconfig help Use "find ./linux/* | grep Kconfig | xargs file | grep UTF", can find files with utf-8 encoded characters, these files will display garbled characters in menuconfig, except for characters with special meanings that cannot be modified, modify the characters with obvious errors to eliminate the wrong display under meunconfig. 
Signed-off-by: Liu Song Suggested-by: Bjorn Helgaas Acked-by: Randy Dunlap Tested-by: Randy Dunlap Link: https://lore.kernel.org/lkml/1659435153-119538-1-git-send-email-liusong@linux.alibaba.com/ Signed-off-by: Kees Cook --- arch/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index fd18b7db2c77..e8d3169ce9df 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -799,7 +799,7 @@ config CFI_CLANG depends on ARCH_SUPPORTS_CFI_CLANG depends on $(cc-option,-fsanitize=kcfi) help - This option enables Clang’s forward-edge Control Flow Integrity + This option enables Clang's forward-edge Control Flow Integrity (CFI) checking, where the compiler injects a runtime check to each indirect function call to ensure the target is a valid function with the correct static type. This restricts possible call targets and From acd80cdcee17eb770fcb2b0dc659b78f369d8c01 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 14 Mar 2024 08:12:00 -0700 Subject: [PATCH 266/496] Revert "kunit: memcpy: Split slow memcpy tests into MEMCPY_SLOW_KUNIT_TEST" This reverts commit 4acf1de35f41549e60c3c02a8defa7cb95eabdf2. Commit d055c6a2cc16 ("kunit: memcpy: Mark tests as slow using test attributes") marks slow memcpy unit tests as slow. Since this commit, the tests can be disabled with a module parameter, and the configuration option to skip the slow tests is no longer needed. Revert the patch introducing it. Cc: David Gow Cc: Kees Cook Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20240314151200.2285314-1-linux@roeck-us.net Signed-off-by: Kees Cook --- lib/Kconfig.debug | 12 ------------ lib/memcpy_kunit.c | 3 --- 2 files changed, 15 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 733ee2ac0138..9ac4cb3eb20b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2703,18 +2703,6 @@ config MEMCPY_KUNIT_TEST If unsure, say N. -config MEMCPY_SLOW_KUNIT_TEST - bool "Include exhaustive memcpy tests" - depends on MEMCPY_KUNIT_TEST - default y - help - Some memcpy tests are quite exhaustive in checking for overlaps - and bit ranges. These can be very slow, so they are split out - as a separate config, in case they need to be disabled. - - Note this config option will be replaced by the use of KUnit test - attributes. - config IS_SIGNED_TYPE_KUNIT_TEST tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c index 30e00ef0bf2e..fd16e6ce53d1 100644 --- a/lib/memcpy_kunit.c +++ b/lib/memcpy_kunit.c @@ -309,9 +309,6 @@ static void set_random_nonzero(struct kunit *test, u8 *byte) static void init_large(struct kunit *test) { - if (!IS_ENABLED(CONFIG_MEMCPY_SLOW_KUNIT_TEST)) - kunit_skip(test, "Slow test skipped. Enable with CONFIG_MEMCPY_SLOW_KUNIT_TEST=y"); - /* Get many bit patterns. */ get_random_bytes(large_src, ARRAY_SIZE(large_src)); From dce0919c83c325ac9dec5bc8838d5de6d32c01b1 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 18 Mar 2024 08:50:40 +0000 Subject: [PATCH 267/496] irqchip/renesas-rzg2l: Do not set TIEN and TINT source at the same time As per the hardware team, TIEN and TINT source should not set at the same time due to a possible hardware race leading to spurious IRQ. Currently on some scenarios hardware settings for TINT detection is not in sync with TINT source as the enable/disable overrides source setting value leading to hardware inconsistent state. For eg: consider the case GPIOINT0 is used as TINT interrupt and configuring GPIOINT5 as edge type. 
During rzg2l_irq_set_type(), TINT source for GPIOINT5 is set. On disable(), clearing of the entire bytes of TINT source selection for GPIOINT5 is same as GPIOINT0 with TIEN disabled. Apart from this during enable(), the setting of GPIOINT5 with TIEN results in spurious IRQ as due to a HW race, it is possible that IP can use the TIEN with previous source value (GPIOINT0). So, just update TIEN during enable/disable as TINT source is already set during rzg2l_irq_set_type(). This will make the consistent hardware settings for detection method tied with TINT source and allows to simplify the code. Signed-off-by: Biju Das Signed-off-by: Thomas Gleixner --- drivers/irqchip/irq-renesas-rzg2l.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-renesas-rzg2l.c b/drivers/irqchip/irq-renesas-rzg2l.c index 8803facbb3a2..ae67fec2ab46 100644 --- a/drivers/irqchip/irq-renesas-rzg2l.c +++ b/drivers/irqchip/irq-renesas-rzg2l.c @@ -151,7 +151,7 @@ static void rzg2l_irqc_irq_disable(struct irq_data *d) raw_spin_lock(&priv->lock); reg = readl_relaxed(priv->base + TSSR(tssr_index)); - reg &= ~(TSSEL_MASK << TSSEL_SHIFT(tssr_offset)); + reg &= ~(TIEN << TSSEL_SHIFT(tssr_offset)); writel_relaxed(reg, priv->base + TSSR(tssr_index)); raw_spin_unlock(&priv->lock); } @@ -163,7 +163,6 @@ static void rzg2l_irqc_irq_enable(struct irq_data *d) unsigned int hw_irq = irqd_to_hwirq(d); if (hw_irq >= IRQC_TINT_START && hw_irq < IRQC_NUM_IRQ) { - unsigned long tint = (uintptr_t)irq_data_get_irq_chip_data(d); struct rzg2l_irqc_priv *priv = irq_data_to_priv(d); u32 offset = hw_irq - IRQC_TINT_START; u32 tssr_offset = TSSR_OFFSET(offset); @@ -172,7 +171,7 @@ static void rzg2l_irqc_irq_enable(struct irq_data *d) raw_spin_lock(&priv->lock); reg = readl_relaxed(priv->base + TSSR(tssr_index)); - reg |= (TIEN | tint) << TSSEL_SHIFT(tssr_offset); + reg |= TIEN << TSSEL_SHIFT(tssr_offset); writel_relaxed(reg, priv->base + TSSR(tssr_index)); raw_spin_unlock(&priv->lock); } From e89086c43f0500bc7c4ce225495b73b8ce234c1f Mon Sep 17 00:00:00 2001 From: "Jiawei Fu (iBug)" Date: Sat, 16 Mar 2024 03:27:49 +0800 Subject: [PATCH 268/496] drivers/nvme: Add quirks for device 126f:2262 This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for device [126f:2262], which appears to be a generic VID:PID pair used for many SSDs based on the Silicon Motion SM2262/SM2262EN controller. Two of my SSDs with this VID:PID pair exhibit the same behavior: * They frequently have trouble exiting the deepest power state (5), resulting in the entire disk unresponsive. Verified by setting nvme_core.default_ps_max_latency_us=10000 and observing them behaving normally. * They produce all-zero nguid and eui64 with `nvme id-ns` command. 
The offending products are: * HP SSD EX950 1TB * HIKVISION C2000Pro 2TB Signed-off-by: Jiawei Fu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6267a6aa380..8e0bb9692685 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3363,6 +3363,9 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_BOGUS_NID, }, From ec58afb49e90d6fd468b0e21d2de324dff1a711c Mon Sep 17 00:00:00 2001 From: Li Feng Date: Wed, 13 Mar 2024 20:38:09 +0800 Subject: [PATCH 269/496] nvme-tcp: Export the nvme_tcp_wq to sysfs Make the workqueue userspace visible for easy viewing and configuration. Signed-off-by: Li Feng Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a6d596e05602..2ec1186db0a3 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2800,7 +2800,7 @@ static int __init nvme_tcp_init_module(void) BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0); if (!nvme_tcp_wq) return -ENOMEM; From 0c29f9fa46bbe4fdc218134823d80cf9934ef231 Mon Sep 17 00:00:00 2001 From: Li Feng Date: Wed, 13 Mar 2024 20:38:10 +0800 Subject: [PATCH 270/496] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq The default nvme_tcp_wq will use all CPUs to process tasks. Sometimes it is necessary to set CPU affinity to improve performance. A new module parameter wq_unbound is added here. If set to true, users can configure cpu affinity through /sys/devices/virtual/workqueue/nvme_tcp_wq/cpumask. Signed-off-by: Li Feng Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2ec1186db0a3..34a882b2ec53 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -36,6 +36,14 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); +/* + * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity + * from sysfs. 
+ */ +static bool wq_unbound; +module_param(wq_unbound, bool, 0644); +MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)"); + /* * TLS handshake timeout */ @@ -1551,7 +1559,10 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) else if (nvme_tcp_poll_queue(queue)) n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - ctrl->io_queues[HCTX_TYPE_READ] - 1; - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + if (wq_unbound) + queue->io_cpu = WORK_CPU_UNBOUND; + else + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) @@ -2790,6 +2801,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { static int __init nvme_tcp_init_module(void) { + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); @@ -2799,8 +2812,10 @@ static int __init nvme_tcp_init_module(void) BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); - nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0); + if (wq_unbound) + wq_flags |= WQ_UNBOUND; + + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); if (!nvme_tcp_wq) return -ENOMEM; From 09927e7ef11fe65a9cc38b6f74a9d6dba31c8c25 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 17 Jan 2024 12:42:11 +0800 Subject: [PATCH 271/496] ceph: break the check delayed cap loop every 5s In some cases this may take a long time and will block renewing the caps to MDS. [ idryomov: massage comment ] Link: https://tracker.ceph.com/issues/50223#note-21 Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7fb4aae97412..55051ad09c19 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4634,6 +4634,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) iput(inode); spin_lock(&mdsc->cap_delay_lock); } + + /* + * Make sure too many dirty caps or general + * slowness doesn't block mdsc delayed work, + * preventing send_renew_caps() from running. + */ + if (jiffies - loop_start >= 5 * HZ) + break; } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); From a8922f79671f0e81c6e4fe8d2fc6cae0cd32cd7a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:47:15 +0000 Subject: [PATCH 272/496] ceph: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1, so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series [1] went on to mark it obsolete to avoid confusion for users. Here we can just remove all its users, which has no functional change. 
[1] https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz/ Signed-off-by: Chengming Zhou Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 5ec102f6b1ac..4dcbbaa297f6 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -928,36 +928,36 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + ceph_inode_init_once); if (!ceph_inode_cachep) return -ENOMEM; - ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); + ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0); if (!ceph_cap_cachep) goto bad_cap; - ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0); if (!ceph_cap_snap_cachep) goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT); if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT); if (!ceph_dentry_cachep) goto bad_dentry; - ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); + ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0); if (!ceph_file_cachep) goto bad_file; - ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0); if (!ceph_dir_file_cachep) goto bad_dir_file; - ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD); + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0); if (!ceph_mds_request_cachep) goto bad_mds_req; From cf6d79a0f5769b5f4d9579ddaf88d2c30b03b873 Mon Sep 17 00:00:00 2001 From: Adam Butcher Date: Mon, 18 Mar 2024 17:50:52 +0000 Subject: [PATCH 273/496] spi: spi-imx: fix off-by-one in mx51 CPU mode burst length c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") corrects three cases of setting the ECSPI burst length but erroneously leaves the in-range CPU case one bit to big (in that field a value of 0 means 1 bit). The effect was that transmissions that should have been 8-bit bytes appeared as 9-bit causing failed communication with SPI devices. 
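To make the off-by-one concrete, here is a small editorial sketch (not part of the patch; the example values are assumed) of the in-range CPU-mode calculation for a single 8-bit word:

	/*
	 * Illustration only: count = 1 byte, bits_per_word = 8, matching the
	 * "8-bit bytes appeared as 9-bit" symptom described above.
	 */
	unsigned int count = 1, bits_per_word = 8;
	unsigned int burst_bits = count / DIV_ROUND_UP(bits_per_word, BITS_PER_BYTE)
				  * bits_per_word;	/* = 8 */

	/*
	 * The ECSPI BURST_LENGTH field encodes "number of bits minus one",
	 * so the value to program here is 7.  The pre-fix code programmed 8,
	 * which the controller treats as a 9-bit burst.
	 */
	u32 bl_field = burst_bits - 1;			/* 7, not 8 */
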
Link: https://lore.kernel.org/all/20240201105451.507005-1-carlos.song@nxp.com/ Link: https://lore.kernel.org/all/20240204091912.36488-1-carlos.song@nxp.com/ Fixes: c712c05e46c8 ("spi: imx: fix the burst length at DMA mode and CPU mode") Signed-off-by: Adam Butcher Link: https://msgid.link/r/20240318175119.3334-1-adam@jessamine.co.uk Signed-off-by: Mark Brown --- drivers/spi/spi-imx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 833a1bb7a914..c3e5cee18bea 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -668,8 +668,8 @@ static int mx51_ecspi_prepare_transfer(struct spi_imx_data *spi_imx, ctrl |= (MX51_ECSPI_CTRL_MAX_BURST * BITS_PER_BYTE - 1) << MX51_ECSPI_CTRL_BL_OFFSET; else - ctrl |= spi_imx->count / DIV_ROUND_UP(spi_imx->bits_per_word, - BITS_PER_BYTE) * spi_imx->bits_per_word + ctrl |= (spi_imx->count / DIV_ROUND_UP(spi_imx->bits_per_word, + BITS_PER_BYTE) * spi_imx->bits_per_word - 1) << MX51_ECSPI_CTRL_BL_OFFSET; } } From 807f96abdf14c80f534c78f2d854c2590963345c Mon Sep 17 00:00:00 2001 From: Arthur Grillo Date: Sat, 16 Mar 2024 13:25:20 -0300 Subject: [PATCH 274/496] drm: Fix drm_fixp2int_round() making it add 0.5 As well noted by Pekka[1], the rounding of drm_fixp2int_round is wrong. To round a number, you need to add 0.5 to the number and floor that, drm_fixp2int_round() is adding 0.0000076. Make it add 0.5. [1]: https://lore.kernel.org/all/20240301135327.22efe0dd.pekka.paalanen@collabora.com/ Fixes: 8b25320887d7 ("drm: Add fixed-point helper to get rounded integer values") Suggested-by: Pekka Paalanen Reviewed-by: Harry Wentland Reviewed-by: Melissa Wen Signed-off-by: Arthur Grillo Signed-off-by: Melissa Wen Link: https://patchwork.freedesktop.org/patch/msgid/20240316-drm_fixed-v2-1-c1bc2665b5ed@riseup.net --- include/drm/drm_fixed.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/drm/drm_fixed.h b/include/drm/drm_fixed.h index 0c9f917a4d4b..81572d32db0c 100644 --- a/include/drm/drm_fixed.h +++ b/include/drm/drm_fixed.h @@ -71,7 +71,6 @@ static inline u32 dfixed_div(fixed20_12 A, fixed20_12 B) } #define DRM_FIXED_POINT 32 -#define DRM_FIXED_POINT_HALF 16 #define DRM_FIXED_ONE (1ULL << DRM_FIXED_POINT) #define DRM_FIXED_DECIMAL_MASK (DRM_FIXED_ONE - 1) #define DRM_FIXED_DIGITS_MASK (~DRM_FIXED_DECIMAL_MASK) @@ -90,7 +89,7 @@ static inline int drm_fixp2int(s64 a) static inline int drm_fixp2int_round(s64 a) { - return drm_fixp2int(a + (1 << (DRM_FIXED_POINT_HALF - 1))); + return drm_fixp2int(a + DRM_FIXED_ONE / 2); } static inline int drm_fixp2int_ceil(s64 a) From 50171b8667733146f139c773d8f00866ceb4cee4 Mon Sep 17 00:00:00 2001 From: Yufeng Wang Date: Tue, 19 Mar 2024 09:42:19 +0800 Subject: [PATCH 275/496] floppy: remove duplicated code in redo_fd_request() duplicated code in redo_fd_request(), unlock_fdc() function has the same code "do_floppy = NULL" inside. 
Signed-off-by: Yufeng Wang Suggested-by: Denis Efremov Link: https://lore.kernel.org/r/20240319014219.7812-1-wangyufeng@kylinos.cn Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 1b399ec8c07d..25c9d85667f1 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2787,7 +2787,6 @@ do_request: pending = set_next_request(); spin_unlock_irq(&floppy_lock); if (!pending) { - do_floppy = NULL; unlock_fdc(); return; } From 1251d2025c3e1bcf1f17ec0f3c0dfae5e5bbb146 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 20:22:42 -0600 Subject: [PATCH 276/496] io_uring/sqpoll: early exit thread if task_context wasn't allocated Ideally we'd want to simply kill the task rather than wake it, but for now let's just add a startup check that causes the thread to exit. This can only happen if io_uring_alloc_task_context() fails, which generally requires fault injection. Reported-by: Ubisectech Sirius Fixes: af5d68f8892f ("io_uring/sqpoll: manage task_work privately") Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 363052b4ea76..3983708cef5b 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -274,6 +274,10 @@ static int io_sq_thread(void *data) char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); + /* offload context creation failed, just exit */ + if (!current->io_uring) + goto err_out; + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); @@ -371,7 +375,7 @@ static int io_sq_thread(void *data) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); mutex_unlock(&sqd->lock); - +err_out: complete(&sqd->exited); do_exit(0); } From 8b5db5e5337ecb0a7954c39020861867a85b6206 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 19 Mar 2024 15:50:27 +0800 Subject: [PATCH 277/496] LoongArch: Select ARCH_HAS_CURRENT_STACK_POINTER in Kconfig LoongArch has implemented the current_stack_pointer macro, so select ARCH_HAS_CURRENT_STACK_POINTER in Kconfig. This will let it be used in non-arch places (like HARDENED_USERCOPY). Reviewed-by: Guo Ren Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 99a0a15ce5f7..6df2b421895a 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -15,6 +15,7 @@ config LOONGARCH select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_KCOV select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS From f48ad26e5e57016b447461f22ecf7c3ed8337353 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 278/496] LoongArch: Select HAVE_ARCH_USERFAULTFD_MINOR in Kconfig This allocates the VM flag needed to support the userfaultfd minor fault functionality. See commit 7677f7fd8be7665 ("userfaultfd: add minor fault registration mode") for more information. 
Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 6df2b421895a..526a88598fbf 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -105,6 +105,7 @@ config LOONGARCH select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD select HAVE_ASM_MODVERSIONS select HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT From c87e12e0e8c1241410e758e181ca6bf23efa5b5b Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 279/496] LoongArch: Change __my_cpu_offset definition to avoid mis-optimization From GCC commit 3f13154553f8546a ("df-scan: remove ad-hoc handling of global regs in asms"), global registers will no longer be forced to add to the def-use chain. Then current_thread_info(), current_stack_pointer and __my_cpu_offset may be lifted out of the loop because they are no longer treated as "volatile variables". This optimization is still correct for the current_thread_info() and current_stack_pointer usages because they are associated to a thread. However it is wrong for __my_cpu_offset because it is associated to a CPU rather than a thread: if the thread migrates to a different CPU in the loop, __my_cpu_offset should be changed. Change __my_cpu_offset definition to treat it as a "volatile variable", in order to avoid such a mis-optimization. Cc: stable@vger.kernel.org Reported-by: Xiaotian Wu Reported-by: Miao Wang Signed-off-by: Xing Li Signed-off-by: Hongchen Zhang Signed-off-by: Rui Wang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/percpu.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/percpu.h b/arch/loongarch/include/asm/percpu.h index 9b36ac003f89..8f290e5546cf 100644 --- a/arch/loongarch/include/asm/percpu.h +++ b/arch/loongarch/include/asm/percpu.h @@ -29,7 +29,12 @@ static inline void set_my_cpu_offset(unsigned long off) __my_cpu_offset = off; csr_write64(off, PERCPU_BASE_KS); } -#define __my_cpu_offset __my_cpu_offset + +#define __my_cpu_offset \ +({ \ + __asm__ __volatile__("":"+r"(__my_cpu_offset)); \ + __my_cpu_offset; \ +}) #define PERCPU_OP(op, asm_op, c_op) \ static __always_inline unsigned long __percpu_##op(void *ptr, \ From d42ab9af605ee406ec339e5e80a1c3a708637fd6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 280/496] LoongArch: Move {dmw,tlb}_virt_to_page() definition to page.h These two functions are implemented in pgtable.c, and they are needed only by the virt_to_page() macro in page.h. Having the prototypes in pgtable.h causes a circular dependency between page.h and pgtable.h, because the virt_to_page() macro in page.h needs pgtable.h for these two functions, while pgtable.h needs various definitions from page.h (e.g. pte_t and pgt_t). Let's avoid this circular dependency by moving the function prototypes to page.h. 
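Since the cycle itself is only described in words above, here is a toy two-header example (editorial, not the LoongArch headers) of the same kind of fix, where declaring the helper in the lower-level header removes the back-edge:

	/* toy_page.h: provides the basic types and, after the fix, the prototype */
	#ifndef TOY_PAGE_H
	#define TOY_PAGE_H
	typedef struct { unsigned long val; } pte_t;		/* needed by toy_pgtable.h */
	struct page *toy_virt_to_page(unsigned long kaddr);	/* prototype lives here now */
	#define toy_virt_to_page_of(kaddr) toy_virt_to_page(kaddr)
	#endif

	/* toy_pgtable.h: may include toy_page.h freely, no cycle remains */
	#ifndef TOY_PGTABLE_H
	#define TOY_PGTABLE_H
	#include "toy_page.h"
	static inline int toy_pte_none(pte_t pte) { return pte.val == 0; }
	#endif
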
Signed-off-by: Max Kellermann Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/page.h | 3 +++ arch/loongarch/include/asm/pgtable.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h index 63f137ce82a4..2391fcd18908 100644 --- a/arch/loongarch/include/asm/page.h +++ b/arch/loongarch/include/asm/page.h @@ -83,6 +83,9 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) #define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) +struct page *dmw_virt_to_page(unsigned long kaddr); +struct page *tlb_virt_to_page(unsigned long kaddr); + #define virt_to_pfn(kaddr) PFN_DOWN(PHYSADDR(kaddr)) #define virt_to_page(kaddr) \ diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 8b5df1bbf9e9..af3acdf3481a 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -363,9 +363,6 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt extern pgd_t swapper_pg_dir[]; extern pgd_t invalid_pg_dir[]; -struct page *dmw_virt_to_page(unsigned long kaddr); -struct page *tlb_virt_to_page(unsigned long kaddr); - /* * The following only work if pte_present() is true. * Undefined behaviour if not.. From 82bf60a6fed806d57e284a1fb40dbc1ad5097611 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 281/496] LoongArch: Remove superfluous flush_dcache_page() definition LoongArch doesn't have cache aliases, so flush_dcache_page() is a no-op. There is a generic implementation for this case in include/asm-generic/ cacheflush.h. So remove the superfluous flush_dcache_page() definition, which also silences such build warnings: In file included from crypto/scompress.c:12: include/crypto/scatterwalk.h: In function 'scatterwalk_pagedone': include/crypto/scatterwalk.h:76:30: warning: variable 'page' set but not used [-Wunused-but-set-variable] 76 | struct page *page; | ^~~~ crypto/scompress.c: In function 'scomp_acomp_comp_decomp': >> crypto/scompress.c:174:38: warning: unused variable 'dst_page' [-Wunused-variable] 174 | struct page *dst_page = sg_page(req->dst); | Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403091614.NeUw5zcv-lkp@intel.com/ Suggested-by: Barry Song Acked-by: Barry Song Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/cacheflush.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 80bd74106985..f8754d08a31a 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -37,8 +37,6 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range local_flush_icache_range #define flush_icache_user_range local_flush_icache_range -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - #define flush_cache_all() do { } while (0) #define flush_cache_mm(mm) do { } while (0) #define flush_cache_dup_mm(mm) do { } while (0) @@ -47,7 +45,6 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #define flush_icache_user_page(vma, page, addr, len) do { } while (0) -#define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } 
while (0) From 9c68ece8b2a5c5ff9b2fcaea923dd73efeb174cd Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 282/496] LoongArch: Define the __io_aw() hook as mmiowb() Commit fb24ea52f78e0d595852e ("drivers: Remove explicit invocations of mmiowb()") remove all mmiowb() in drivers, but it says: "NOTE: mmiowb() has only ever guaranteed ordering in conjunction with spin_unlock(). However, pairing each mmiowb() removal in this patch with the corresponding call to spin_unlock() is not at all trivial, so there is a small chance that this change may regress any drivers incorrectly relying on mmiowb() to order MMIO writes between CPUs using lock-free synchronisation." The mmio in radeon_ring_commit() is protected by a mutex rather than a spinlock, but in the mutex fastpath it behaves similar to spinlock. We can add mmiowb() calls in the radeon driver but the maintainer says he doesn't like such a workaround, and radeon is not the only example of mutex protected mmio. So we should extend the mmiowb tracking system from spinlock to mutex, and maybe other locking primitives. This is not easy and error prone, so we solve it in the architectural code, by simply defining the __io_aw() hook as mmiowb(). And we no longer need to override queued_spin_unlock() so use the generic definition. Without this, we get such an error when run 'glxgears' on weak ordering architectures such as LoongArch: radeon 0000:04:00.0: ring 0 stalled for more than 10324msec radeon 0000:04:00.0: ring 3 stalled for more than 10240msec radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000001f412 last fence id 0x000000000001f414 on ring 3) radeon 0000:04:00.0: GPU lockup (current fence id 0x000000000000f940 last fence id 0x000000000000f941 on ring 0) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). [drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) radeon 0000:04:00.0: scheduling IB failed (-35). 
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35) Link: https://lore.kernel.org/dri-devel/29df7e26-d7a8-4f67-b988-44353c4270ac@amd.com/T/#t Link: https://lore.kernel.org/linux-arch/20240301130532.3953167-1-chenhuacai@loongson.cn/T/#t Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/Kbuild | 1 + arch/loongarch/include/asm/io.h | 2 ++ arch/loongarch/include/asm/qspinlock.h | 18 ------------------ 3 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 arch/loongarch/include/asm/qspinlock.h diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index a97c0edbb866..2dbec7853ae8 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -6,6 +6,7 @@ generic-y += mcs_spinlock.h generic-y += parport.h generic-y += early_ioremap.h generic-y += qrwlock.h +generic-y += qspinlock.h generic-y += rwsem.h generic-y += segment.h generic-y += user.h diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index c486c2341b66..4a8adcca329b 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -71,6 +71,8 @@ extern void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t #define memcpy_fromio(a, c, l) __memcpy_fromio((a), (c), (l)) #define memcpy_toio(c, a, l) __memcpy_toio((c), (a), (l)) +#define __io_aw() mmiowb() + #include #define ARCH_HAS_VALID_PHYS_ADDR_RANGE diff --git a/arch/loongarch/include/asm/qspinlock.h b/arch/loongarch/include/asm/qspinlock.h deleted file mode 100644 index 34f43f8ad591..000000000000 --- a/arch/loongarch/include/asm/qspinlock.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_QSPINLOCK_H -#define _ASM_QSPINLOCK_H - -#include - -#define queued_spin_unlock queued_spin_unlock - -static inline void queued_spin_unlock(struct qspinlock *lock) -{ - compiletime_assert_atomic_type(lock->locked); - c_sync(); - WRITE_ONCE(lock->locked, 0); -} - -#include - -#endif /* _ASM_QSPINLOCK_H */ From fea1c949f6ca5059e12de00d0483645debc5b206 Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Tue, 19 Mar 2024 15:50:34 +0800 Subject: [PATCH 283/496] LoongArch/crypto: Clean up useless assignment operations The LoongArch CRC32 hw acceleration is based on arch/mips/crypto/ crc32-mips.c. While the MIPS code supports both MIPS32 and MIPS64, but LoongArch32 lacks the CRC instruction. As a result, the line "len -= sizeof(u32)" is unnecessary. Removing it can make context code style more unified and improve code readability. 
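For illustration, the assignment is dead in the LoongArch (64-bit only) driver because the preceding loop consumes the buffer in 8-byte steps, so len is below 8 when the tail is reached, and each remaining branch tests a single bit of len. The sketch below is a simplified, self-contained stand-in (it sums bytes instead of running the real CRC32 instructions, and the helper name is invented for this example); it is not the driver code itself:

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Tail handling after a loop that consumed 8 bytes at a time,
	 * so len < 8 here.  Each branch tests one bit of len (4, 2, 1).
	 * Doing "len -= sizeof(uint32_t)" in the first branch would only
	 * clear bit 2, which no later test examines - hence the removed
	 * line had no effect.
	 */
	static uint32_t tail_demo(uint32_t acc, const uint8_t *p, size_t len)
	{
		size_t i;

		if (len & sizeof(uint32_t)) {	/* bit 2: 4 bytes remain */
			for (i = 0; i < sizeof(uint32_t); i++)
				acc += *p++;
			/* no "len -= sizeof(uint32_t)" needed */
		}
		if (len & sizeof(uint16_t)) {	/* bit 1: 2 bytes remain */
			for (i = 0; i < sizeof(uint16_t); i++)
				acc += *p++;
		}
		if (len & 1)			/* bit 0: final byte */
			acc += *p;

		return acc;
	}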
Cc: stable@vger.kernel.org Reviewed-by: WANG Xuerui Suggested-by: Wentao Guan Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/crypto/crc32-loongarch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/loongarch/crypto/crc32-loongarch.c b/arch/loongarch/crypto/crc32-loongarch.c index a49e507af38c..3eebea3a7b47 100644 --- a/arch/loongarch/crypto/crc32-loongarch.c +++ b/arch/loongarch/crypto/crc32-loongarch.c @@ -44,7 +44,6 @@ static u32 crc32_loongarch_hw(u32 crc_, const u8 *p, unsigned int len) CRC32(crc, value, w); p += sizeof(u32); - len -= sizeof(u32); } if (len & sizeof(u16)) { @@ -80,7 +79,6 @@ static u32 crc32c_loongarch_hw(u32 crc_, const u8 *p, unsigned int len) CRC32C(crc, value, w); p += sizeof(u32); - len -= sizeof(u32); } if (len & sizeof(u16)) { From f55acb1e44f3d4bf1ca7926d777895a67d4ec606 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Mar 2024 00:07:28 +0100 Subject: [PATCH 284/496] timers/migration: Fix endless timer requeue after idle interrupts When a CPU is an idle migrator, but another CPU wakes up before it, becomes an active migrator and handles the queue, the initial idle migrator may end up endlessly reprogramming its clockevent, chasing ghost timers forever such as in the following scenario: [GRP0:0] migrator = 0 active = 0 nextevt = T1 / \ 0 1 active idle (T1) 0) CPU 1 is idle and has a timer queued (T1), CPU 0 is active and is the active migrator. [GRP0:0] migrator = NONE active = NONE nextevt = T1 / \ 0 1 idle idle (T1) wakeup = T1 1) CPU 0 is now idle and is therefore the idle migrator. It has programmed its next timer interrupt to handle T1. [GRP0:0] migrator = 1 active = 1 nextevt = KTIME_MAX / \ 0 1 idle active wakeup = T1 2) CPU 1 has woken up, it is now active and it has just handled its own timer T1. 3) CPU 0 gets a timer interrupt to handle T1 but tmigr_handle_remote() realizes it is not the migrator anymore. So it returns early without observing that T1 has already expired and therefore without updating its ->wakeup value. 4) CPU 0 goes into tmigr_cpu_new_timer() which also returns early because it doesn't queue a timer of its own. So ->wakeup is left unchanged and the next timer is programmed to fire now. 5) goto 3) forever This results in timer interrupt storms in idle and also in nohz_full (as observed in rcutorture's TREE07 scenario). Fix this by forcing a re-evaluation of tmc->wakeup while trying remote timer handling when the CPU isn't the migrator anymore. The check is inherently racy, but in the worst case the CPU just races setting the KTIME_MAX value that a remote expiry also tries to set. Fixes: 7ee988770326 ("timers: Implement the hierarchical pull model") Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240318230729.15497-2-frederic@kernel.org --- kernel/time/timer_migration.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 611cd904f035..c63a0afdcebe 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1038,8 +1038,15 @@ void tmigr_handle_remote(void) * in tmigr_handle_remote_up() anyway. Keep this check to speed up the * return when nothing has to be done.
*/ - if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) - return; + if (!tmigr_check_migrator(tmc->tmgroup, tmc->childmask)) { + /* + * If this CPU was an idle migrator, make sure to clear its wakeup + * value so it won't chase timers that have already expired elsewhere. + * This avoids endless requeue from tmigr_new_timer(). + */ + if (READ_ONCE(tmc->wakeup) == KTIME_MAX) + return; + } data.now = get_jiffies_update(&data.basej); From 03877039863be021a19fda307136657bb6d61f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Mar 2024 00:07:29 +0100 Subject: [PATCH 285/496] timers: Fix removed self-IPI on global timer's enqueue in nohz_full While running in nohz_full mode, a task may enqueue a timer while the tick is stopped. However the only places where the timer wheel, alongside the timer migration machinery's decision, may reprogram the next event accordingly with that new timer's expiry are the idle loop or any IRQ tail. However neither the idle task nor an interrupt may run on the CPU if it resumes busy work in userspace for a long while in full dynticks mode. To solve this, the timer enqueue path raises a self-IPI that will re-evaluate the timer wheel on its IRQ tail. This asynchronous solution avoids potential locking inversion. This is supposed to happen both for local and global timers but commit: b2cf7507e186 ("timers: Always queue timers on the local CPU") broke the global timers case with removing the ->is_idle field handling for the global base. As a result, global timers enqueue may go unnoticed in nohz_full. Fix this with restoring the idle tracking of the global timer's base, allowing self-IPIs again on enqueue time. Fixes: b2cf7507e186 ("timers: Always queue timers on the local CPU") Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240318230729.15497-3-frederic@kernel.org --- kernel/time/timer.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e69e75d3858c..dee29f1f5b75 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -642,7 +642,8 @@ trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) * the base lock: */ if (base->is_idle) { - WARN_ON_ONCE(!(timer->flags & TIMER_PINNED)); + WARN_ON_ONCE(!(timer->flags & TIMER_PINNED || + tick_nohz_full_cpu(base->cpu))); wake_up_nohz_cpu(base->cpu); } } @@ -2292,6 +2293,13 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, */ if (!base_local->is_idle && time_after(nextevt, basej + 1)) { base_local->is_idle = true; + /* + * Global timers queued locally while running in a task + * in nohz_full mode need a self-IPI to kick reprogramming + * in IRQ tail. + */ + if (tick_nohz_full_cpu(base_local->cpu)) + base_global->is_idle = true; trace_timer_base_idle(true, base_local->cpu); } *idle = base_local->is_idle; @@ -2364,6 +2372,8 @@ void timer_clear_idle(void) * path. Required for BASE_LOCAL only. 
*/ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); + if (tick_nohz_full_cpu(smp_processor_id())) + __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); trace_timer_base_idle(false, smp_processor_id()); /* Activate without holding the timer_base->lock */ From c2d953276b8b27459baed1277a4fdd5dd9bd4126 Mon Sep 17 00:00:00 2001 From: Roman Smirnov Date: Tue, 19 Mar 2024 11:13:44 +0300 Subject: [PATCH 286/496] fbmon: prevent division by zero in fb_videomode_from_videomode() The expression htotal * vtotal can have a zero value on overflow. It is necessary to prevent division by zero like in fb_var_to_videomode(). Found by Linux Verification Center (linuxtesting.org) with Svace. Signed-off-by: Roman Smirnov Reviewed-by: Sergey Shtylyov Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbmon.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index 79e5bfbdd34c..0a26399dbc89 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -1311,7 +1311,7 @@ int fb_get_mode(int flags, u32 val, struct fb_var_screeninfo *var, struct fb_inf int fb_videomode_from_videomode(const struct videomode *vm, struct fb_videomode *fbmode) { - unsigned int htotal, vtotal; + unsigned int htotal, vtotal, total; fbmode->xres = vm->hactive; fbmode->left_margin = vm->hback_porch; @@ -1344,8 +1344,9 @@ int fb_videomode_from_videomode(const struct videomode *vm, vtotal = vm->vactive + vm->vfront_porch + vm->vback_porch + vm->vsync_len; /* prevent division by zero */ - if (htotal && vtotal) { - fbmode->refresh = vm->pixelclock / (htotal * vtotal); + total = htotal * vtotal; + if (total) { + fbmode->refresh = vm->pixelclock / total; /* a mode must have htotal and vtotal != 0 or it is invalid */ } else { fbmode->refresh = 0; From f6e922365faf4cd576bd1cf3e64b58c8a32e1856 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Mar 2024 09:54:27 -0700 Subject: [PATCH 287/496] xsk: Don't assume metadata is always requested in TX completion `compl->tx_timestamp != NULL` means that the user has explicitly requested the metadata via XDP_TX_METADATA+XDP_TX_METADATA_TIMESTAMP. Fixes: 48eb03dd2630 ("xsk: Add TX timestamp and TX checksum offload support") Reported-by: Daniele Salvatore Albano Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Tested-by: Daniele Salvatore Albano Link: https://lore.kernel.org/bpf/20240318165427.1403313-1-sdf@google.com --- include/net/xdp_sock.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3cb4dc9bd70e..3d54de168a6d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -188,6 +188,8 @@ static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, { if (!compl) return; + if (!compl->tx_timestamp) + return; *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); } From 5d4e8ae6e57b025802aadf55a4775c55cceb75f1 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Thu, 14 Mar 2024 11:45:21 +1000 Subject: [PATCH 288/496] nouveau/gsp: don't check devinit disable on GSP. GSP should be handling this, and I can see no evidence in the open GPU driver that this register should be touched. This fixes acceleration on 2080 Ti GPUs.
Fixes: 15740541e8f0 ("drm/nouveau/devinit/tu102-: prepare for GSP-RM") Signed-off-by: Dave Airlie Signed-off-by: Danilo Krummrich Link: https://patchwork.freedesktop.org/patch/msgid/20240314014521.2695233-1-airlied@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c index 666eb93b1742..11b4c9c274a1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/r535.c @@ -41,7 +41,6 @@ r535_devinit_new(const struct nvkm_devinit_func *hw, rm->dtor = r535_devinit_dtor; rm->post = hw->post; - rm->disable = hw->disable; ret = nv50_devinit_new_(rm, device, type, inst, pdevinit); if (ret) From 1065da21e5df9d843d2c5165d5d576be000142a6 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 21 Feb 2024 09:16:12 +0800 Subject: [PATCH 289/496] ceph: stop copying to iter at EOF on sync reads If EOF is encountered, ceph_sync_read() return value is adjusted down according to i_size, but the "to" iter is advanced by the actual number of bytes read. Then, when retrying, the remainder of the range may be skipped incorrectly. Ensure that the "to" iter is advanced only until EOF. [ idryomov: changelog ] Fixes: c3d8e0b5de48 ("ceph: return the real size read when it hits EOF") Reported-by: Frank Hsiao Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Tested-by: Frank Hsiao Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index abe8028d95bf..3d1cd079dbf1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1138,7 +1138,12 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } idx = 0; - left = ret > 0 ? ret : 0; + if (ret <= 0) + left = 0; + else if (off + ret > i_size) + left = i_size - off; + else + left = ret; while (left > 0) { size_t plen, copied; @@ -1167,15 +1172,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, } if (ret > 0) { - if (off > *ki_pos) { - if (off >= i_size) { - *retry_op = CHECK_EOF; - ret = i_size - *ki_pos; - *ki_pos = i_size; - } else { - ret = off - *ki_pos; - *ki_pos = off; - } + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; } if (last_objver) From 825b82f6b82aa38dbb771d24e135152012500e51 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 22 Feb 2024 09:22:43 +0800 Subject: [PATCH 290/496] ceph: set correct cap mask for getattr request for read In case of hitting the file EOF, ceph_read_iter() needs to retrieve the file size from MDS, and Fr caps aren't necessary.
[ idryomov: fold into existing retry_op == READ_INLINE branch ] Reported-by: Frank Hsiao Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Tested-by: Frank Hsiao Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d1cd079dbf1..16873d07692f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2129,14 +2129,16 @@ again: int statret; struct page *page = NULL; loff_t i_size; + int mask = CEPH_STAT_CAP_SIZE; if (retry_op == READ_INLINE) { page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; + + mask = CEPH_STAT_CAP_INLINE_DATA; } - statret = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, !!page); + statret = __ceph_do_getattr(inode, page, mask, !!page); if (statret < 0) { if (page) __free_page(page); @@ -2177,7 +2179,7 @@ again: /* hit EOF or hole? */ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { - doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n", + doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n", iocb->ki_pos, i_size); read += ret; From 61456da04602bab779364e0945c57d8a1f1b6219 Mon Sep 17 00:00:00 2001 From: Anthony I Gilea Date: Mon, 18 Mar 2024 16:17:48 +0300 Subject: [PATCH 291/496] ALSA: hda/realtek: Add quirk for HP Spectre x360 14 eu0000 Cirrus amps support for this laptop was added in patch: 33e5e648e631 ("ALSA: hda: cs35l41: Support additional HP Envy Models") This patch adds fixes for wrong pincfgs, wrong DAC selection and mute/micmute LEDs. Signed-off-by: Anthony I Gilea Message-ID: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 39 ++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d463d416fc23..203aa2d600a1 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -7123,6 +7123,38 @@ static void alc_fixup_headset_mic(struct hda_codec *codec, } } +static void alc245_fixup_hp_spectre_x360_eu0xxx(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + /* + * The Pin Complex 0x14 for the treble speakers is wrongly reported as + * unconnected. + * The Pin Complex 0x17 for the bass speakers has the lowest association + * and sequence values so shift it up a bit to squeeze 0x14 in. + */ + static const struct hda_pintbl pincfgs[] = { + { 0x14, 0x90170110 }, // top/treble + { 0x17, 0x90170111 }, // bottom/bass + { } + }; + + /* + * Force DAC 0x02 for the bass speakers 0x17. 
+ */ + static const hda_nid_t conn[] = { 0x02 }; + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + snd_hda_apply_pincfgs(codec, pincfgs); + snd_hda_override_conn_list(codec, 0x17, ARRAY_SIZE(conn), conn); + break; + } + + cs35l41_fixup_i2c_two(codec, fix, action); + alc245_fixup_hp_mute_led_coefbit(codec, fix, action); + alc245_fixup_hp_gpio_led(codec, fix, action); +} + enum { ALC269_FIXUP_GPIO2, @@ -7402,6 +7434,7 @@ enum { ALC245_FIXUP_CS35L56_SPI_4_HP_GPIO_LED, ALC256_FIXUP_ACER_SFG16_MICMUTE_LED, ALC256_FIXUP_HEADPHONE_AMP_VOL, + ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -9605,6 +9638,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc256_decrease_headphone_amp_val, }, + [ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc245_fixup_hp_spectre_x360_eu0xxx, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -9968,7 +10005,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8be8, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8be9, "HP Envy 15", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8bf0, "HP", ALC236_FIXUP_HP_GPIO_LED), - SND_PCI_QUIRK(0x103c, 0x8c15, "HP Spectre 14", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8c15, "HP Spectre x360 2-in-1 Laptop 14-eu0xxx", ALC245_FIXUP_HP_SPECTRE_X360_EU0XXX), SND_PCI_QUIRK(0x103c, 0x8c16, "HP Spectre 16", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8c17, "HP Spectre 16", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8c46, "HP EliteBook 830 G11", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From 55e565c42dce81a4e49c13262d5bc4eb4c2e588a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 18 Mar 2024 18:35:06 +0100 Subject: [PATCH 292/496] dm-integrity: fix a memory leak when rechecking the data Memory for the "checksums" pointer will leak if the data is rechecked after checksum failure (because the associated kfree won't happen due to 'goto skip_io'). Fix this by freeing the checksums memory before recheck, and just use the "checksum_onstack" memory for storing checksum during recheck. Fixes: c88f5e553fe3 ("dm-integrity: recheck the integrity tag after a failure") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index d822ab2f739b..3329e1e93524 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1848,12 +1848,12 @@ again: r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE); if (unlikely(r)) { - if (r > 0) { - integrity_recheck(dio, checksums); - goto skip_io; - } if (likely(checksums != checksums_onstack)) kfree(checksums); + if (r > 0) { + integrity_recheck(dio, checksums_onstack); + goto skip_io; + } goto error; } From 2ff0573e7aff5129d73ec5c3159cd84d862cb1cc Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 19 Mar 2024 13:33:42 -0500 Subject: [PATCH 293/496] spi: docs: spidev: fix echo command format The two example echo commands for binding the spidev driver were being rendered as one line in the HTML output. This patch makes use of the restructured text :: to format the commands as a code block instead which preserves the line break. 
Signed-off-by: David Lechner Link: https://msgid.link/r/20240319183344.2106335-1-dlechner@baylibre.com Signed-off-by: Mark Brown --- Documentation/spi/spidev.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/spi/spidev.rst b/Documentation/spi/spidev.rst index 369c657ba435..e08b301ad24a 100644 --- a/Documentation/spi/spidev.rst +++ b/Documentation/spi/spidev.rst @@ -61,7 +61,7 @@ the spidev driver failing to probe. Sysfs also supports userspace driven binding/unbinding of drivers to devices that do not bind automatically using one of the tables above. -To make the spidev driver bind to such a device, use the following: +To make the spidev driver bind to such a device, use the following:: echo spidev > /sys/bus/spi/devices/spiB.C/driver_override echo spiB.C > /sys/bus/spi/drivers/spidev/bind From 1d63d1d9e5c5cb2e7c7ca75751a5eaf67c5623a7 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 18 Mar 2024 15:35:04 +0000 Subject: [PATCH 294/496] perf: starfive: fix 64-bit only COMPILE_TEST condition ARCH_STARFIVE is not restricted to 64-bit platforms, so while Will's addition of a 64-bit only condition satisfied the build robots doing COMPILE_TEST builds, Palmer ran into the same problems with writeq() being undefined during regular rv32 builds. Promote the dependency on 64-bit to its own `depends on` so that the driver can never be included in 32-bit builds. Reported-by: Palmer Dabbelt Fixes: c2b24812f7bc ("perf: starfive: Add StarLink PMU support") Fixes: f0dbc6d0de38 ("perf: starfive: Only allow COMPILE_TEST for 64-bit architectures") Signed-off-by: Conor Dooley Acked-by: Will Deacon Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Acked-by: Ji Sheng Teoh Acked-by: Emil Renner Berthing Link: https://lore.kernel.org/r/20240318-emphatic-rally-f177a4fe1bdc@spud Signed-off-by: Catalin Marinas --- drivers/perf/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 004d86230aa6..54ff5cc17ccd 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -87,7 +87,8 @@ config RISCV_PMU_SBI filtering, counter configuration. config STARFIVE_STARLINK_PMU - depends on ARCH_STARFIVE || (COMPILE_TEST && 64BIT) + depends on ARCH_STARFIVE || COMPILE_TEST + depends on 64BIT bool "StarFive StarLink PMU" help Provide support for StarLink Performance Monitor Unit. From 6be7ee4bebd14b8e7e040a5e7fd6aec3d9167c72 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Wed, 3 Jan 2024 13:00:19 -0300 Subject: [PATCH 295/496] riscv: Improve arch_get_mmap_end() macro This macro caused me some confusion, which took some reviewer's time to make it clear, so I propose adding a short comment in code to avoid confusion in the future. Also, added some improvements to the macro, such as removing the assumption of VA_USER_SV57 being the largest address space. 
Signed-off-by: Leonardo Bras Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20240103160024.70305-3-leobras@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index f19f861cda54..2278e2a8362a 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -18,15 +18,21 @@ #define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1)) #define STACK_TOP_MAX TASK_SIZE_64 +/* + * addr is a hint to the maximum userspace address that mmap should provide, so + * this macro needs to return the largest address space available so that + * mmap_end < addr, being mmap_end the top of that address space. + * See Documentation/arch/riscv/vm-layout.rst for more details. + */ #define arch_get_mmap_end(addr, len, flags) \ ({ \ unsigned long mmap_end; \ typeof(addr) _addr = (addr); \ if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ mmap_end = STACK_TOP_MAX; \ - else if ((_addr) >= VA_USER_SV57) \ - mmap_end = STACK_TOP_MAX; \ - else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \ + else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \ + mmap_end = VA_USER_SV57; \ + else if (((_addr) >= VA_USER_SV48) && (VA_BITS >= VA_BITS_SV48)) \ mmap_end = VA_USER_SV48; \ else \ mmap_end = VA_USER_SV39; \ From 9dc30419248f78dfebea7a554ec212dd1d82f8d7 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Wed, 3 Jan 2024 13:00:20 -0300 Subject: [PATCH 296/496] riscv: Replace direct thread flag check with is_compat_task() There is some code that detects compat mode into a task by checking the flag directly, and other code that check using the helper is_compat_task(). Since the helper already exists, use it instead of checking the flags directly. Signed-off-by: Leonardo Bras Link: https://lore.kernel.org/r/20240103160024.70305-4-leobras@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/elf.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 06c236bfab53..59a08367fddd 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -54,7 +54,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); #ifdef CONFIG_64BIT #ifdef CONFIG_COMPAT -#define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ +#define STACK_RND_MASK (is_compat_task() ? \ 0x7ff >> (PAGE_SHIFT - 12) : \ 0x3ffff >> (PAGE_SHIFT - 12)) #else diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 294044429e8e..9a7a92edcb46 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -882,7 +882,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) #ifdef CONFIG_COMPAT #define TASK_SIZE_32 (_AC(0x80000000, UL) - PAGE_SIZE) -#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ +#define TASK_SIZE (is_compat_task() ? \ TASK_SIZE_32 : TASK_SIZE_64) #else #define TASK_SIZE TASK_SIZE_64 From 4c0b5a451675e9a95be98a16ddb889bb0486d2ad Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Wed, 3 Jan 2024 13:00:21 -0300 Subject: [PATCH 297/496] riscv: add compile-time test into is_compat_task() Currently several places will test for CONFIG_COMPAT before testing is_compat_task(), probably in order to avoid a run-time test into the task structure. 
Since is_compat_task() is an inlined function, it would be helpful to add a compile-time test of CONFIG_COMPAT, making sure it always returns zero when the option is not enabled during the kernel build. With this, the compiler is able to understand in build-time that is_compat_task() will always return 0, and optimize-out some of the extra code introduced by the option. This will also allow removing a lot #ifdefs that were introduced, and make the code more clean. Signed-off-by: Leonardo Bras Reviewed-by: Guo Ren Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240103160024.70305-5-leobras@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/compat.h | 3 +++ arch/riscv/include/asm/elf.h | 4 ---- arch/riscv/include/asm/pgtable.h | 6 ------ arch/riscv/include/asm/processor.h | 4 ++-- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/compat.h b/arch/riscv/include/asm/compat.h index 2ac955b51148..91517b51b8e2 100644 --- a/arch/riscv/include/asm/compat.h +++ b/arch/riscv/include/asm/compat.h @@ -14,6 +14,9 @@ static inline int is_compat_task(void) { + if (!IS_ENABLED(CONFIG_COMPAT)) + return 0; + return test_thread_flag(TIF_32BIT); } diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 59a08367fddd..2e88257cafae 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -53,13 +53,9 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); #define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2) #ifdef CONFIG_64BIT -#ifdef CONFIG_COMPAT #define STACK_RND_MASK (is_compat_task() ? \ 0x7ff >> (PAGE_SHIFT - 12) : \ 0x3ffff >> (PAGE_SHIFT - 12)) -#else -#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) -#endif #endif /* diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 9a7a92edcb46..f5b504265d76 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -127,16 +127,10 @@ #define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1)) #define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1)) -#ifdef CONFIG_COMPAT #define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS) #define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39) #define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64) #define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64) -#else -#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? 
VA_BITS_SV48 : VA_BITS) -#define MMAP_MIN_VA_BITS (VA_BITS_SV39) -#endif /* CONFIG_COMPAT */ - #else #include #endif /* CONFIG_64BIT */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 2278e2a8362a..d2d7ce30baf3 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -28,7 +28,7 @@ ({ \ unsigned long mmap_end; \ typeof(addr) _addr = (addr); \ - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + if ((_addr) == 0 || is_compat_task()) \ mmap_end = STACK_TOP_MAX; \ else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \ mmap_end = VA_USER_SV57; \ @@ -45,7 +45,7 @@ typeof(addr) _addr = (addr); \ typeof(base) _base = (base); \ unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \ - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + if ((_addr) == 0 || is_compat_task()) \ mmap_base = (_base); \ else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \ mmap_base = VA_USER_SV57 - rnd_gap; \ From 5917ea17ad07f35bb5be4fd5fdcd408f090e347b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Wed, 3 Jan 2024 13:00:22 -0300 Subject: [PATCH 298/496] riscv: Introduce is_compat_thread() into compat.h task_user_regset_view() makes use of a function very similar to is_compat_task(), but pointing to a any thread. In arm64 asm/compat.h there is a function very similar to that: is_compat_thread(struct thread_info *thread) Copy this function to riscv asm/compat.h and make use of it into task_user_regset_view(). Also, introduce a compile-time test for CONFIG_COMPAT and simplify the function code by removing the #ifdef. Signed-off-by: Leonardo Bras Reviewed-by: Guo Ren Reviewed-by: Andy Chiu Link: https://lore.kernel.org/r/20240103160024.70305-6-leobras@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/compat.h | 8 ++++++++ arch/riscv/kernel/ptrace.c | 6 +++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/compat.h b/arch/riscv/include/asm/compat.h index 91517b51b8e2..da4b28cd01a9 100644 --- a/arch/riscv/include/asm/compat.h +++ b/arch/riscv/include/asm/compat.h @@ -20,6 +20,14 @@ static inline int is_compat_task(void) return test_thread_flag(TIF_32BIT); } +static inline int is_compat_thread(struct thread_info *thread) +{ + if (!IS_ENABLED(CONFIG_COMPAT)) + return 0; + + return test_ti_thread_flag(thread, TIF_32BIT); +} + struct compat_user_regs_struct { compat_ulong_t pc; compat_ulong_t ra; diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 2afe460de16a..f36283212361 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -374,14 +374,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, return ret; } +#else +static const struct user_regset_view compat_riscv_user_native_view = {}; #endif /* CONFIG_COMPAT */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (is_compat_thread(&task->thread_info)) return &compat_riscv_user_native_view; else -#endif return &riscv_user_native_view; } From 2a8986fc5e1cb686dd3aae3022459aea23b9823a Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Wed, 3 Jan 2024 13:00:23 -0300 Subject: [PATCH 299/496] riscv: Introduce set_compat_task() in asm/compat.h In order to have all task compat bit access directly in compat.h, introduce set_compat_task() to set/reset those when needed. 
Also, since it's only used on an if/else scenario, simplify the macro using it. Signed-off-by: Leonardo Bras Reviewed-by: Guo Ren Link: https://lore.kernel.org/r/20240103160024.70305-7-leobras@redhat.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/compat.h | 8 ++++++++ arch/riscv/include/asm/elf.h | 5 +---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/compat.h b/arch/riscv/include/asm/compat.h index da4b28cd01a9..aa103530a5c8 100644 --- a/arch/riscv/include/asm/compat.h +++ b/arch/riscv/include/asm/compat.h @@ -28,6 +28,14 @@ static inline int is_compat_thread(struct thread_info *thread) return test_ti_thread_flag(thread, TIF_32BIT); } +static inline void set_compat_task(bool is_compat) +{ + if (is_compat) + set_thread_flag(TIF_32BIT); + else + clear_thread_flag(TIF_32BIT); +} + struct compat_user_regs_struct { compat_ulong_t pc; compat_ulong_t ra; diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 2e88257cafae..c7aea7886d22 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -135,10 +135,7 @@ do { \ #ifdef CONFIG_COMPAT #define SET_PERSONALITY(ex) \ -do { if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - set_thread_flag(TIF_32BIT); \ - else \ - clear_thread_flag(TIF_32BIT); \ +do { set_compat_task((ex).e_ident[EI_CLASS] == ELFCLASS32); \ if (personality(current->personality) != PER_LINUX32) \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ From 6649182a383c9872e9543e2e7d4981d971bf0998 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 18 Jan 2024 11:59:28 +0530 Subject: [PATCH 300/496] cpuidle: RISC-V: Move few functions to arch/riscv To support ACPI Low Power Idle (LPI), few functions are required which are currently static functions in the DT based cpuidle driver. Hence, move them under arch/riscv so that ACPI driver also can use them. Since they are no longer static functions, append "riscv_" prefix to the function name. Signed-off-by: Sunil V L Reviewed-by: Andrew Jones Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20240118062930.245937-2-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/suspend.h | 3 ++ arch/riscv/kernel/suspend.c | 49 +++++++++++++++++++++++++++++ drivers/cpuidle/cpuidle-riscv-sbi.c | 49 +++-------------------------- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/arch/riscv/include/asm/suspend.h b/arch/riscv/include/asm/suspend.h index 02f87867389a..076f8a9437cf 100644 --- a/arch/riscv/include/asm/suspend.h +++ b/arch/riscv/include/asm/suspend.h @@ -55,4 +55,7 @@ int hibernate_resume_nonboot_cpu_disable(void); asmlinkage void hibernate_restore_image(unsigned long resume_satp, unsigned long satp_temp, unsigned long cpu_resume); asmlinkage int hibernate_core_restore_code(void); +bool riscv_sbi_hsm_is_supported(void); +bool riscv_sbi_suspend_state_is_valid(u32 state); +int riscv_sbi_hart_suspend(u32 state); #endif diff --git a/arch/riscv/kernel/suspend.c b/arch/riscv/kernel/suspend.c index 239509367e42..b20f2cb5879f 100644 --- a/arch/riscv/kernel/suspend.c +++ b/arch/riscv/kernel/suspend.c @@ -128,4 +128,53 @@ static int __init sbi_system_suspend_init(void) } arch_initcall(sbi_system_suspend_init); + +static int sbi_suspend_finisher(unsigned long suspend_type, + unsigned long resume_addr, + unsigned long opaque) +{ + struct sbiret ret; + + ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, + suspend_type, resume_addr, opaque, 0, 0, 0); + + return (ret.error) ? 
sbi_err_map_linux_errno(ret.error) : 0; +} + +int riscv_sbi_hart_suspend(u32 state) +{ + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return cpu_suspend(state, sbi_suspend_finisher); + else + return sbi_suspend_finisher(state, 0, 0); +} + +bool riscv_sbi_suspend_state_is_valid(u32 state) +{ + if (state > SBI_HSM_SUSPEND_RET_DEFAULT && + state < SBI_HSM_SUSPEND_RET_PLATFORM) + return false; + + if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && + state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) + return false; + + return true; +} + +bool riscv_sbi_hsm_is_supported(void) +{ + /* + * The SBI HSM suspend function is only available when: + * 1) SBI version is 0.3 or higher + * 2) SBI HSM extension is available + */ + if (sbi_spec_version < sbi_mk_version(0, 3) || + !sbi_probe_extension(SBI_EXT_HSM)) { + pr_info("HSM suspend not available\n"); + return false; + } + + return true; +} #endif /* CONFIG_RISCV_SBI */ diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index e8094fc92491..a6e123dfe394 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -73,26 +73,6 @@ static inline bool sbi_is_domain_state_available(void) return data->available; } -static int sbi_suspend_finisher(unsigned long suspend_type, - unsigned long resume_addr, - unsigned long opaque) -{ - struct sbiret ret; - - ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, - suspend_type, resume_addr, opaque, 0, 0, 0); - - return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0; -} - -static int sbi_suspend(u32 state) -{ - if (state & SBI_HSM_SUSP_NON_RET_BIT) - return cpu_suspend(state, sbi_suspend_finisher); - else - return sbi_suspend_finisher(state, 0, 0); -} - static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, int idx) { @@ -100,9 +80,9 @@ static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev, u32 state = states[idx]; if (state & SBI_HSM_SUSP_NON_RET_BIT) - return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state); + return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend, idx, state); else - return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend, + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend, idx, state); } @@ -133,7 +113,7 @@ static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev, else state = states[idx]; - ret = sbi_suspend(state) ? -1 : idx; + ret = riscv_sbi_hart_suspend(state) ? 
-1 : idx; ct_cpuidle_exit(); @@ -206,17 +186,6 @@ static const struct of_device_id sbi_cpuidle_state_match[] = { { }, }; -static bool sbi_suspend_state_is_valid(u32 state) -{ - if (state > SBI_HSM_SUSPEND_RET_DEFAULT && - state < SBI_HSM_SUSPEND_RET_PLATFORM) - return false; - if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT && - state < SBI_HSM_SUSPEND_NON_RET_PLATFORM) - return false; - return true; -} - static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) { int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state); @@ -226,7 +195,7 @@ static int sbi_dt_parse_state_node(struct device_node *np, u32 *state) return err; } - if (!sbi_suspend_state_is_valid(*state)) { + if (!riscv_sbi_suspend_state_is_valid(*state)) { pr_warn("Invalid SBI suspend state %#x\n", *state); return -EINVAL; } @@ -607,16 +576,8 @@ static int __init sbi_cpuidle_init(void) int ret; struct platform_device *pdev; - /* - * The SBI HSM suspend function is only available when: - * 1) SBI version is 0.3 or higher - * 2) SBI HSM extension is available - */ - if ((sbi_spec_version < sbi_mk_version(0, 3)) || - !sbi_probe_extension(SBI_EXT_HSM)) { - pr_info("HSM suspend not available\n"); + if (!riscv_sbi_hsm_is_supported()) return 0; - } ret = platform_driver_register(&sbi_cpuidle_driver); if (ret) From 4877fc92142f635be418d8c915eb48ef87681108 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 18 Jan 2024 11:59:29 +0530 Subject: [PATCH 301/496] ACPI: RISC-V: Add LPI driver Enable Low Power Idle (LPI) based cpuidle driver for RISC-V platforms. It depends on SBI HSM calls for idle state transitions. Signed-off-by: Sunil V L Reviewed-by: Andrew Jones Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20240118062930.245937-3-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/acpi/riscv/Makefile | 3 +- drivers/acpi/riscv/cpuidle.c | 81 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 drivers/acpi/riscv/cpuidle.c diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index 8b3b126e0b94..7309d92dd477 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += rhct.o +obj-y += rhct.o +obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o diff --git a/drivers/acpi/riscv/cpuidle.c b/drivers/acpi/riscv/cpuidle.c new file mode 100644 index 000000000000..624f9bbdb58c --- /dev/null +++ b/drivers/acpi/riscv/cpuidle.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024, Ventana Micro Systems Inc + * Author: Sunil V L + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define RISCV_FFH_LPI_TYPE_MASK GENMASK_ULL(63, 60) +#define RISCV_FFH_LPI_RSVD_MASK GENMASK_ULL(59, 32) + +#define RISCV_FFH_LPI_TYPE_SBI BIT_ULL(60) + +static int acpi_cpu_init_idle(unsigned int cpu) +{ + int i; + struct acpi_lpi_state *lpi; + struct acpi_processor *pr = per_cpu(processors, cpu); + + if (unlikely(!pr || !pr->flags.has_lpi)) + return -EINVAL; + + if (!riscv_sbi_hsm_is_supported()) + return -ENODEV; + + if (pr->power.count <= 1) + return -ENODEV; + + for (i = 1; i < pr->power.count; i++) { + u32 state; + + lpi = &pr->power.lpi_states[i]; + + /* + * Validate Entry Method as per FFH spec. 
+ * bits[63:60] should be 0x1 + * bits[59:32] should be 0x0 + * bits[31:0] represent a SBI power_state + */ + if (((lpi->address & RISCV_FFH_LPI_TYPE_MASK) != RISCV_FFH_LPI_TYPE_SBI) || + (lpi->address & RISCV_FFH_LPI_RSVD_MASK)) { + pr_warn("Invalid LPI entry method %#llx\n", lpi->address); + return -EINVAL; + } + + state = lpi->address; + if (!riscv_sbi_suspend_state_is_valid(state)) { + pr_warn("Invalid SBI power state %#x\n", state); + return -EINVAL; + } + } + + return 0; +} + +int acpi_processor_ffh_lpi_probe(unsigned int cpu) +{ + return acpi_cpu_init_idle(cpu); +} + +int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) +{ + u32 state = lpi->address; + + if (state & SBI_HSM_SUSP_NON_RET_BIT) + return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend, + lpi->index, + state); + else + return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend, + lpi->index, + state); +} From 359df7c5be4ba5c57f641010be7237ad9f09ea53 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 18 Jan 2024 11:59:30 +0530 Subject: [PATCH 302/496] ACPI: Enable ACPI_PROCESSOR for RISC-V The ACPI processor driver is not currently enabled for RISC-V. This is required to enable CPU related functionalities like LPI and CPPC. Hence, enable ACPI_PROCESSOR for RISC-V. Signed-off-by: Sunil V L Reviewed-by: Andrew Jones Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20240118062930.245937-4-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/acpi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 3c3f8037ebed..1606eb622a9f 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -286,7 +286,7 @@ config ACPI_CPPC_LIB config ACPI_PROCESSOR tristate "Processor" - depends on X86 || ARM64 || LOONGARCH + depends on X86 || ARM64 || LOONGARCH || RISCV select ACPI_PROCESSOR_IDLE select ACPI_CPU_FREQ_PSS if X86 || LOONGARCH select THERMAL From 30f3ffbee86b576705aabdd9093165a49cd66011 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 8 Feb 2024 09:14:12 +0530 Subject: [PATCH 303/496] ACPI: RISC-V: Add CPPC driver Add cpufreq driver based on ACPI CPPC for RISC-V. The driver uses either SBI CPPC interfaces or the CSRs to access the CPPC registers as defined by the RISC-V FFH spec. Signed-off-by: Sunil V L Reviewed-by: Pierre Gondois Acked-by: Rafael J. Wysocki Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20240208034414.22579-2-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/acpi/riscv/Makefile | 1 + drivers/acpi/riscv/cppc.c | 157 ++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 drivers/acpi/riscv/cppc.c diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index 7309d92dd477..86b0925f612d 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += rhct.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o diff --git a/drivers/acpi/riscv/cppc.c b/drivers/acpi/riscv/cppc.c new file mode 100644 index 000000000000..4cdff387deff --- /dev/null +++ b/drivers/acpi/riscv/cppc.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Implement CPPC FFH helper routines for RISC-V. + * + * Copyright (C) 2024 Ventana Micro Systems Inc. 
+ */ + +#include +#include +#include + +#define SBI_EXT_CPPC 0x43505043 + +/* CPPC interfaces defined in SBI spec */ +#define SBI_CPPC_PROBE 0x0 +#define SBI_CPPC_READ 0x1 +#define SBI_CPPC_READ_HI 0x2 +#define SBI_CPPC_WRITE 0x3 + +/* RISC-V FFH definitions from RISC-V FFH spec */ +#define FFH_CPPC_TYPE(r) (((r) & GENMASK_ULL(63, 60)) >> 60) +#define FFH_CPPC_SBI_REG(r) ((r) & GENMASK(31, 0)) +#define FFH_CPPC_CSR_NUM(r) ((r) & GENMASK(11, 0)) + +#define FFH_CPPC_SBI 0x1 +#define FFH_CPPC_CSR 0x2 + +struct sbi_cppc_data { + u64 val; + u32 reg; + struct sbiret ret; +}; + +static bool cppc_ext_present; + +static int __init sbi_cppc_init(void) +{ + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_CPPC) > 0) { + pr_info("SBI CPPC extension detected\n"); + cppc_ext_present = true; + } else { + pr_info("SBI CPPC extension NOT detected!!\n"); + cppc_ext_present = false; + } + + return 0; +} +device_initcall(sbi_cppc_init); + +static void sbi_cppc_read(void *read_data) +{ + struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data; + + data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_READ, + data->reg, 0, 0, 0, 0, 0); +} + +static void sbi_cppc_write(void *write_data) +{ + struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data; + + data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_WRITE, + data->reg, data->val, 0, 0, 0, 0); +} + +static void cppc_ffh_csr_read(void *read_data) +{ + struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data; + + switch (data->reg) { + /* Support only TIME CSR for now */ + case CSR_TIME: + data->ret.value = csr_read(CSR_TIME); + data->ret.error = 0; + break; + default: + data->ret.error = -EINVAL; + break; + } +} + +static void cppc_ffh_csr_write(void *write_data) +{ + struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data; + + data->ret.error = -EINVAL; +} + +/* + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. + */ +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + struct sbi_cppc_data data; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; + + if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) { + if (!cppc_ext_present) + return -EINVAL; + + data.reg = FFH_CPPC_SBI_REG(reg->address); + + smp_call_function_single(cpu, sbi_cppc_read, &data, 1); + + *val = data.ret.value; + + return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0; + } else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) { + data.reg = FFH_CPPC_CSR_NUM(reg->address); + + smp_call_function_single(cpu, cppc_ffh_csr_read, &data, 1); + + *val = data.ret.value; + + return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0; + } + + return -EINVAL; +} + +int cpc_write_ffh(int cpu, struct cpc_reg *reg, u64 val) +{ + struct sbi_cppc_data data; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; + + if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) { + if (!cppc_ext_present) + return -EINVAL; + + data.reg = FFH_CPPC_SBI_REG(reg->address); + data.val = val; + + smp_call_function_single(cpu, sbi_cppc_write, &data, 1); + + return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0; + } else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) { + data.reg = FFH_CPPC_CSR_NUM(reg->address); + data.val = val; + + smp_call_function_single(cpu, cppc_ffh_csr_write, &data, 1); + + return (data.ret.error) ? 
sbi_err_map_linux_errno(data.ret.error) : 0; + } + + return -EINVAL; +} From 7ee1378736f09fceef95d2c9122d2cff14a375da Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 8 Feb 2024 09:14:13 +0530 Subject: [PATCH 304/496] cpufreq: Move CPPC configs to common Kconfig and add RISC-V CPPC related config options are currently defined only in ARM specific file. However, they are required for RISC-V as well. Instead of creating a new Kconfig.riscv file and duplicating them, move them to the common Kconfig file and enable RISC-V too. Signed-off-by: Sunil V L Acked-by: Viresh Kumar Reviewed-by: Pierre Gondois Acked-by: Rafael J. Wysocki Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20240208034414.22579-3-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- drivers/cpufreq/Kconfig | 29 +++++++++++++++++++++++++++++ drivers/cpufreq/Kconfig.arm | 26 -------------------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 35efb53d5492..94e55c40970a 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -302,4 +302,33 @@ config QORIQ_CPUFREQ which are capable of changing the CPU's frequency dynamically. endif + +config ACPI_CPPC_CPUFREQ + tristate "CPUFreq driver based on the ACPI CPPC spec" + depends on ACPI_PROCESSOR + depends on ARM || ARM64 || RISCV + select ACPI_CPPC_LIB + help + This adds a CPUFreq driver which uses CPPC methods + as described in the ACPIv5.1 spec. CPPC stands for + Collaborative Processor Performance Controls. It + is based on an abstract continuous scale of CPU + performance values which allows the remote power + processor to flexibly optimize for power and + performance. CPPC relies on power management firmware + support for its operation. + + If in doubt, say N. + +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + depends on ARM || ARM64 || RISCV + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + endmenu diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index f911606897b8..987b3d900a89 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -3,32 +3,6 @@ # ARM CPU Frequency scaling drivers # -config ACPI_CPPC_CPUFREQ - tristate "CPUFreq driver based on the ACPI CPPC spec" - depends on ACPI_PROCESSOR - select ACPI_CPPC_LIB - help - This adds a CPUFreq driver which uses CPPC methods - as described in the ACPIv5.1 spec. CPPC stands for - Collaborative Processor Performance Controls. It - is based on an abstract continuous scale of CPU - performance values which allows the remote power - processor to flexibly optimize for power and - performance. CPPC relies on power management firmware - support for its operation. - - If in doubt, say N. - -config ACPI_CPPC_CPUFREQ_FIE - bool "Frequency Invariance support for CPPC cpufreq driver" - depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY - default y - help - This extends frequency invariance support in the CPPC cpufreq driver, - by using CPPC delivered and reference performance counters. - - If in doubt, say N. 
- config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM tristate "Allwinner nvmem based SUN50I CPUFreq driver" depends on ARCH_SUNXI From 282b9df4e9603bbb5c9cbf3ea60bc393287e8a2f Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Thu, 8 Feb 2024 09:14:14 +0530 Subject: [PATCH 305/496] RISC-V: defconfig: Enable CONFIG_ACPI_CPPC_CPUFREQ CONFIG_ACPI_CPPC_CPUFREQ is required to enable CPPC for RISC-V. Signed-off-by: Sunil V L Reviewed-by: Pierre Gondois Acked-by: Rafael J. Wysocki Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20240208034414.22579-4-sunilvl@ventanamicro.com Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index eaf34e871e30..2988ecd3eb4d 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -44,6 +44,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m CONFIG_CPUFREQ_DT=y +CONFIG_ACPI_CPPC_CPUFREQ=m CONFIG_VIRTUALIZATION=y CONFIG_KVM=m CONFIG_ACPI=y From 89f4fd7b1ab7733d9d817e7123c58996ca38ae98 Mon Sep 17 00:00:00 2001 From: Eric Chan Date: Sat, 17 Feb 2024 13:12:49 +0000 Subject: [PATCH 306/496] riscv/barrier: Define __{mb,rmb,wmb} Introduce __{mb,rmb,wmb}, and rely on the generic definitions for {mb,rmb,wmb}. Although KCSAN is not supported yet, the definitions can be made more consistent with generic instrumentation. Also add a space to make the changes pass check by checkpatch.pl. Without the space, the error message is as below: ERROR: space required after that ',' (ctx:VxV) 26: FILE: arch/riscv/include/asm/barrier.h:23: +#define __mb() RISCV_FENCE(iorw,iorw) ^ Signed-off-by: Eric Chan Reviewed-by: Andrea Parri Reviewed-by: Samuel Holland Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240217131249.3668103-1-ericchancf@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/barrier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 110752594228..173b44a989f8 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -20,9 +20,9 @@ __asm__ __volatile__ ("fence " #p "," #s : : : "memory") /* These barriers need to enforce ordering on both devices or memory. */ -#define mb() RISCV_FENCE(iorw,iorw) -#define rmb() RISCV_FENCE(ir,ir) -#define wmb() RISCV_FENCE(ow,ow) +#define __mb() RISCV_FENCE(iorw, iorw) +#define __rmb() RISCV_FENCE(ir, ir) +#define __wmb() RISCV_FENCE(ow, ow) /* These barriers do not need to enforce ordering on devices, just memory. */ #define __smp_mb() RISCV_FENCE(rw,rw) From b3c8064ccc447be45a3bdc2c4a9ea0491f011920 Mon Sep 17 00:00:00 2001 From: Eric Chan Date: Sat, 17 Feb 2024 13:13:02 +0000 Subject: [PATCH 307/496] riscv/barrier: Define RISCV_FULL_BARRIER Introduce RISCV_FULL_BARRIER and use in arch_atomic* function. like RISCV_ACQUIRE_BARRIER and RISCV_RELEASE_BARRIER, the fence instruction can be eliminated When SMP is not enabled. 
Signed-off-by: Eric Chan Reviewed-by: Andrea Parri Reviewed-by: Samuel Holland Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240217131302.3668481-1-ericchancf@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/atomic.h | 16 ++++++++-------- arch/riscv/include/asm/cmpxchg.h | 4 ++-- arch/riscv/include/asm/fence.h | 2 ++ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index f5dfef6c2153..31e6e2e7cc18 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -207,7 +207,7 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int " add %[rc], %[p], %[a]\n" " sc.w.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : [a]"r" (a), [u]"r" (u) @@ -228,7 +228,7 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a, " add %[rc], %[p], %[a]\n" " sc.d.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : [a]"r" (a), [u]"r" (u) @@ -248,7 +248,7 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v) " addi %[rc], %[p], 1\n" " sc.w.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -268,7 +268,7 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v) " addi %[rc], %[p], -1\n" " sc.w.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -288,7 +288,7 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v) " bltz %[rc], 1f\n" " sc.w.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -310,7 +310,7 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v) " addi %[rc], %[p], 1\n" " sc.d.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -331,7 +331,7 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v) " addi %[rc], %[p], -1\n" " sc.d.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : @@ -352,7 +352,7 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) " bltz %[rc], 1f\n" " sc.d.rl %[rc], %[rc], %[c]\n" " bnez %[rc], 0b\n" - " fence rw, rw\n" + RISCV_FULL_BARRIER "1:\n" : [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter) : diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 2f4726d3cfcc..a608e4d1a0a4 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -313,7 +313,7 @@ " bne %0, %z3, 1f\n" \ " sc.w.rl %1, %z4, %2\n" \ " bnez %1, 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ : "rJ" ((long)__old), "rJ" (__new) \ @@ -325,7 +325,7 @@ " bne %0, %z3, 1f\n" \ " sc.d.rl %1, %z4, %2\n" \ " bnez %1, 0b\n" \ - " fence rw, rw\n" \ + RISCV_FULL_BARRIER \ "1:\n" \ : "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr) \ : "rJ" (__old), "rJ" (__new) \ diff --git a/arch/riscv/include/asm/fence.h 
b/arch/riscv/include/asm/fence.h index 2b443a3a487f..6c26c44dfcd6 100644 --- a/arch/riscv/include/asm/fence.h +++ b/arch/riscv/include/asm/fence.h @@ -4,9 +4,11 @@ #ifdef CONFIG_SMP #define RISCV_ACQUIRE_BARRIER "\tfence r , rw\n" #define RISCV_RELEASE_BARRIER "\tfence rw, w\n" +#define RISCV_FULL_BARRIER "\tfence rw, rw\n" #else #define RISCV_ACQUIRE_BARRIER #define RISCV_RELEASE_BARRIER +#define RISCV_FULL_BARRIER #endif #endif /* _ASM_RISCV_FENCE_H */ From c85688e2b0f0afbce7ea3cd8c47f2be67c09b9f4 Mon Sep 17 00:00:00 2001 From: Eric Chan Date: Sat, 17 Feb 2024 13:13:16 +0000 Subject: [PATCH 308/496] riscv/barrier: Consolidate fence definitions Disparate fence implementations are consolidated into fence.h. Also introduce RISCV_FENCE_ASM to make fence macro more reusable. Signed-off-by: Eric Chan Reviewed-by: Andrea Parri Reviewed-by: Samuel Holland Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240217131316.3668927-1-ericchancf@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/atomic.h | 1 - arch/riscv/include/asm/barrier.h | 3 +-- arch/riscv/include/asm/cmpxchg.h | 1 - arch/riscv/include/asm/fence.h | 10 +++++++--- arch/riscv/include/asm/io.h | 8 ++++---- arch/riscv/include/asm/mmio.h | 5 +++-- arch/riscv/include/asm/mmiowb.h | 2 +- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h index 31e6e2e7cc18..0e0522e588ca 100644 --- a/arch/riscv/include/asm/atomic.h +++ b/arch/riscv/include/asm/atomic.h @@ -17,7 +17,6 @@ #endif #include -#include #define __atomic_acquire_fence() \ __asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory") diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 173b44a989f8..15857dbc2279 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -11,13 +11,12 @@ #define _ASM_RISCV_BARRIER_H #ifndef __ASSEMBLY__ +#include #define nop() __asm__ __volatile__ ("nop") #define __nops(n) ".rept " #n "\nnop\n.endr\n" #define nops(n) __asm__ __volatile__ (__nops(n)) -#define RISCV_FENCE(p, s) \ - __asm__ __volatile__ ("fence " #p "," #s : : : "memory") /* These barriers need to enforce ordering on both devices or memory. 
*/ #define __mb() RISCV_FENCE(iorw, iorw) diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index a608e4d1a0a4..2fee65cc8443 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -8,7 +8,6 @@ #include -#include #include #define __xchg_relaxed(ptr, new, size) \ diff --git a/arch/riscv/include/asm/fence.h b/arch/riscv/include/asm/fence.h index 6c26c44dfcd6..6bcd80325dfc 100644 --- a/arch/riscv/include/asm/fence.h +++ b/arch/riscv/include/asm/fence.h @@ -1,10 +1,14 @@ #ifndef _ASM_RISCV_FENCE_H #define _ASM_RISCV_FENCE_H +#define RISCV_FENCE_ASM(p, s) "\tfence " #p "," #s "\n" +#define RISCV_FENCE(p, s) \ + ({ __asm__ __volatile__ (RISCV_FENCE_ASM(p, s) : : : "memory"); }) + #ifdef CONFIG_SMP -#define RISCV_ACQUIRE_BARRIER "\tfence r , rw\n" -#define RISCV_RELEASE_BARRIER "\tfence rw, w\n" -#define RISCV_FULL_BARRIER "\tfence rw, rw\n" +#define RISCV_ACQUIRE_BARRIER RISCV_FENCE_ASM(r, rw) +#define RISCV_RELEASE_BARRIER RISCV_FENCE_ASM(rw, w) +#define RISCV_FULL_BARRIER RISCV_FENCE_ASM(rw, rw) #else #define RISCV_ACQUIRE_BARRIER #define RISCV_RELEASE_BARRIER diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 42497d487a17..1c5c641075d2 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -47,10 +47,10 @@ * sufficient to ensure this works sanely on controllers that support I/O * writes. */ -#define __io_pbr() __asm__ __volatile__ ("fence io,i" : : : "memory"); -#define __io_par(v) __asm__ __volatile__ ("fence i,ior" : : : "memory"); -#define __io_pbw() __asm__ __volatile__ ("fence iow,o" : : : "memory"); -#define __io_paw() __asm__ __volatile__ ("fence o,io" : : : "memory"); +#define __io_pbr() RISCV_FENCE(io, i) +#define __io_par(v) RISCV_FENCE(i, ior) +#define __io_pbw() RISCV_FENCE(iow, o) +#define __io_paw() RISCV_FENCE(o, io) /* * Accesses from a single hart to a single I/O address must be ordered. This diff --git a/arch/riscv/include/asm/mmio.h b/arch/riscv/include/asm/mmio.h index 4c58ee7f95ec..06cadfd7a237 100644 --- a/arch/riscv/include/asm/mmio.h +++ b/arch/riscv/include/asm/mmio.h @@ -12,6 +12,7 @@ #define _ASM_RISCV_MMIO_H #include +#include #include /* Generic IO read/write. These perform native-endian accesses. */ @@ -131,8 +132,8 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) * doesn't define any ordering between the memory space and the I/O space. */ #define __io_br() do {} while (0) -#define __io_ar(v) ({ __asm__ __volatile__ ("fence i,ir" : : : "memory"); }) -#define __io_bw() ({ __asm__ __volatile__ ("fence w,o" : : : "memory"); }) +#define __io_ar(v) RISCV_FENCE(i, ir) +#define __io_bw() RISCV_FENCE(w, o) #define __io_aw() mmiowb_set_pending() #define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; }) diff --git a/arch/riscv/include/asm/mmiowb.h b/arch/riscv/include/asm/mmiowb.h index 0b2333e71fdc..52ce4a399d9b 100644 --- a/arch/riscv/include/asm/mmiowb.h +++ b/arch/riscv/include/asm/mmiowb.h @@ -7,7 +7,7 @@ * "o,w" is sufficient to ensure that all writes to the device have completed * before the write to the spinlock is allowed to commit. 
*/ -#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory"); +#define mmiowb() RISCV_FENCE(o, w) #include #include From 9133e6e6908d95812e89619f8a86abd0deb17bf3 Mon Sep 17 00:00:00 2001 From: Eric Chan Date: Sat, 17 Feb 2024 13:13:28 +0000 Subject: [PATCH 309/496] riscv/barrier: Add missing space after ',' The past form of RISCV_FENCE would cause checkpatch.pl to issue error messages, the example is as follows: ERROR: space required after that ',' (ctx:VxV) 26: FILE: arch/riscv/include/asm/barrier.h:27: +#define __smp_mb() RISCV_FENCE(rw,rw) ^ fix the remaining of RISCV_FENCE. Signed-off-by: Eric Chan Reviewed-by: Andrea Parri Reviewed-by: Samuel Holland Tested-by: Samuel Holland Link: https://lore.kernel.org/r/20240217131328.3669364-1-ericchancf@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/barrier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index 15857dbc2279..880b56d8480d 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -24,14 +24,14 @@ #define __wmb() RISCV_FENCE(ow, ow) /* These barriers do not need to enforce ordering on devices, just memory. */ -#define __smp_mb() RISCV_FENCE(rw,rw) -#define __smp_rmb() RISCV_FENCE(r,r) -#define __smp_wmb() RISCV_FENCE(w,w) +#define __smp_mb() RISCV_FENCE(rw, rw) +#define __smp_rmb() RISCV_FENCE(r, r) +#define __smp_wmb() RISCV_FENCE(w, w) #define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(rw,w); \ + RISCV_FENCE(rw, w); \ WRITE_ONCE(*p, v); \ } while (0) @@ -39,7 +39,7 @@ do { \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ - RISCV_FENCE(r,rw); \ + RISCV_FENCE(r, rw); \ ___p1; \ }) @@ -68,7 +68,7 @@ do { \ * instances the scheduler pairs this with an mb(), so nothing is necessary on * the new hart. */ -#define smp_mb__after_spinlock() RISCV_FENCE(iorw,iorw) +#define smp_mb__after_spinlock() RISCV_FENCE(iorw, iorw) #include From 7ded842b356d151ece8ac4985940438e6d3998bb Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 02:54:12 +0100 Subject: [PATCH 310/496] s390/bpf: Fix bpf_plt pointer arithmetic Kui-Feng Lee reported a crash on s390x triggered by the dummy_st_ops/dummy_init_ptr_arg test [1]: [<0000000000000002>] 0x2 [<00000000009d5cde>] bpf_struct_ops_test_run+0x156/0x250 [<000000000033145a>] __sys_bpf+0xa1a/0xd00 [<00000000003319dc>] __s390x_sys_bpf+0x44/0x50 [<0000000000c4382c>] __do_syscall+0x244/0x300 [<0000000000c59a40>] system_call+0x70/0x98 This is caused by GCC moving memcpy() after assignments in bpf_jit_plt(), resulting in NULL pointers being written instead of the return and the target addresses. Looking at the GCC internals, the reordering is allowed because the alias analysis thinks that the memcpy() destination and the assignments' left-hand-sides are based on different objects: new_plt and bpf_plt_ret/bpf_plt_target respectively, and therefore they cannot alias. This is in turn due to a violation of the C standard: When two pointers are subtracted, both shall point to elements of the same array object, or one past the last element of the array object ... From the C's perspective, bpf_plt_ret and bpf_plt are distinct objects and cannot be subtracted. In the practical terms, doing so confuses the GCC's alias analysis. The code was written this way in order to let the C side know a few offsets defined in the assembly. While nice, this is by no means necessary. 
Fix the noncompliance by hardcoding these offsets. [1] https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ Fixes: f1d5df84cd8c ("s390/bpf: Implement bpf_arch_text_poke()") Signed-off-by: Ilya Leoshkevich Message-ID: <20240320015515.11883-1-iii@linux.ibm.com> Signed-off-by: Alexei Starovoitov --- arch/s390/net/bpf_jit_comp.c | 46 ++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index b418333bb086..5af0402e94b8 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -516,11 +516,12 @@ static void bpf_skip(struct bpf_jit *jit, int size) * PLT for hotpatchable calls. The calling convention is the same as for the * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. */ -extern const char bpf_plt[]; -extern const char bpf_plt_ret[]; -extern const char bpf_plt_target[]; -extern const char bpf_plt_end[]; -#define BPF_PLT_SIZE 32 +struct bpf_plt { + char code[16]; + void *ret; + void *target; +} __packed; +extern const struct bpf_plt bpf_plt; asm( ".pushsection .rodata\n" " .balign 8\n" @@ -531,15 +532,14 @@ asm( " .balign 8\n" "bpf_plt_ret: .quad 0\n" "bpf_plt_target: .quad 0\n" - "bpf_plt_end:\n" " .popsection\n" ); -static void bpf_jit_plt(void *plt, void *ret, void *target) +static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target) { - memcpy(plt, bpf_plt, BPF_PLT_SIZE); - *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; - *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; + memcpy(plt, &bpf_plt, sizeof(*plt)); + plt->ret = ret; + plt->target = target; } /* @@ -662,9 +662,9 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) jit->prg = ALIGN(jit->prg, 8); jit->prologue_plt = jit->prg; if (jit->prg_buf) - bpf_jit_plt(jit->prg_buf + jit->prg, + bpf_jit_plt((struct bpf_plt *)(jit->prg_buf + jit->prg), jit->prg_buf + jit->prologue_plt_ret, NULL); - jit->prg += BPF_PLT_SIZE; + jit->prg += sizeof(struct bpf_plt); } static int get_probe_mem_regno(const u8 *insn) @@ -2040,9 +2040,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; - if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) - return orig_fp; - if (!fp->jit_requested) return orig_fp; @@ -2148,14 +2145,11 @@ bool bpf_jit_supports_far_kfunc_call(void) int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { + struct bpf_plt expected_plt, current_plt, new_plt, *plt; struct { u16 opc; s32 disp; } __packed insn; - char expected_plt[BPF_PLT_SIZE]; - char current_plt[BPF_PLT_SIZE]; - char new_plt[BPF_PLT_SIZE]; - char *plt; char *ret; int err; @@ -2174,18 +2168,18 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, */ } else { /* Verify the PLT. */ - plt = (char *)ip + (insn.disp << 1); - err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + plt = ip + (insn.disp << 1); + err = copy_from_kernel_nofault(¤t_plt, plt, + sizeof(current_plt)); if (err < 0) return err; ret = (char *)ip + 6; - bpf_jit_plt(expected_plt, ret, old_addr); - if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + bpf_jit_plt(&expected_plt, ret, old_addr); + if (memcmp(¤t_plt, &expected_plt, sizeof(current_plt))) return -EINVAL; /* Adjust the call address. 
*/ - bpf_jit_plt(new_plt, ret, new_addr); - s390_kernel_write(plt + (bpf_plt_target - bpf_plt), - new_plt + (bpf_plt_target - bpf_plt), + bpf_jit_plt(&new_plt, ret, new_addr); + s390_kernel_write(&plt->target, &new_plt.target, sizeof(void *)); } From 5ab8cb89dbb6f3e111c8b0a9a86496da23c94439 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Mar 2024 14:51:43 -0700 Subject: [PATCH 311/496] libbpf: fix u64-to-pointer cast on 32-bit arches It's been reported that (void *)map->map_extra is causing compilation warnings on 32-bit architectures. It's easy enough to fix this by casting to long first. Fixes: 79ff13e99169 ("libbpf: Add support for bpf_arena.") Reported-by: Ryan Eatmon Signed-off-by: Andrii Nakryiko Message-ID: <20240319215143.1279312-1-andrii@kernel.org> Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 36e26f4f5997..1e3e697b98bc 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5352,8 +5352,8 @@ retry: goto err_out; } if (map->def.type == BPF_MAP_TYPE_ARENA) { - map->mmaped = mmap((void *)map->map_extra, bpf_map_mmap_sz(map), - PROT_READ | PROT_WRITE, + map->mmaped = mmap((void *)(long)map->map_extra, + bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED, map->fd, 0); if (map->mmaped == MAP_FAILED) { From 114b5b3b4bde7358624437be2f12cde1b265224e Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 12 Mar 2024 23:59:17 +0000 Subject: [PATCH 312/496] bpf, arm64: fix bug in BPF_LDX_MEMSX A64_LDRSW() takes three registers: Xt, Xn, Xm as arguments and it loads and sign extends the value at address Xn + Xm into register Xt. Currently, the offset is being directly used in place of the tmp register which has the offset already loaded by the last emitted instruction. This will cause JIT failures. The easiest way to reproduce this is to test the following code through test_bpf module: { "BPF_LDX_MEMSX | BPF_W", .u.insns_int = { BPF_LD_IMM64(R1, 0x00000000deadbeefULL), BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL), BPF_STX_MEM(BPF_DW, R10, R1, -7), BPF_LDX_MEMSX(BPF_W, R0, R10, -7), BPF_JMP_REG(BPF_JNE, R0, R2, 1), BPF_ALU64_IMM(BPF_MOV, R0, 0), BPF_EXIT_INSN(), }, INTERNAL, { }, { { 0, 0 } }, .stack_depth = 7, }, We need to use the offset as -7 to trigger this code path, there could be other valid ways to trigger this from proper BPF programs as well. This code is rejected by the JIT because -7 is passed to A64_LDRSW() but it expects a valid register (0 - 31). roott@pjy:~# modprobe test_bpf test_name="BPF_LDX_MEMSX | BPF_W" [11300.490371] test_bpf: test_bpf: set 'test_bpf' as the default test_suite. [11300.491750] test_bpf: #345 BPF_LDX_MEMSX | BPF_W [11300.493179] aarch64_insn_encode_register: unknown register encoding -7 [11300.494133] aarch64_insn_encode_register: unknown register encoding -7 [11300.495292] FAIL to select_runtime err=-524 [11300.496804] test_bpf: Summary: 0 PASSED, 1 FAILED, [0/0 JIT'ed] modprobe: ERROR: could not insert 'test_bpf': Invalid argument Applying this patch fixes the issue. root@pjy:~# modprobe test_bpf test_name="BPF_LDX_MEMSX | BPF_W" [ 292.837436] test_bpf: test_bpf: set 'test_bpf' as the default test_suite. 
[ 292.839416] test_bpf: #345 BPF_LDX_MEMSX | BPF_W jited:1 156 PASS [ 292.844794] test_bpf: Summary: 1 PASSED, 0 FAILED, [1/1 JIT'ed] Fixes: cc88f540da52 ("bpf, arm64: Support sign-extension load instructions") Signed-off-by: Puranjay Mohan Message-ID: <20240312235917.103626-1-puranjay12@gmail.com> Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c5b461dda438..48b19a233299 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1256,7 +1256,7 @@ emit_cond_jmp: } else { emit_a64_mov_i(1, tmp, off, ctx); if (sign_extend) - emit(A64_LDRSW(dst, src_adj, off_adj), ctx); + emit(A64_LDRSW(dst, src, tmp), ctx); else emit(A64_LDR32(dst, src, tmp), ctx); } From 33affa7fb46c0c07f6c49d4ddac9dd436715064c Mon Sep 17 00:00:00 2001 From: Tim Crawford Date: Tue, 19 Mar 2024 15:27:26 -0600 Subject: [PATCH 313/496] ALSA: hda/realtek: Add quirks for some Clevo laptops Add audio quirks to fix speaker output and headset detection on some new Clevo models: - L240TU (ALC245) - PE60SNE-G (ALC1220) - V350SNEQ (ALC245) Co-authored-by: Jeremy Soller Signed-off-by: Tim Crawford Message-ID: <20240319212726.62888-1-tcrawford@system76.com> Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 203aa2d600a1..a17c36a36aa5 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2645,6 +2645,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x65f1, "Clevo PC50HS", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x65f5, "Clevo PD50PN[NRT]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x66a2, "Clevo PE60RNE", ALC1220_FIXUP_CLEVO_PB51ED_PINS), + SND_PCI_QUIRK(0x1558, 0x66a6, "Clevo PE60SN[CDE]-[GS]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67d1, "Clevo PB71[ER][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e1, "Clevo PB71[DE][CDF]", ALC1220_FIXUP_CLEVO_PB51ED_PINS), SND_PCI_QUIRK(0x1558, 0x67e5, "Clevo PC70D[PRS](?:-D|-G)?", ALC1220_FIXUP_CLEVO_PB51ED_PINS), @@ -10176,12 +10177,14 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1462, 0xb120, "MSI Cubi MS-B120", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x1462, 0xb171, "Cubi N 8GL (MS-B171)", ALC283_FIXUP_HEADSET_MIC), SND_PCI_QUIRK(0x152d, 0x1082, "Quanta NL3", ALC269_FIXUP_LIFEBOOK), + SND_PCI_QUIRK(0x1558, 0x0353, "Clevo V35[05]SN[CDE]Q", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1323, "Clevo N130ZU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1325, "Clevo N15[01][CW]U", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1401, "Clevo L140[CZ]U", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1403, "Clevo N140CU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x1404, "Clevo N150CU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x14a1, "Clevo L141MU", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0x2624, "Clevo L240TU", ALC256_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4018, "Clevo NV40M[BE]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4019, "Clevo NV40MZ", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1558, 0x4020, "Clevo NV40MB", 
ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), From 14d811467f6592aa0e685730e66b5f9123287468 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 Mar 2024 07:27:20 +0100 Subject: [PATCH 314/496] ALSA: control: Fix unannotated kfree() cleanup The recent conversion to the automatic kfree() forgot to mark a variable with __free(kfree), leading to memory leaks. Fix it. Fixes: 1052d9882269 ("ALSA: control: Use automatic cleanup of kfree()") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/r/c1e2ef3c-164f-4840-9b1c-f7ca07ca422a@alu.unizg.hr Message-ID: <20240320062722.31325-1-tiwai@suse.de> Signed-off-by: Takashi Iwai --- sound/core/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/core/control.c b/sound/core/control.c index 8367fd485371..fb0c60044f7b 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1275,12 +1275,12 @@ static int snd_ctl_elem_read(struct snd_card *card, static int snd_ctl_elem_read_user(struct snd_card *card, struct snd_ctl_elem_value __user *_control) { - struct snd_ctl_elem_value *control; + struct snd_ctl_elem_value *control __free(kfree) = NULL; int result; control = memdup_user(_control, sizeof(*control)); if (IS_ERR(control)) - return PTR_ERR(control); + return PTR_ERR(no_free_ptr(control)); result = snd_ctl_elem_read(card, control); if (result < 0) From b7b73f6d4f7bca79fef9e31a054e24e31a65f22a Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Wed, 13 Mar 2024 09:14:00 +1300 Subject: [PATCH 315/496] i2c: muxes: pca954x: Allow sharing reset GPIO Some hardware designs with multiple PCA954x devices use a reset GPIO connected to all the muxes. Support this configuration by making use of the reset controller framework which can deal with the shared reset GPIOs. Fall back to the old GPIO descriptor method if the reset controller framework is not enabled. 
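For reference, the property the fallback relies on (visible in pca954x_get_reset() in the hunk below) is that devm_reset_control_get_optional_shared() returns NULL, not an error, when the device has no reset description, so the legacy reset-gpios lookup only runs in that case. Condensed to a sketch:

    rc = devm_reset_control_get_optional_shared(dev, NULL);
    if (IS_ERR(rc))
            return dev_err_probe(dev, PTR_ERR(rc), "Failed to get reset\n");
    if (!rc)        /* no reset controller described: fall back to reset-gpios */
            gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);

Because the control is requested as shared, assert/deassert calls from all the muxes referencing the same reset line are refcounted by the reset framework rather than each instance toggling the GPIO independently.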
Signed-off-by: Chris Packham Acked-by: Peter Rosin Reviewed-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- drivers/i2c/muxes/i2c-mux-pca954x.c | 46 ++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c index f5dfc33b97c0..c3f4ff08ac38 100644 --- a/drivers/i2c/muxes/i2c-mux-pca954x.c +++ b/drivers/i2c/muxes/i2c-mux-pca954x.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -116,6 +117,9 @@ struct pca954x { unsigned int irq_mask; raw_spinlock_t lock; struct regulator *supply; + + struct gpio_desc *reset_gpio; + struct reset_control *reset_cont; }; /* Provide specs for the MAX735x, PCA954x and PCA984x types we know about */ @@ -518,6 +522,35 @@ static int pca954x_init(struct i2c_client *client, struct pca954x *data) return ret; } +static int pca954x_get_reset(struct device *dev, struct pca954x *data) +{ + data->reset_cont = devm_reset_control_get_optional_shared(dev, NULL); + if (IS_ERR(data->reset_cont)) + return dev_err_probe(dev, PTR_ERR(data->reset_cont), + "Failed to get reset\n"); + else if (data->reset_cont) + return 0; + + /* + * fallback to legacy reset-gpios + */ + data->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(data->reset_gpio)) { + return dev_err_probe(dev, PTR_ERR(data->reset_gpio), + "Failed to get reset gpio"); + } + + return 0; +} + +static void pca954x_reset_deassert(struct pca954x *data) +{ + if (data->reset_cont) + reset_control_deassert(data->reset_cont); + else + gpiod_set_value_cansleep(data->reset_gpio, 0); +} + /* * I2C init/probing/exit functions */ @@ -526,7 +559,6 @@ static int pca954x_probe(struct i2c_client *client) const struct i2c_device_id *id = i2c_client_get_device_id(client); struct i2c_adapter *adap = client->adapter; struct device *dev = &client->dev; - struct gpio_desc *gpio; struct i2c_mux_core *muxc; struct pca954x *data; int num; @@ -554,15 +586,13 @@ static int pca954x_probe(struct i2c_client *client) return dev_err_probe(dev, ret, "Failed to enable vdd supply\n"); - /* Reset the mux if a reset GPIO is specified. */ - gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); - if (IS_ERR(gpio)) { - ret = PTR_ERR(gpio); + ret = pca954x_get_reset(dev, data); + if (ret) goto fail_cleanup; - } - if (gpio) { + + if (data->reset_cont || data->reset_gpio) { udelay(1); - gpiod_set_value_cansleep(gpio, 0); + pca954x_reset_deassert(data); /* Give the chip some time to recover. */ udelay(1); } From 10b890ee21a5ef7f6444506974648648f98f9ba9 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Tue, 19 Mar 2024 09:19:25 +0800 Subject: [PATCH 316/496] MAINTAINERS: wifi: add git tree for Realtek WiFi drivers Add git tree to manage all Realtek WiFi drivers except RTL8180 which is old and orphan. 
Signed-off-by: Ping-Ke Shih Signed-off-by: Kalle Valo Link: https://msgid.link/20240319011925.6855-1-pkshih@realtek.com --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 54775eaaf7b3..452288995991 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18586,18 +18586,21 @@ REALTEK WIRELESS DRIVER (rtlwifi family) M: Ping-Ke Shih L: linux-wireless@vger.kernel.org S: Maintained +T: git https://github.com/pkshih/rtw.git F: drivers/net/wireless/realtek/rtlwifi/ REALTEK WIRELESS DRIVER (rtw88) M: Ping-Ke Shih L: linux-wireless@vger.kernel.org S: Maintained +T: git https://github.com/pkshih/rtw.git F: drivers/net/wireless/realtek/rtw88/ REALTEK WIRELESS DRIVER (rtw89) M: Ping-Ke Shih L: linux-wireless@vger.kernel.org S: Maintained +T: git https://github.com/pkshih/rtw.git F: drivers/net/wireless/realtek/rtw89/ REDPINE WIRELESS DRIVER @@ -19120,12 +19123,14 @@ M: Hin-Tak Leung M: Larry Finger L: linux-wireless@vger.kernel.org S: Maintained +T: git https://github.com/pkshih/rtw.git F: drivers/net/wireless/realtek/rtl818x/rtl8187/ RTL8XXXU WIRELESS DRIVER (rtl8xxxu) M: Jes Sorensen L: linux-wireless@vger.kernel.org S: Maintained +T: git https://github.com/pkshih/rtw.git F: drivers/net/wireless/realtek/rtl8xxxu/ RTRS TRANSPORT DRIVERS From e593a4a2d3ad5e1a4be338b38ed6ba7c70642d88 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 11 Mar 2024 16:26:04 -0600 Subject: [PATCH 317/496] dt-bindings: i2c: qcom,i2c-cci: Fix OV7251 'data-lanes' entries The OV7251 sensor only has a single data lane, so 2 entries is not valid. Fix this to be 1 entry as the schema specifies. The schema validation doesn't catch this currently due to some limitations in handling of arrays vs. matrices, but a fix is being worked on. Signed-off-by: Rob Herring Reviewed-by: Krzysztof Kozlowski Signed-off-by: Wolfram Sang --- Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml index 8386cfe21532..f0eabff86310 100644 --- a/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml +++ b/Documentation/devicetree/bindings/i2c/qcom,i2c-cci.yaml @@ -270,7 +270,7 @@ examples: port { ov7251_ep: endpoint { - data-lanes = <0 1>; + data-lanes = <0>; link-frequencies = /bits/ 64 <240000000 319200000>; remote-endpoint = <&csiphy3_ep>; }; From 28e4748e5e3d313056ed38abeff4b455abd02c1b Mon Sep 17 00:00:00 2001 From: Erick Archer Date: Sat, 20 Jan 2024 14:54:00 +0100 Subject: [PATCH 318/496] riscv: Use kcalloc() instead of kzalloc() As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the purpose specific kcalloc() function instead of the argument count * size in the kzalloc() function. Also, it is preferred to use sizeof(*pointer) instead of sizeof(type) due to the type of the variable can change and one needs not change the former (unlike the latter). 
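For reference, the two forms side by side as they appear in the hunk below; kcalloc() checks the count * size multiplication for overflow and fails the allocation instead of letting the size silently wrap to a smaller value:

    /* open-coded multiplication: can wrap before the allocator sees it */
    bufs = kzalloc(cpu_count * sizeof(struct page *), GFP_KERNEL);

    /* checked multiplication: allocation fails if count * size would overflow */
    bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);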
Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [1] Link: https://github.com/KSPP/linux/issues/162 Signed-off-by: Erick Archer Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240120135400.4710-1-erick.archer@gmx.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/unaligned_access_speed.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index 52264ea4f0bd..a9a6bcb02acf 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -218,8 +218,7 @@ static int check_unaligned_access_speed_all_cpus(void) { unsigned int cpu; unsigned int cpu_count = num_possible_cpus(); - struct page **bufs = kzalloc(cpu_count * sizeof(struct page *), - GFP_KERNEL); + struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL); if (!bufs) { pr_warn("Allocation failure, not measuring misaligned performance\n"); From 01261e24cfab69c65043e1e61168348ae23a64c2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 2 Feb 2024 13:47:11 +0100 Subject: [PATCH 319/496] riscv: Only flush the mm icache when setting an exec pte We used to emit a flush_icache_all() whenever a dirty executable mapping is set in the page table but we can instead call flush_icache_mm() which will only send IPIs to cores that currently run this mm and add a deferred icache flush to the others. The number of calls to sbi_remote_fence_i() (tested without IPI support): With a simple buildroot rootfs: * Before: ~5k * After : 4 (!) Tested on HW, the boot to login is ~4.5% faster. With an ubuntu rootfs: * Before: ~24k * After : ~13k Signed-off-by: Alexandre Ghiti Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/20240202124711.256146-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 14 +++++++------- arch/riscv/mm/cacheflush.c | 4 ++-- arch/riscv/mm/pgtable.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 9c45810806ff..b2e6965748f2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -513,12 +513,12 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) WRITE_ONCE(*ptep, pteval); } -void flush_icache_pte(pte_t pte); +void flush_icache_pte(struct mm_struct *mm, pte_t pte); -static inline void __set_pte_at(pte_t *ptep, pte_t pteval) +static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval) { if (pte_present(pteval) && pte_exec(pteval)) - flush_icache_pte(pteval); + flush_icache_pte(mm, pteval); set_pte(ptep, pteval); } @@ -529,7 +529,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, page_table_check_ptes_set(mm, ptep, pteval, nr); for (;;) { - __set_pte_at(ptep, pteval); + __set_pte_at(mm, ptep, pteval); if (--nr == 0) break; ptep++; @@ -541,7 +541,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - __set_pte_at(ptep, __pte(0)); + __set_pte_at(mm, ptep, __pte(0)); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */ @@ -713,14 +713,14 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(mm, pmdp, pmd); - return 
__set_pte_at((pte_t *)pmdp, pmd_pte(pmd)); + return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd)); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { page_table_check_pud_set(mm, pudp, pud); - return __set_pte_at((pte_t *)pudp, pud_pte(pud)); + return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud)); } #ifdef CONFIG_PAGE_TABLE_CHECK diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 55a34f2020a8..bc61ee5975e4 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -82,12 +82,12 @@ void flush_icache_mm(struct mm_struct *mm, bool local) #endif /* CONFIG_SMP */ #ifdef CONFIG_MMU -void flush_icache_pte(pte_t pte) +void flush_icache_pte(struct mm_struct *mm, pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); if (!test_bit(PG_dcache_clean, &folio->flags)) { - flush_icache_all(); + flush_icache_mm(mm, false); set_bit(PG_dcache_clean, &folio->flags); } } diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index ef887efcb679..533ec9055fa0 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -10,7 +10,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { if (!pte_same(ptep_get(ptep), entry)) - __set_pte_at(ptep, entry); + __set_pte_at(vma->vm_mm, ptep, entry); /* * update_mmu_cache will unconditionally execute, handling both * the case that the PTE changed and the spurious fault case. From da215b089b5d4ff30745c59922b54b309d55a5d8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 7 Feb 2024 22:08:51 -0800 Subject: [PATCH 320/496] crypto: riscv - parallelize AES-CBC decryption Since CBC decryption is parallelizable, make the RISC-V implementation of AES-CBC decryption process multiple blocks at a time, instead of processing the blocks one by one. This should improve performance. 
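(For context: CBC decryption parallelizes because each plaintext block depends only on ciphertext that is already in hand,

    P[i] = Decrypt(key, C[i]) ^ C[i-1]        (with C[0] being the IV)

so the AES block decryptions for a whole group of blocks can be issued together and only the trailing XORs need the per-block chaining values. CBC encryption, by contrast, feeds C[i] into the computation of C[i+1] and stays inherently serial, which is why only the decrypt path is reworked here.)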
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240208060851.154129-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/aes-riscv64-zvkned.S | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S index 78d4e1186c07..43541aad6386 100644 --- a/arch/riscv/crypto/aes-riscv64-zvkned.S +++ b/arch/riscv/crypto/aes-riscv64-zvkned.S @@ -139,19 +139,25 @@ SYM_FUNC_END(aes_ecb_decrypt_zvkned) .endm .macro aes_cbc_decrypt keylen + srli LEN, LEN, 2 // Convert LEN from bytes to words vle32.v v16, (IVP) // Load IV 1: - vle32.v v17, (INP) // Load ciphertext block - vmv.v.v v18, v17 // Save ciphertext block - aes_decrypt v17, \keylen // Decrypt - vxor.vv v17, v17, v16 // XOR with IV or prev ciphertext block - vse32.v v17, (OUTP) // Store plaintext block - vmv.v.v v16, v18 // Next "IV" is prev ciphertext block - addi INP, INP, 16 - addi OUTP, OUTP, 16 - addi LEN, LEN, -16 + vsetvli t0, LEN, e32, m4, ta, ma + vle32.v v20, (INP) // Load ciphertext blocks + vslideup.vi v16, v20, 4 // Setup prev ciphertext blocks + addi t1, t0, -4 + vslidedown.vx v24, v20, t1 // Save last ciphertext block + aes_decrypt v20, \keylen // Decrypt the blocks + vxor.vv v20, v20, v16 // XOR with prev ciphertext blocks + vse32.v v20, (OUTP) // Store plaintext blocks + vmv.v.v v16, v24 // Next "IV" is last ciphertext block + slli t1, t0, 2 // Words to bytes + add INP, INP, t1 + add OUTP, OUTP, t1 + sub LEN, LEN, t0 bnez LEN, 1b + vsetivli zero, 4, e32, m1, ta, ma vse32.v v16, (IVP) // Store next IV ret .endm From c70dfa4a2723ff5046fdc6d8a054713483f64f1b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 Feb 2024 21:54:42 -0800 Subject: [PATCH 321/496] crypto: riscv - add vector crypto accelerated AES-CBC-CTS Add an implementation of cts(cbc(aes)) accelerated using the Zvkned RISC-V vector crypto extension. This is mainly useful for fscrypt, where cts(cbc(aes)) is the "default" filenames encryption algorithm. In that use case, typically most messages are short and are block-aligned. The CBC-CTS variant implemented is CS3; this is the variant Linux uses. To perform well on short messages, the new implementation processes the full message in one call to the assembly function if the data is contiguous. Otherwise it falls back to CBC operations followed by CTS at the end. For decryption, to further improve performance on short messages, especially block-aligned messages, the CBC-CTS assembly function parallelizes the AES decryption of all full blocks. This improves on the arm64 implementation of cts(cbc(aes)), which always splits the CBC part(s) from the CTS part, doing the AES decryptions for the last two blocks serially and usually loading the round keys twice. Tested in QEMU with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y. 
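(For reference, CS3 unconditionally swaps the final two ciphertext blocks, even for block-aligned messages. Writing C[i]/P[i] for the i'th ciphertext/plaintext block, with the final block P[n] possibly partial of length LEN and implicitly zero-padded, encryption produces

    C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
    C[n]   = Encrypt(P[n-1] ^ C[n-2])[0..LEN]

which is the same construction spelled out in the comments of the assembly below; decryption inverts it, and the assembly batches the full-block AES decryptions before fixing up the last two blocks.)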
Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20240213055442.35954-1-ebiggers@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/crypto/Kconfig | 4 +- arch/riscv/crypto/aes-riscv64-glue.c | 93 ++++++++++++++- arch/riscv/crypto/aes-riscv64-zvkned.S | 153 +++++++++++++++++++++++++ 3 files changed, 245 insertions(+), 5 deletions(-) diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig index 2ad44e1d464a..ad58dad9a580 100644 --- a/arch/riscv/crypto/Kconfig +++ b/arch/riscv/crypto/Kconfig @@ -3,14 +3,14 @@ menu "Accelerated Cryptographic Algorithms for CPU (riscv)" config CRYPTO_AES_RISCV64 - tristate "Ciphers: AES, modes: ECB, CBC, CTR, XTS" + tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS" depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO select CRYPTO_ALGAPI select CRYPTO_LIB_AES select CRYPTO_SKCIPHER help Block cipher: AES cipher algorithms - Length-preserving ciphers: AES with ECB, CBC, CTR, XTS + Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTS Architecture: riscv64 using: - Zvkned vector crypto extension diff --git a/arch/riscv/crypto/aes-riscv64-glue.c b/arch/riscv/crypto/aes-riscv64-glue.c index 37bc6ef0be40..f814ee048555 100644 --- a/arch/riscv/crypto/aes-riscv64-glue.c +++ b/arch/riscv/crypto/aes-riscv64-glue.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0-only /* * AES using the RISC-V vector crypto extensions. Includes the bare block - * cipher and the ECB, CBC, CTR, and XTS modes. + * cipher and the ECB, CBC, CBC-CTS, CTR, and XTS modes. * * Copyright (C) 2023 VRULL GmbH * Author: Heiko Stuebner * * Copyright (C) 2023 SiFive, Inc. * Author: Jerry Shih + * + * Copyright 2024 Google LLC */ #include @@ -40,6 +42,10 @@ asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key, const u8 *in, u8 *out, size_t len, u8 iv[AES_BLOCK_SIZE]); +asmlinkage void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key, + const u8 *in, u8 *out, size_t len, + const u8 iv[AES_BLOCK_SIZE], bool enc); + asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key, const u8 *in, u8 *out, size_t len, u8 iv[AES_BLOCK_SIZE]); @@ -164,7 +170,7 @@ static int riscv64_aes_ecb_decrypt(struct skcipher_request *req) /* AES-CBC */ -static inline int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc) +static int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); @@ -202,6 +208,70 @@ static int riscv64_aes_cbc_decrypt(struct skcipher_request *req) return riscv64_aes_cbc_crypt(req, false); } +/* AES-CBC-CTS */ + +static int riscv64_aes_cbc_cts_crypt(struct skcipher_request *req, bool enc) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); + struct scatterlist sg_src[2], sg_dst[2]; + struct skcipher_request subreq; + struct scatterlist *src, *dst; + struct skcipher_walk walk; + unsigned int cbc_len; + int err; + + if (req->cryptlen < AES_BLOCK_SIZE) + return -EINVAL; + + err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; + /* + * If the full message is available in one step, decrypt it in one call + * to the CBC-CTS assembly function. This reduces overhead, especially + * on short messages. Otherwise, fall back to doing CBC up to the last + * two blocks, then invoke CTS just for the ciphertext stealing. 
+ */ + if (unlikely(walk.nbytes != req->cryptlen)) { + cbc_len = round_down(req->cryptlen - AES_BLOCK_SIZE - 1, + AES_BLOCK_SIZE); + skcipher_walk_abort(&walk); + skcipher_request_set_tfm(&subreq, tfm); + skcipher_request_set_callback(&subreq, + skcipher_request_flags(req), + NULL, NULL); + skcipher_request_set_crypt(&subreq, req->src, req->dst, + cbc_len, req->iv); + err = riscv64_aes_cbc_crypt(&subreq, enc); + if (err) + return err; + dst = src = scatterwalk_ffwd(sg_src, req->src, cbc_len); + if (req->dst != req->src) + dst = scatterwalk_ffwd(sg_dst, req->dst, cbc_len); + skcipher_request_set_crypt(&subreq, src, dst, + req->cryptlen - cbc_len, req->iv); + err = skcipher_walk_virt(&walk, &subreq, false); + if (err) + return err; + } + kernel_vector_begin(); + aes_cbc_cts_crypt_zvkned(ctx, walk.src.virt.addr, walk.dst.virt.addr, + walk.nbytes, req->iv, enc); + kernel_vector_end(); + return skcipher_walk_done(&walk, 0); +} + +static int riscv64_aes_cbc_cts_encrypt(struct skcipher_request *req) +{ + return riscv64_aes_cbc_cts_crypt(req, true); +} + +static int riscv64_aes_cbc_cts_decrypt(struct skcipher_request *req) +{ + return riscv64_aes_cbc_cts_crypt(req, false); +} + /* AES-CTR */ static int riscv64_aes_ctr_crypt(struct skcipher_request *req) @@ -434,6 +504,22 @@ static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = { .cra_driver_name = "cbc-aes-riscv64-zvkned", .cra_module = THIS_MODULE, }, + }, { + .setkey = riscv64_aes_setkey_skcipher, + .encrypt = riscv64_aes_cbc_cts_encrypt, + .decrypt = riscv64_aes_cbc_cts_decrypt, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */ + .base = { + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_priority = 300, + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-riscv64-zvkned", + .cra_module = THIS_MODULE, + }, } }; @@ -540,11 +626,12 @@ static void __exit riscv64_aes_mod_exit(void) module_init(riscv64_aes_mod_init); module_exit(riscv64_aes_mod_exit); -MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS (RISC-V accelerated)"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTS/CTR/XTS (RISC-V accelerated)"); MODULE_AUTHOR("Jerry Shih "); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("aes"); MODULE_ALIAS_CRYPTO("ecb(aes)"); MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("cts(cbc(aes))"); MODULE_ALIAS_CRYPTO("ctr(aes)"); MODULE_ALIAS_CRYPTO("xts(aes)"); diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S index 43541aad6386..23d063f94ce6 100644 --- a/arch/riscv/crypto/aes-riscv64-zvkned.S +++ b/arch/riscv/crypto/aes-riscv64-zvkned.S @@ -184,3 +184,156 @@ SYM_FUNC_START(aes_cbc_decrypt_zvkned) 192: aes_cbc_decrypt 192 SYM_FUNC_END(aes_cbc_decrypt_zvkned) + +.macro aes_cbc_cts_encrypt keylen + + // CBC-encrypt all blocks except the last. But don't store the + // second-to-last block to the output buffer yet, since it will be + // handled specially in the ciphertext stealing step. Exception: if the + // message is single-block, still encrypt the last (and only) block. + li t0, 16 + j 2f +1: + vse32.v v16, (OUTP) // Store ciphertext block + addi OUTP, OUTP, 16 +2: + vle32.v v17, (INP) // Load plaintext block + vxor.vv v16, v16, v17 // XOR with IV or prev ciphertext block + aes_encrypt v16, \keylen // Encrypt + addi INP, INP, 16 + addi LEN, LEN, -16 + bgt LEN, t0, 1b // Repeat if more than one block remains + + // Special case: if the message is a single block, just do CBC. 
+ beqz LEN, .Lcts_encrypt_done\@ + + // Encrypt the last two blocks using ciphertext stealing as follows: + // C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n]) + // C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN] + // + // C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th + // plaintext block. Block n, the last block, may be partial; its length + // is 1 <= LEN <= 16. If there are only 2 blocks, C[n-2] means the IV. + // + // v16 already contains Encrypt(P[n-1] ^ C[n-2]). + // INP points to P[n]. OUTP points to where C[n-1] should go. + // To support in-place encryption, load P[n] before storing C[n]. + addi t0, OUTP, 16 // Get pointer to where C[n] should go + vsetvli zero, LEN, e8, m1, tu, ma + vle8.v v17, (INP) // Load P[n] + vse8.v v16, (t0) // Store C[n] + vxor.vv v16, v16, v17 // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n] + vsetivli zero, 4, e32, m1, ta, ma + aes_encrypt v16, \keylen +.Lcts_encrypt_done\@: + vse32.v v16, (OUTP) // Store C[n-1] (or C[n] in single-block case) + ret +.endm + +#define LEN32 t4 // Length of remaining full blocks in 32-bit words +#define LEN_MOD16 t5 // Length of message in bytes mod 16 + +.macro aes_cbc_cts_decrypt keylen + andi LEN32, LEN, ~15 + srli LEN32, LEN32, 2 + andi LEN_MOD16, LEN, 15 + + // Save C[n-2] in v28 so that it's available later during the ciphertext + // stealing step. If there are fewer than three blocks, C[n-2] means + // the IV, otherwise it means the third-to-last ciphertext block. + vmv.v.v v28, v16 // IV + add t0, LEN, -33 + bltz t0, .Lcts_decrypt_loop\@ + andi t0, t0, ~15 + add t0, t0, INP + vle32.v v28, (t0) + + // CBC-decrypt all full blocks. For the last full block, or the last 2 + // full blocks if the message is block-aligned, this doesn't write the + // correct output blocks (unless the message is only a single block), + // because it XORs the wrong values with the raw AES plaintexts. But we + // fix this after this loop without redoing the AES decryptions. This + // approach allows more of the AES decryptions to be parallelized. +.Lcts_decrypt_loop\@: + vsetvli t0, LEN32, e32, m4, ta, ma + addi t1, t0, -4 + vle32.v v20, (INP) // Load next set of ciphertext blocks + vmv.v.v v24, v16 // Get IV or last ciphertext block of prev set + vslideup.vi v24, v20, 4 // Setup prev ciphertext blocks + vslidedown.vx v16, v20, t1 // Save last ciphertext block of this set + aes_decrypt v20, \keylen // Decrypt this set of blocks + vxor.vv v24, v24, v20 // XOR prev ciphertext blocks with decrypted blocks + vse32.v v24, (OUTP) // Store this set of plaintext blocks + sub LEN32, LEN32, t0 + slli t0, t0, 2 // Words to bytes + add INP, INP, t0 + add OUTP, OUTP, t0 + bnez LEN32, .Lcts_decrypt_loop\@ + + vsetivli zero, 4, e32, m4, ta, ma + vslidedown.vx v20, v20, t1 // Extract raw plaintext of last full block + addi t0, OUTP, -16 // Get pointer to last full plaintext block + bnez LEN_MOD16, .Lcts_decrypt_non_block_aligned\@ + + // Special case: if the message is a single block, just do CBC. + li t1, 16 + beq LEN, t1, .Lcts_decrypt_done\@ + + // Block-aligned message. Just fix up the last 2 blocks. We need: + // + // P[n-1] = Decrypt(C[n]) ^ C[n-2] + // P[n] = Decrypt(C[n-1]) ^ C[n] + // + // We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28. + // Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this + // is everything needed to fix the output without re-decrypting blocks. 
+ addi t1, OUTP, -32 // Get pointer to where P[n-1] should go + vxor.vv v20, v20, v28 // Decrypt(C[n]) ^ C[n-2] == P[n-1] + vle32.v v24, (t1) // Decrypt(C[n-1]) ^ C[n-2] + vse32.v v20, (t1) // Store P[n-1] + vxor.vv v20, v24, v16 // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2] + j .Lcts_decrypt_finish\@ + +.Lcts_decrypt_non_block_aligned\@: + // Decrypt the last two blocks using ciphertext stealing as follows: + // + // P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2] + // P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16] + // + // We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28. + vmv.v.v v16, v20 // v16 = Decrypt(C[n-1]) + vsetvli zero, LEN_MOD16, e8, m1, tu, ma + vle8.v v20, (INP) // v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16] + vxor.vv v16, v16, v20 // v16 = Decrypt(C[n-1]) ^ C[n] + vse8.v v16, (OUTP) // Store P[n] + vsetivli zero, 4, e32, m1, ta, ma + aes_decrypt v20, \keylen // v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) +.Lcts_decrypt_finish\@: + vxor.vv v20, v20, v28 // XOR with C[n-2] + vse32.v v20, (t0) // Store last full plaintext block +.Lcts_decrypt_done\@: + ret +.endm + +.macro aes_cbc_cts_crypt keylen + vle32.v v16, (IVP) // Load IV + beqz a5, .Lcts_decrypt\@ + aes_cbc_cts_encrypt \keylen +.Lcts_decrypt\@: + aes_cbc_cts_decrypt \keylen +.endm + +// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key, +// const u8 *in, u8 *out, size_t len, +// const u8 iv[16], bool enc); +// +// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS. +// This is the variant that unconditionally swaps the last two blocks. +SYM_FUNC_START(aes_cbc_cts_crypt_zvkned) + aes_begin KEYP, 128f, 192f + aes_cbc_cts_crypt 256 +128: + aes_cbc_cts_crypt 128 +192: + aes_cbc_cts_crypt 192 +SYM_FUNC_END(aes_cbc_cts_crypt_zvkned) From 5b142b37c70b1fa6936fa2d0babb0b8c16767d3a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 15 Mar 2024 17:46:56 +0000 Subject: [PATCH 322/496] cifs: Move some extern decls from .c files to .h Move the following: extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; extern bool disable_legacy_dialects; from various .c files to cifsglob.h. Signed-off-by: David Howells cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 4 ---- fs/smb/client/cifsglob.h | 2 ++ fs/smb/client/connect.c | 3 --- fs/smb/client/misc.c | 3 --- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 81d9aafd2210..aa6f1ecb7c0e 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -151,10 +151,6 @@ MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be " "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker" " and less secure. 
Default: n/N/0"); -extern mempool_t *cifs_sm_req_poolp; -extern mempool_t *cifs_req_poolp; -extern mempool_t *cifs_mid_poolp; - struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 3da625d53235..7ed9d05f6890 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2107,6 +2107,8 @@ extern struct workqueue_struct *cifsoplockd_wq; extern struct workqueue_struct *deferredclose_wq; extern __u32 cifs_lock_secret; +extern mempool_t *cifs_sm_req_poolp; +extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; /* Operations for different SMB versions */ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 4cbb79418e50..9b85b5341822 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -52,9 +52,6 @@ #include "fs_context.h" #include "cifs_swn.h" -extern mempool_t *cifs_req_poolp; -extern bool disable_legacy_dialects; - /* FIXME: should these be tunable? */ #define TLINK_ERROR_EXPIRE (1 * HZ) #define TLINK_IDLE_EXPIRE (600 * HZ) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 9428a0db7718..c3771fc81328 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -27,9 +27,6 @@ #include "fs_context.h" #include "cached_dir.h" -extern mempool_t *cifs_sm_req_poolp; -extern mempool_t *cifs_req_poolp; - /* The xid serves as a useful identifier for each incoming vfs request, in a similar way to the mid which is useful to track each sent smb, and CurrentXid can also provide a running counter (although it From 68c5818a27afcb5cdddab041b82e9d47c996cb6a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Mar 2024 15:59:38 -0500 Subject: [PATCH 323/496] smb311: correct incorrect offset field in compression header The offset field in the compression header is 32 bits not 16. Reviewed-by: Bharath SM Reported-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 20784f76a604..a23b56f93c36 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -227,7 +227,7 @@ struct smb2_compression_hdr { __le32 OriginalCompressedSegmentSize; __le16 CompressionAlgorithm; __le16 Flags; - __le16 Offset; /* this is the size of the uncompressed SMB2 header below */ + __le32 Offset; /* this is the size of the uncompressed SMB2 header below */ /* uncompressed SMB2 header (READ or WRITE) goes here */ /* compressed data goes here */ } __packed; From e56bc745fa1de77abc2ad8debc4b1b83e0426c49 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 19 Mar 2024 17:00:01 -0500 Subject: [PATCH 324/496] smb311: additional compression flag defined in updated protocol spec Added new compression flag that was recently documented, in addition fix some typos and clarify the sid_attr_data struct definition. 
Reviewed-by: Bharath SM Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index a23b56f93c36..1b594307c9d5 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -280,15 +280,16 @@ struct smb3_blob_data { #define SE_GROUP_RESOURCE 0x20000000 #define SE_GROUP_LOGON_ID 0xC0000000 -/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ - struct sid_array_data { __le16 SidAttrCount; /* SidAttrList - array of sid_attr_data structs */ } __packed; -struct luid_attr_data { - +/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ +struct sid_attr_data { + __le16 BlobSize; + __u8 BlobData[]; + /* __le32 Attr */ } __packed; /* @@ -502,6 +503,7 @@ struct smb2_encryption_neg_context { #define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) /* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ #define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ +#define SMB3_COMPRESS_LZ4 cpu_to_le16(0x0005) /* Compression Flags */ #define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) From 2f14c0c8cae8e9e3b603a3f91909baba66540027 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 5 Mar 2024 14:34:24 -0600 Subject: [PATCH 325/496] drm/amd/display: Use freesync when `DRM_EDID_FEATURE_CONTINUOUS_FREQ` found The monitor shipped with the Framework 16 supports VRR [1], but it's not being advertised. This is because the detailed timing block doesn't contain `EDID_DETAIL_MONITOR_RANGE` which amdgpu looks for to find min and max frequencies. This check however is superfluous for this case because update_display_info() calls drm_get_monitor_range() to get these ranges already. So if the `DRM_EDID_FEATURE_CONTINUOUS_FREQ` EDID feature is found then turn on freesync without extra checks. 
v2: squash in fix from Harry Closes: https://www.reddit.com/r/framework/comments/1b4y2i5/no_variable_refresh_rate_on_the_framework_16_on/ Closes: https://www.reddit.com/r/framework/comments/1b6vzcy/framework_16_variable_refresh_rate/ Closes: https://community.frame.work/t/resolved-no-vrr-freesync-with-amd-version/42338 Link: https://gist.github.com/superm1/e8fbacfa4d0f53150231d3a3e0a13faf Signed-off-by: Mario Limonciello Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 1c9c6096e28f..b2013f2b57c0 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11271,18 +11271,24 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, if (!adev->dm.freesync_module) goto update; - if (sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT - || sink->sink_signal == SIGNAL_TYPE_EDP) { + if (edid && (sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT || + sink->sink_signal == SIGNAL_TYPE_EDP)) { bool edid_check_required = false; - if (edid) { - edid_check_required = is_dp_capable_without_timing_msa( - adev->dm.dc, - amdgpu_dm_connector); + if (is_dp_capable_without_timing_msa(adev->dm.dc, + amdgpu_dm_connector)) { + if (edid->features & DRM_EDID_FEATURE_CONTINUOUS_FREQ) { + freesync_capable = true; + amdgpu_dm_connector->min_vfreq = connector->display_info.monitor_range.min_vfreq; + amdgpu_dm_connector->max_vfreq = connector->display_info.monitor_range.max_vfreq; + } else { + edid_check_required = edid->version > 1 || + (edid->version == 1 && + edid->revision > 1); + } } - if (edid_check_required == true && (edid->version > 1 || - (edid->version == 1 && edid->revision > 1))) { + if (edid_check_required) { for (i = 0; i < 4; i++) { timing = &edid->detailed_timings[i]; From 71b9d19220dae4b69f03acd900498b23eeeaf000 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 8 Mar 2024 11:11:03 -0500 Subject: [PATCH 326/496] drm/amdgpu: Handle duplicate BOs during process restore In certain situations, some apps can import a BO multiple times (through IPC for example). To restore such processes successfully, we need to tell drm to ignore duplicate BOs. While at it, also add additional logging to prevent silent failures when process restore fails. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 14dc9d2d8d53..df58a6a1a67e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2869,14 +2869,16 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * mutex_lock(&process_info->lock); - drm_exec_init(&exec, 0, 0); + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); drm_exec_until_all_locked(&exec) { list_for_each_entry(peer_vm, &process_info->vm_list_head, vm_list_node) { ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2); drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("Locking VM PD failed, ret: %d\n", ret); goto ttm_reserve_fail; + } } /* Reserve all BOs and page tables/directory. 
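The functional change is a single flag at drm_exec initialization time (first hunk below):

    drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);

With DRM_EXEC_IGNORE_DUPLICATES set, locking a GEM object that is already locked in the same drm_exec context is treated as success rather than failing with -EALREADY, which is what previously aborted the restore of processes that had imported the same BO more than once.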
Add all BOs from @@ -2889,8 +2891,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * gobj = &mem->bo->tbo.base; ret = drm_exec_prepare_obj(&exec, gobj, 1); drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) + if (unlikely(ret)) { + pr_err("drm_exec_prepare_obj failed, ret: %d\n", ret); goto ttm_reserve_fail; + } } } @@ -2950,8 +2954,10 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu * * validations above would invalidate DMABuf imports again. */ ret = process_validate_vms(process_info, &exec.ticket); - if (ret) + if (ret) { + pr_debug("Validating VMs failed, ret: %d\n", ret); goto validate_map_fail; + } /* Update mappings not managed by KFD */ list_for_each_entry(peer_vm, &process_info->vm_list_head, From 22207fd5c80177b860279653d017474b2812af5e Mon Sep 17 00:00:00 2001 From: Vitaly Prosyak Date: Wed, 6 Mar 2024 14:57:48 -0500 Subject: [PATCH 327/496] drm/amdgpu: fix use-after-free bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bug can be triggered by sending a single amdgpu_gem_userptr_ioctl to the AMDGPU DRM driver on any ASICs with an invalid address and size. The bug was reported by Joonkyo Jung . For example the following code: static void Syzkaller1(int fd) { struct drm_amdgpu_gem_userptr arg; int ret; arg.addr = 0xffffffffffff0000; arg.size = 0x80000000; /*2 Gb*/ arg.flags = 0x7; ret = drmIoctl(fd, 0xc1186451/*amdgpu_gem_userptr_ioctl*/, &arg); } Due to the address and size are not valid there is a failure in amdgpu_hmm_register->mmu_interval_notifier_insert->__mmu_interval_notifier_insert-> check_shl_overflow, but we even the amdgpu_hmm_register failure we still call amdgpu_hmm_unregister into amdgpu_gem_object_free which causes access to a bad address. The following stack is below when the issue is reproduced when Kazan is enabled: [ +0.000014] Hardware name: ASUS System Product Name/ROG STRIX B550-F GAMING (WI-FI), BIOS 1401 12/03/2020 [ +0.000009] RIP: 0010:mmu_interval_notifier_remove+0x327/0x340 [ +0.000017] Code: ff ff 49 89 44 24 08 48 b8 00 01 00 00 00 00 ad de 4c 89 f7 49 89 47 40 48 83 c0 22 49 89 47 48 e8 ce d1 2d 01 e9 32 ff ff ff <0f> 0b e9 16 ff ff ff 4c 89 ef e8 fa 14 b3 ff e9 36 ff ff ff e8 80 [ +0.000014] RSP: 0018:ffffc90002657988 EFLAGS: 00010246 [ +0.000013] RAX: 0000000000000000 RBX: 1ffff920004caf35 RCX: ffffffff8160565b [ +0.000011] RDX: dffffc0000000000 RSI: 0000000000000004 RDI: ffff8881a9f78260 [ +0.000010] RBP: ffffc90002657a70 R08: 0000000000000001 R09: fffff520004caf25 [ +0.000010] R10: 0000000000000003 R11: ffffffff8161d1d6 R12: ffff88810e988c00 [ +0.000010] R13: ffff888126fb5a00 R14: ffff88810e988c0c R15: ffff8881a9f78260 [ +0.000011] FS: 00007ff9ec848540(0000) GS:ffff8883cc880000(0000) knlGS:0000000000000000 [ +0.000012] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ +0.000010] CR2: 000055b3f7e14328 CR3: 00000001b5770000 CR4: 0000000000350ef0 [ +0.000010] Call Trace: [ +0.000006] [ +0.000007] ? show_regs+0x6a/0x80 [ +0.000018] ? __warn+0xa5/0x1b0 [ +0.000019] ? mmu_interval_notifier_remove+0x327/0x340 [ +0.000018] ? report_bug+0x24a/0x290 [ +0.000022] ? handle_bug+0x46/0x90 [ +0.000015] ? exc_invalid_op+0x19/0x50 [ +0.000016] ? asm_exc_invalid_op+0x1b/0x20 [ +0.000017] ? kasan_save_stack+0x26/0x50 [ +0.000017] ? mmu_interval_notifier_remove+0x23b/0x340 [ +0.000019] ? mmu_interval_notifier_remove+0x327/0x340 [ +0.000019] ? mmu_interval_notifier_remove+0x23b/0x340 [ +0.000020] ? 
__pfx_mmu_interval_notifier_remove+0x10/0x10 [ +0.000017] ? kasan_save_alloc_info+0x1e/0x30 [ +0.000018] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? __kasan_kmalloc+0xb1/0xc0 [ +0.000018] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? __kasan_check_read+0x11/0x20 [ +0.000020] amdgpu_hmm_unregister+0x34/0x50 [amdgpu] [ +0.004695] amdgpu_gem_object_free+0x66/0xa0 [amdgpu] [ +0.004534] ? __pfx_amdgpu_gem_object_free+0x10/0x10 [amdgpu] [ +0.004291] ? do_syscall_64+0x5f/0xe0 [ +0.000023] ? srso_return_thunk+0x5/0x5f [ +0.000017] drm_gem_object_free+0x3b/0x50 [drm] [ +0.000489] amdgpu_gem_userptr_ioctl+0x306/0x500 [amdgpu] [ +0.004295] ? __pfx_amdgpu_gem_userptr_ioctl+0x10/0x10 [amdgpu] [ +0.004270] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? __this_cpu_preempt_check+0x13/0x20 [ +0.000015] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? sysvec_apic_timer_interrupt+0x57/0xc0 [ +0.000020] ? srso_return_thunk+0x5/0x5f [ +0.000014] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 [ +0.000022] ? drm_ioctl_kernel+0x17b/0x1f0 [drm] [ +0.000496] ? __pfx_amdgpu_gem_userptr_ioctl+0x10/0x10 [amdgpu] [ +0.004272] ? drm_ioctl_kernel+0x190/0x1f0 [drm] [ +0.000492] drm_ioctl_kernel+0x140/0x1f0 [drm] [ +0.000497] ? __pfx_amdgpu_gem_userptr_ioctl+0x10/0x10 [amdgpu] [ +0.004297] ? __pfx_drm_ioctl_kernel+0x10/0x10 [drm] [ +0.000489] ? srso_return_thunk+0x5/0x5f [ +0.000011] ? __kasan_check_write+0x14/0x20 [ +0.000016] drm_ioctl+0x3da/0x730 [drm] [ +0.000475] ? __pfx_amdgpu_gem_userptr_ioctl+0x10/0x10 [amdgpu] [ +0.004293] ? __pfx_drm_ioctl+0x10/0x10 [drm] [ +0.000506] ? __pfx_rpm_resume+0x10/0x10 [ +0.000016] ? srso_return_thunk+0x5/0x5f [ +0.000011] ? __kasan_check_write+0x14/0x20 [ +0.000010] ? srso_return_thunk+0x5/0x5f [ +0.000011] ? _raw_spin_lock_irqsave+0x99/0x100 [ +0.000015] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ +0.000014] ? srso_return_thunk+0x5/0x5f [ +0.000013] ? srso_return_thunk+0x5/0x5f [ +0.000011] ? srso_return_thunk+0x5/0x5f [ +0.000011] ? preempt_count_sub+0x18/0xc0 [ +0.000013] ? srso_return_thunk+0x5/0x5f [ +0.000010] ? _raw_spin_unlock_irqrestore+0x27/0x50 [ +0.000019] amdgpu_drm_ioctl+0x7e/0xe0 [amdgpu] [ +0.004272] __x64_sys_ioctl+0xcd/0x110 [ +0.000020] do_syscall_64+0x5f/0xe0 [ +0.000021] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ +0.000015] RIP: 0033:0x7ff9ed31a94f [ +0.000012] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <41> 89 c0 3d 00 f0 ff ff 77 1f 48 8b 44 24 18 64 48 2b 04 25 28 00 [ +0.000013] RSP: 002b:00007fff25f66790 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ +0.000016] RAX: ffffffffffffffda RBX: 000055b3f7e133e0 RCX: 00007ff9ed31a94f [ +0.000012] RDX: 000055b3f7e133e0 RSI: 00000000c1186451 RDI: 0000000000000003 [ +0.000010] RBP: 00000000c1186451 R08: 0000000000000000 R09: 0000000000000000 [ +0.000009] R10: 0000000000000008 R11: 0000000000000246 R12: 00007fff25f66ca8 [ +0.000009] R13: 0000000000000003 R14: 000055b3f7021ba8 R15: 00007ff9ed7af040 [ +0.000024] [ +0.000007] ---[ end trace 0000000000000000 ]--- v2: Consolidate any error handling into amdgpu_hmm_register which applied to kfd_bo also. 
(Christian) v3: Improve syntax and comment (Christian) Cc: Christian Koenig Cc: Alex Deucher Cc: Felix Kuehling Cc: Joonkyo Jung Cc: Dokyung Song Cc: Cc: Signed-off-by: Vitaly Prosyak Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 55b65fc04b65..431ec72655ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -129,13 +129,25 @@ static const struct mmu_interval_notifier_ops amdgpu_hmm_hsa_ops = { */ int amdgpu_hmm_register(struct amdgpu_bo *bo, unsigned long addr) { + int r; + if (bo->kfd_bo) - return mmu_interval_notifier_insert(&bo->notifier, current->mm, + r = mmu_interval_notifier_insert(&bo->notifier, current->mm, addr, amdgpu_bo_size(bo), &amdgpu_hmm_hsa_ops); - return mmu_interval_notifier_insert(&bo->notifier, current->mm, addr, - amdgpu_bo_size(bo), - &amdgpu_hmm_gfx_ops); + else + r = mmu_interval_notifier_insert(&bo->notifier, current->mm, addr, + amdgpu_bo_size(bo), + &amdgpu_hmm_gfx_ops); + if (r) + /* + * Make sure amdgpu_hmm_unregister() doesn't call + * mmu_interval_notifier_remove() when the notifier isn't properly + * initialized. + */ + bo->notifier.mm = NULL; + + return r; } /** From 6540ff6482c1a5a6890ae44b23d0852ba1986d9e Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Wed, 6 Mar 2024 12:42:49 +0800 Subject: [PATCH 328/496] drm/amdgpu: fix mmhub client id out-of-bounds access Properly handle cid 0x140. Fixes: aba2be41470a ("drm/amdgpu: add mmhub 3.3.0 support") Signed-off-by: Lang Yu Reviewed-by: Yifan Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c index b3961968c10c..238ea40c2450 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_3.c @@ -99,16 +99,15 @@ mmhub_v3_3_print_l2_protection_fault_status(struct amdgpu_device *adev, switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { case IP_VERSION(3, 3, 0): case IP_VERSION(3, 3, 1): - mmhub_cid = mmhub_client_ids_v3_3[cid][rw]; + mmhub_cid = cid < ARRAY_SIZE(mmhub_client_ids_v3_3) ? + mmhub_client_ids_v3_3[cid][rw] : + cid == 0x140 ? "UMSCH" : NULL; break; default: mmhub_cid = NULL; break; } - if (!mmhub_cid && cid == 0x140) - mmhub_cid = "UMSCH"; - dev_err(adev->dev, "\t Faulty UTCL2 client ID: %s (0x%x)\n", mmhub_cid ? mmhub_cid : "unknown", cid); dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n", From c6ba60af015a0cc42bec5ca1cdc28a108958363a Mon Sep 17 00:00:00 2001 From: Friedrich Vock Date: Sun, 10 Mar 2024 06:40:40 +0100 Subject: [PATCH 329/496] drm/amdgpu: Reset IH OVERFLOW_EN bit for IH 7.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IH 7.0 support landed shortly after the original patch for resetting the bit on all other generations, but without that patch applied. 
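As background for the change below, a small self-contained sketch of the acknowledge/re-arm pattern being ported to IH 7.0; the register model and names here are stand-ins, not the real IH programming interface. The overflow-clear bit is written as 1 to acknowledge the current overflow and must then be written back to 0, otherwise the hardware cannot flag the next one.

    /* Illustrative stand-in register model only. */
    #define FAKE_WPTR_OVERFLOW_CLEAR 0x1u

    static unsigned int fake_rb_cntl;                     /* stand-in for IH_RB_CNTL */
    static void fake_write_rb_cntl(unsigned int v) { fake_rb_cntl = v; }

    static void fake_ack_wptr_overflow(void)
    {
            unsigned int tmp = fake_rb_cntl;

            tmp |= FAKE_WPTR_OVERFLOW_CLEAR;   /* 1: acknowledge the overflow */
            fake_write_rb_cntl(tmp);
            tmp &= ~FAKE_WPTR_OVERFLOW_CLEAR;  /* 0: re-arm overflow detection */
            fake_write_rb_cntl(tmp);
    }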
Fixes: 12443fc53e7d ("drm/amdgpu: Add ih v7_0 ip block support") Cc: Christian König Cc: Alex Deucher Signed-off-by: Friedrich Vock Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/ih_v7_0.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c index 16fe428c0722..7aed96fa10a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c @@ -418,6 +418,12 @@ static u32 ih_v7_0_get_wptr(struct amdgpu_device *adev, tmp = RREG32_NO_KIQ(ih_regs->ih_rb_cntl); tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 1); WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); + + /* Unset the CLEAR_OVERFLOW bit immediately so new overflows + * can be detected. + */ + tmp = REG_SET_FIELD(tmp, IH_RB_CNTL, WPTR_OVERFLOW_CLEAR, 0); + WREG32_NO_KIQ(ih_regs->ih_rb_cntl, tmp); out: return (wptr & ih->ptr_mask); } From 75eb8f7df65c5e6eb22a5aff8deb60ce0b65de1a Mon Sep 17 00:00:00 2001 From: Swapnil Patel Date: Tue, 13 Feb 2024 08:09:48 -0500 Subject: [PATCH 330/496] drm/amd/display: Change default size for dummy plane in DML2 [WHY & HOW] Currently, to map dc states into dml_display_cfg, we create a dummy plane if the stream doesn't have any planes attached to it. This dummy plane uses the max addressable width and height. This results in certain mode validations failing when they shouldn't. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Dhere Acked-by: Alex Hung Signed-off-by: Swapnil Patel Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/dml2/dml2_translation_helper.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index 1ba6933d2b36..17a58f41fc6a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -824,13 +824,25 @@ static struct scaler_data get_scaler_data_for_plane(const struct dc_plane_state static void populate_dummy_dml_plane_cfg(struct dml_plane_cfg_st *out, unsigned int location, const struct dc_stream_state *in) { + dml_uint_t width, height; + + if (in->timing.h_addressable > 3840) + width = 3840; + else + width = in->timing.h_addressable; // 4K max + + if (in->timing.v_addressable > 2160) + height = 2160; + else + height = in->timing.v_addressable; // 4K max + out->CursorBPP[location] = dml_cur_32bit; out->CursorWidth[location] = 256; out->GPUVMMinPageSizeKBytes[location] = 256; - out->ViewportWidth[location] = in->timing.h_addressable; - out->ViewportHeight[location] = in->timing.v_addressable; + out->ViewportWidth[location] = width; + out->ViewportHeight[location] = height; out->ViewportStationary[location] = false; out->ViewportWidthChroma[location] = 0; out->ViewportHeightChroma[location] = 0; @@ -849,7 +861,7 @@ static void populate_dummy_dml_plane_cfg(struct dml_plane_cfg_st *out, unsigned out->HTapsChroma[location] = 0; out->VTapsChroma[location] = 0; out->SourceScan[location] = dml_rotation_0; - out->ScalerRecoutWidth[location] = in->timing.h_addressable; + out->ScalerRecoutWidth[location] = width; out->LBBitPerPixel[location] = 57; From 6a7cbbc267c0cafa2b027983a40276deb673c066 Mon Sep 17 00:00:00 2001 From: Aurabindo Pillai Date: Sun, 10 Dec 2023 23:52:25 -0500 Subject: [PATCH 331/496] drm/amd/display: Enable DML2 debug flags [WHY & HOW] Enable
DML2 related debug config options in DM for testing purposes. Reviewed-by: Chaitanya Dhere Acked-by: Alex Hung Signed-off-by: Aurabindo Pillai Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index b2013f2b57c0..2851719d7121 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1767,6 +1767,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) if (amdgpu_dc_debug_mask & DC_FORCE_SUBVP_MCLK_SWITCH) adev->dm.dc->debug.force_subvp_mclk_switch = true; + if (amdgpu_dc_debug_mask & DC_ENABLE_DML2) + adev->dm.dc->debug.using_dml2 = true; + adev->dm.dc->debug.visual_confirm = amdgpu_dc_visual_confirm; /* TODO: Remove after DP2 receiver gets proper support of Cable ID feature */ From 6a7cbbc267c0cafa2b027983a40276deb673c066 Mon Sep 17 00:00:00 2001 From: Saleemkhan Jamadar Date: Wed, 6 Mar 2024 18:15:29 +0530 Subject: [PATCH 332/496] drm/amdgpu/vcn: enable vcn1 fw load for VCN 4_0_6 v1 - update the fw header for each vcn instance (Veera) VCN1 has different FW binary in VCN v4_0_6. Add changes to load the VCN1 fw binary Signed-off-by: Saleemkhan Jamadar Reviewed-by: Veerabadhran Gopalakrishnan Reviewed-by: Leo Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 38 ++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 2 +- drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c | 4 +-- drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c | 8 ++++-- drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 6 ++-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 10 +++---- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c | 4 +-- 10 files changed, 52 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index b2535023764f..9c514a606a2f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -60,6 +60,7 @@ #define FIRMWARE_VCN4_0_4 "amdgpu/vcn_4_0_4.bin" #define FIRMWARE_VCN4_0_5 "amdgpu/vcn_4_0_5.bin" #define FIRMWARE_VCN4_0_6 "amdgpu/vcn_4_0_6.bin" +#define FIRMWARE_VCN4_0_6_1 "amdgpu/vcn_4_0_6_1.bin" #define FIRMWARE_VCN5_0_0 "amdgpu/vcn_5_0_0.bin" MODULE_FIRMWARE(FIRMWARE_RAVEN); @@ -85,6 +86,7 @@ MODULE_FIRMWARE(FIRMWARE_VCN4_0_3); MODULE_FIRMWARE(FIRMWARE_VCN4_0_4); MODULE_FIRMWARE(FIRMWARE_VCN4_0_5); MODULE_FIRMWARE(FIRMWARE_VCN4_0_6); +MODULE_FIRMWARE(FIRMWARE_VCN4_0_6_1); MODULE_FIRMWARE(FIRMWARE_VCN5_0_0); static void amdgpu_vcn_idle_work_handler(struct work_struct *work); @@ -93,14 +95,22 @@ int amdgpu_vcn_early_init(struct amdgpu_device *adev) { char ucode_prefix[30]; char fw_name[40]; - int r; + int r, i; - amdgpu_ucode_ip_version_decode(adev, UVD_HWIP, ucode_prefix, sizeof(ucode_prefix)); - snprintf(fw_name, sizeof(fw_name), "amdgpu/%s.bin", ucode_prefix); - r = amdgpu_ucode_request(adev, &adev->vcn.fw, fw_name); - if (r) - amdgpu_ucode_release(&adev->vcn.fw); + for (i = 0; i < adev->vcn.num_vcn_inst; i++) { + amdgpu_ucode_ip_version_decode(adev, UVD_HWIP, ucode_prefix, sizeof(ucode_prefix)); + snprintf(fw_name, sizeof(fw_name), "amdgpu/%s.bin", ucode_prefix); + if (amdgpu_ip_version(adev, UVD_HWIP, 0) == IP_VERSION(4, 0, 6) && + i == 1) { + snprintf(fw_name, sizeof(fw_name), 
"amdgpu/%s_%d.bin", ucode_prefix, i); + } + r = amdgpu_ucode_request(adev, &adev->vcn.fw[i], fw_name); + if (r) { + amdgpu_ucode_release(&adev->vcn.fw[i]); + return r; + } + } return r; } @@ -141,7 +151,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) } } - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[0]->data; adev->vcn.fw_version = le32_to_cpu(hdr->ucode_version); /* Bit 20-23, it is encode major and non-zero for new naming convention. @@ -256,9 +266,10 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev) for (i = 0; i < adev->vcn.num_enc_rings; ++i) amdgpu_ring_fini(&adev->vcn.inst[j].ring_enc[i]); + + amdgpu_ucode_release(&adev->vcn.fw[j]); } - amdgpu_ucode_release(&adev->vcn.fw); mutex_destroy(&adev->vcn.vcn1_jpeg1_workaround); mutex_destroy(&adev->vcn.vcn_pg_lock); @@ -354,11 +365,12 @@ int amdgpu_vcn_resume(struct amdgpu_device *adev) const struct common_firmware_header *hdr; unsigned int offset; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[i]->data; if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) { offset = le32_to_cpu(hdr->ucode_array_offset_bytes); if (drm_dev_enter(adev_to_drm(adev), &idx)) { - memcpy_toio(adev->vcn.inst[i].cpu_addr, adev->vcn.fw->data + offset, + memcpy_toio(adev->vcn.inst[i].cpu_addr, + adev->vcn.fw[i]->data + offset, le32_to_cpu(hdr->ucode_size_bytes)); drm_dev_exit(idx); } @@ -1043,11 +1055,11 @@ void amdgpu_vcn_setup_ucode(struct amdgpu_device *adev) if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; - for (i = 0; i < adev->vcn.num_vcn_inst; i++) { if (adev->vcn.harvest_config & (1 << i)) continue; + + hdr = (const struct common_firmware_header *)adev->vcn.fw[i]->data; /* currently only support 2 FW instances */ if (i >= 2) { dev_info(adev->dev, "More then 2 VCN FW instances!\n"); @@ -1055,7 +1067,7 @@ void amdgpu_vcn_setup_ucode(struct amdgpu_device *adev) } idx = AMDGPU_UCODE_ID_VCN + i; adev->firmware.ucode[idx].ucode_id = idx; - adev->firmware.ucode[idx].fw = adev->vcn.fw; + adev->firmware.ucode[idx].fw = adev->vcn.fw[i]; adev->firmware.fw_size += ALIGN(le32_to_cpu(hdr->ucode_size_bytes), PAGE_SIZE); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index 1985f71b4373..a418393d89ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -306,7 +306,7 @@ struct amdgpu_vcn_ras { struct amdgpu_vcn { unsigned fw_version; struct delayed_work idle_work; - const struct firmware *fw; /* VCN firmware */ + const struct firmware *fw[AMDGPU_MAX_VCN_INSTANCES]; /* VCN firmware */ unsigned num_enc_rings; enum amd_powergating_state cur_state; bool indirect_sram; diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index 25ba27151ac0..aaceecd558cf 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -304,7 +304,7 @@ static int vcn_v1_0_resume(void *handle) */ static void vcn_v1_0_mc_resume_spg_mode(struct amdgpu_device *adev) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[0]->size + 4); uint32_t offset; /* cache window 0: fw */ @@ -371,7 +371,7 @@ static void vcn_v1_0_mc_resume_spg_mode(struct amdgpu_device *adev) static void 
vcn_v1_0_mc_resume_dpg_mode(struct amdgpu_device *adev) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[0]->size + 4); uint32_t offset; /* cache window 0: fw */ diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c index 18794394c5a0..e357d8cf0c01 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c @@ -330,7 +330,7 @@ static int vcn_v2_0_resume(void *handle) */ static void vcn_v2_0_mc_resume(struct amdgpu_device *adev) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[0]->size + 4); uint32_t offset; if (amdgpu_sriov_vf(adev)) @@ -386,7 +386,7 @@ static void vcn_v2_0_mc_resume(struct amdgpu_device *adev) static void vcn_v2_0_mc_resume_dpg_mode(struct amdgpu_device *adev, bool indirect) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[0]->size + 4); uint32_t offset; /* cache window 0: fw */ @@ -1878,7 +1878,7 @@ static int vcn_v2_0_start_sriov(struct amdgpu_device *adev) init_table += header->vcn_table_offset; - size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[0]->size + 4); MMSCH_V2_0_INSERT_DIRECT_RD_MOD_WT( SOC15_REG_OFFSET(UVD, i, mmUVD_STATUS), diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index aba403d71806..1cd8a94b0fbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -414,13 +414,15 @@ static int vcn_v2_5_resume(void *handle) */ static void vcn_v2_5_mc_resume(struct amdgpu_device *adev) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size; uint32_t offset; int i; for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { if (adev->vcn.harvest_config & (1 << i)) continue; + + size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[i]->size + 4); /* cache window 0: fw */ if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { WREG32_SOC15(VCN, i, mmUVD_LMI_VCPU_CACHE_64BIT_BAR_LOW, @@ -469,7 +471,7 @@ static void vcn_v2_5_mc_resume(struct amdgpu_device *adev) static void vcn_v2_5_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_idx, bool indirect) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[inst_idx]->size + 4); uint32_t offset; /* cache window 0: fw */ @@ -1240,7 +1242,7 @@ static int vcn_v2_5_sriov_start(struct amdgpu_device *adev) SOC15_REG_OFFSET(VCN, i, mmUVD_STATUS), ~UVD_STATUS__UVD_BUSY, UVD_STATUS__UVD_BUSY); - size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[i]->size + 4); /* mc resume*/ if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { MMSCH_V1_0_INSERT_DIRECT_WT( diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index e02af4de521c..8f82fb887e9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -449,7 +449,7 @@ static int vcn_v3_0_resume(void *handle) */ static void vcn_v3_0_mc_resume(struct amdgpu_device *adev, int inst) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[inst]->size + 4); uint32_t offset; /* cache window 0: fw */ @@ -499,7 +499,7 @@ static void vcn_v3_0_mc_resume(struct amdgpu_device *adev, int inst) static void 
vcn_v3_0_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_idx, bool indirect) { - uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + uint32_t size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[inst_idx]->size + 4); uint32_t offset; /* cache window 0: fw */ @@ -1332,7 +1332,7 @@ static int vcn_v3_0_start_sriov(struct amdgpu_device *adev) mmUVD_STATUS), ~UVD_STATUS__UVD_BUSY, UVD_STATUS__UVD_BUSY); - cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[i]->size + 4); if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { MMSCH_V3_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i, diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 8ab01ae919d2..832d15f7b5f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -382,7 +382,7 @@ static void vcn_v4_0_mc_resume(struct amdgpu_device *adev, int inst) uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -442,7 +442,7 @@ static void vcn_v4_0_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_idx { uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst_idx]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -1289,7 +1289,7 @@ static int vcn_v4_0_start_sriov(struct amdgpu_device *adev) regUVD_STATUS), ~UVD_STATUS__UVD_BUSY, UVD_STATUS__UVD_BUSY); - cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[i]->size + 4); if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, i, diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 810bbfccd6f2..203fa988322b 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -332,7 +332,7 @@ static void vcn_v4_0_3_mc_resume(struct amdgpu_device *adev, int inst_idx) uint32_t offset, size, vcn_inst; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst_idx]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); vcn_inst = GET_INST(VCN, inst_idx); @@ -407,7 +407,7 @@ static void vcn_v4_0_3_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_i uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst_idx]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -894,7 +894,7 @@ static int vcn_v4_0_3_start_sriov(struct amdgpu_device *adev) MMSCH_V4_0_INSERT_DIRECT_RD_MOD_WT(SOC15_REG_OFFSET(VCN, 0, regUVD_STATUS), ~UVD_STATUS__UVD_BUSY, UVD_STATUS__UVD_BUSY); - cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw->size + 4); + cache_size = AMDGPU_GPU_PAGE_ALIGN(adev->vcn.fw[i]->size + 4); if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { MMSCH_V4_0_INSERT_DIRECT_WT(SOC15_REG_OFFSET(VCN, 0, diff --git 
a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index 0468955338b7..501e53e69f2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -45,7 +45,7 @@ #define mmUVD_DPG_LMA_DATA_BASE_IDX regUVD_DPG_LMA_DATA_BASE_IDX #define VCN_VID_SOC_ADDRESS_2_0 0x1fb00 -#define VCN1_VID_SOC_ADDRESS_3_0 0x48300 +#define VCN1_VID_SOC_ADDRESS_3_0 (0x48300 + 0x38000) #define VCN_HARVEST_MMSCH 0 @@ -329,7 +329,7 @@ static void vcn_v4_0_5_mc_resume(struct amdgpu_device *adev, int inst) uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -390,7 +390,7 @@ static void vcn_v4_0_5_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_i uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst_idx]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -486,7 +486,8 @@ static void vcn_v4_0_5_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_i /* VCN global tiling registers */ WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET( - VCN, 0, regUVD_GFX10_ADDR_CONFIG), adev->gfx.config.gb_addr_config, 0, indirect); + VCN, inst_idx, regUVD_GFX10_ADDR_CONFIG), + adev->gfx.config.gb_addr_config, 0, indirect); } /** @@ -911,7 +912,6 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_device *adev, int inst_idx, b VCN, inst_idx, regUVD_MASTINT_EN), UVD_MASTINT_EN__VCPU_EN_MASK, 0, indirect); - if (indirect) amdgpu_vcn_psp_update_sram(adev, inst_idx, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index d6ee9958ba5f..bc60c554eb32 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -290,7 +290,7 @@ static void vcn_v5_0_0_mc_resume(struct amdgpu_device *adev, int inst) uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ @@ -351,7 +351,7 @@ static void vcn_v5_0_0_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_i uint32_t offset, size; const struct common_firmware_header *hdr; - hdr = (const struct common_firmware_header *)adev->vcn.fw->data; + hdr = (const struct common_firmware_header *)adev->vcn.fw[inst_idx]->data; size = AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); /* cache window 0: fw */ From 6c6064cbe58b43533e3451ad6a8ba9736c109ac3 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 11 Mar 2024 18:07:34 -0400 Subject: [PATCH 333/496] drm/amdgpu: amdgpu_ttm_gart_bind set gtt bound flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise, after the GTT BO is released, the GTT and GART space is freed but amdgpu_ttm_backend_unbind will not clear the GART page table entry, leaving a valid mapping entry pointing to the stale system page. Then if the GPU mistakenly accesses the GART address, it will read an undefined value instead of triggering a page fault, which makes the real issue harder to debug and reproduce.
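To make the ordering problem above concrete, a simplified stand-in model of the bind/unbind pairing (not the real TTM structures): the unbind path only tears down the GART mapping when the bound flag is set, so a bind path that fills the GART but never sets the flag leaves a stale mapping behind.

    /* Illustrative stand-in model only. */
    struct fake_gtt {
            int bound;              /* must be set by every path that fills the GART */
            int gart_mapped;
    };

    static void fake_gart_bind(struct fake_gtt *gtt)
    {
            gtt->gart_mapped = 1;
            gtt->bound = 1;         /* the assignment this patch adds */
    }

    static void fake_backend_unbind(struct fake_gtt *gtt)
    {
            if (!gtt->bound)        /* skipped if bind forgot the flag: stale mapping */
                    return;
            gtt->gart_mapped = 0;
            gtt->bound = 0;
    }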
Cc: stable@vger.kernel.org Signed-off-by: Philip Yang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 8722beba494e..fc418e670fda 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -864,6 +864,7 @@ static void amdgpu_ttm_gart_bind(struct amdgpu_device *adev, amdgpu_gart_bind(adev, gtt->offset, ttm->num_pages, gtt->ttm.dma_address, flags); } + gtt->bound = true; } /* From f679fd6057fbf5ab34aaee28d58b7f81af0cbf48 Mon Sep 17 00:00:00 2001 From: Ahmad Rehman Date: Mon, 4 Mar 2024 15:56:00 -0600 Subject: [PATCH 334/496] drm/amdgpu: Init zone device and drm client after mode-1 reset on reload In a passthrough environment, when amdgpu is reloaded after unload, a mode-1 reset is triggered after initializing the necessary IPs. That init does not include KFD, and KFD init waits until the reset is completed. KFD init is called in the reset handler, but in this case the zone device and drm client are not initialized, causing the application to trigger a kernel panic. v2: Remove the KFD init condition from amdgpu_amdkfd_drm_client_create, as the previous version has the potential of creating the DRM client twice. v3: The v2 patch results in an SDMA engine hang, as DRM open causes a VM clear on SDMA before SDMA init. Add the condition to the drm client creation, on top of v1, to guard against the drm client being created multiple times. Signed-off-by: Ahmad Rehman Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index f5f2945711be..35dd6effa9a3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -146,7 +146,7 @@ int amdgpu_amdkfd_drm_client_create(struct amdgpu_device *adev) { int ret; - if (!adev->kfd.init_complete) + if (!adev->kfd.init_complete || adev->kfd.client.dev) return 0; ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 15b188aaf681..80b9642f2bc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2479,8 +2479,11 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work) } for (i = 0; i < mgpu_info.num_dgpu; i++) { adev = mgpu_info.gpu_ins[i].adev; - if (!adev->kfd.init_complete) + if (!adev->kfd.init_complete) { + kgd2kfd_init_zone_device(adev); amdgpu_amdkfd_device_init(adev); + amdgpu_amdkfd_drm_client_create(adev); + } amdgpu_ttm_set_buffer_funcs_status(adev, true); } } From 56b30ac84c517eefcfd5384339fee5d8a675f811 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Wed, 6 Mar 2024 17:59:29 +0800 Subject: [PATCH 335/496] drm/amdgpu: Skip access PF-only registers on gfx10/gfxhub2_1 under SRIOV [Why] The RLCG interface returns an "out-of-range" error under an SRIOV VF when accessing PF-only registers. [How] Skip accessing PF-only registers on gfx10/gfxhub2_1 under SRIOV.
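For reference, a tiny self-contained sketch of the guard pattern applied in the diff below; the predicate and the register write are stand-ins, not the real amdgpu helpers. Programming of PF-only registers is simply skipped when running as an SR-IOV VF, since the host (PF) already owns them.

    /* Illustrative stand-ins only. */
    static int fake_is_sriov_vf;

    static void fake_program_pf_only_reg(unsigned int val)
    {
            if (fake_is_sriov_vf)
                    return;         /* PF-only register: the VF must not touch it */
            /* ... the real code would program the register here ... */
            (void)val;
    }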
Acked-by: Alex Deucher Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 8 ++++++-- drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 904b9ff5ead2..f90905ef32c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -3657,6 +3657,9 @@ static void gfx_v10_0_init_spm_golden_registers(struct amdgpu_device *adev) static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev) { + if (amdgpu_sriov_vf(adev)) + return; + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(10, 1, 10): soc15_program_register_sequence(adev, @@ -4982,7 +4985,8 @@ static void gfx_v10_0_constants_init(struct amdgpu_device *adev) u32 tmp; int i; - WREG32_FIELD15(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff); + if (!amdgpu_sriov_vf(adev)) + WREG32_FIELD15(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff); gfx_v10_0_setup_rb(adev); gfx_v10_0_get_cu_info(adev, &adev->gfx.cu_info); @@ -7163,7 +7167,7 @@ static int gfx_v10_0_hw_init(void *handle) if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(10, 3, 0)) gfx_v10_3_program_pbb_mode(adev); - if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) + if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0) && !amdgpu_sriov_vf(adev)) gfx_v10_3_set_power_brake_sequence(adev); return r; diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c index cd0e8a321e46..17509f32f61a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c @@ -155,6 +155,9 @@ static void gfxhub_v2_1_init_system_aperture_regs(struct amdgpu_device *adev) { uint64_t value; + if (amdgpu_sriov_vf(adev)) + return; + /* Program the AGP BAR */ WREG32_SOC15(GC, 0, mmGCMC_VM_AGP_BASE, 0); WREG32_SOC15(GC, 0, mmGCMC_VM_AGP_BOT, adev->gmc.agp_start >> 24); From 08ae9ef829b8055c2fdc8cfee37510c1f4721a07 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 11 Mar 2024 14:38:34 +0800 Subject: [PATCH 336/496] drm/amdgpu/pm: Fix NULL pointer dereference when get power limit Because powerplay_table initialization is skipped under SRIOV, check and set default lower and upper OD values if powerplay_table is NULL.
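A compact self-contained sketch of the defaulting logic described above (simplified stand-in types; the real overdrive tables are larger): the OD percentages start at 0 and are only read from the overdrive table when the powerplay table actually exists.

    /* Illustrative stand-in types only. */
    struct fake_od_table { unsigned int max_pct, min_pct; };
    struct fake_pp_table { struct fake_od_table od; };

    static void fake_get_od_percent(const struct fake_pp_table *pp, int od_enabled,
                                    unsigned int *upper, unsigned int *lower)
    {
            *upper = 0;             /* safe defaults for the SRIOV case */
            *lower = 0;
            if (!pp)                /* powerplay table not initialized under SRIOV */
                    return;
            if (od_enabled)
                    *upper = pp->od.max_pct;
            *lower = pp->od.min_pct;
    }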
Fixes: 7968e9748fbb ("drm/amdgpu/pm: Fix the power1_min_cap value") Signed-off-by: Ma Jun Reported-by: Yin Zhenguo Suggested-by: Lazar Lijo Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 14 ++++++++------ drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 16 +++++++++------- .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 14 ++++++++------ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 14 ++++++++------ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 14 ++++++++------ 5 files changed, 41 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 1d96eb274d72..a406372e79d8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1286,7 +1286,7 @@ static int arcturus_get_power_limit(struct smu_context *smu, struct smu_11_0_powerplay_table *powerplay_table = (struct smu_11_0_powerplay_table *)smu->smu_table.power_play_table; PPTable_t *pptable = smu->smu_table.driver_pptable; - uint32_t power_limit, od_percent_upper, od_percent_lower; + uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; if (smu_v11_0_get_current_power_limit(smu, &power_limit)) { /* the last hope to figure out the ppt limit */ @@ -1303,12 +1303,14 @@ static int arcturus_get_power_limit(struct smu_context *smu, if (default_power_limit) *default_power_limit = power_limit; - if (smu->od_enabled) - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - else - od_percent_upper = 0; + if (powerplay_table) { + if (smu->od_enabled) + od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + else + od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", od_percent_upper, od_percent_lower, power_limit); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index ed189a3878eb..65bba5fc2335 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -2339,7 +2339,7 @@ static int navi10_get_power_limit(struct smu_context *smu, (struct smu_11_0_powerplay_table *)smu->smu_table.power_play_table; struct smu_11_0_overdrive_table *od_settings = smu->od_settings; PPTable_t *pptable = smu->smu_table.driver_pptable; - uint32_t power_limit, od_percent_upper, od_percent_lower; + uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; if (smu_v11_0_get_current_power_limit(smu, &power_limit)) { /* the last hope to figure out the ppt limit */ @@ -2356,13 +2356,15 @@ static int navi10_get_power_limit(struct smu_context *smu, if (default_power_limit) *default_power_limit = power_limit; - if (smu->od_enabled && - navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_POWER_LIMIT)) - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - else - od_percent_upper = 0; + if (powerplay_table) { + if (smu->od_enabled && + navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_POWER_LIMIT)) + od_percent_upper = 
le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + else + od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", od_percent_upper, od_percent_lower, power_limit); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index e2ad2b972ab0..395718b48131 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -625,7 +625,7 @@ static int sienna_cichlid_get_power_limit(struct smu_context *smu, { struct smu_11_0_7_powerplay_table *powerplay_table = (struct smu_11_0_7_powerplay_table *)smu->smu_table.power_play_table; - uint32_t power_limit, od_percent_upper, od_percent_lower; + uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; uint16_t *table_member; GET_PPTABLE_MEMBER(SocketPowerLimitAc, &table_member); @@ -640,12 +640,14 @@ static int sienna_cichlid_get_power_limit(struct smu_context *smu, if (default_power_limit) *default_power_limit = power_limit; - if (smu->od_enabled) - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); - else - od_percent_upper = 0; + if (powerplay_table) { + if (smu->od_enabled) + od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + else + od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", od_percent_upper, od_percent_lower, power_limit); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 9b80f18ea6c3..7873f024d429 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2356,7 +2356,7 @@ static int smu_v13_0_0_get_power_limit(struct smu_context *smu, (struct smu_13_0_0_powerplay_table *)table_context->power_play_table; PPTable_t *pptable = table_context->driver_pptable; SkuTable_t *skutable = &pptable->SkuTable; - uint32_t power_limit, od_percent_upper, od_percent_lower; + uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; uint32_t msg_limit = skutable->MsgLimits.Power[PPT_THROTTLER_PPT0][POWER_SOURCE_AC]; if (smu_v13_0_get_current_power_limit(smu, &power_limit)) @@ -2369,12 +2369,14 @@ static int smu_v13_0_0_get_power_limit(struct smu_context *smu, if (default_power_limit) *default_power_limit = power_limit; - if (smu->od_enabled) - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); - else - od_percent_upper = 0; + if (powerplay_table) { + if (smu->od_enabled) + od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + else + od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = 
le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", od_percent_upper, od_percent_lower, power_limit); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 3dc7b60cb075..8abf0a772e6b 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2320,7 +2320,7 @@ static int smu_v13_0_7_get_power_limit(struct smu_context *smu, (struct smu_13_0_7_powerplay_table *)table_context->power_play_table; PPTable_t *pptable = table_context->driver_pptable; SkuTable_t *skutable = &pptable->SkuTable; - uint32_t power_limit, od_percent_upper, od_percent_lower; + uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; uint32_t msg_limit = skutable->MsgLimits.Power[PPT_THROTTLER_PPT0][POWER_SOURCE_AC]; if (smu_v13_0_get_current_power_limit(smu, &power_limit)) @@ -2333,12 +2333,14 @@ static int smu_v13_0_7_get_power_limit(struct smu_context *smu, if (default_power_limit) *default_power_limit = power_limit; - if (smu->od_enabled) - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); - else + if (powerplay_table) { + if (smu->od_enabled) + od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + else + od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", od_percent_upper, od_percent_lower, power_limit); From e17718251addb31e1771fd28735ec410e6ca650a Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 11 Mar 2024 15:23:34 +0800 Subject: [PATCH 337/496] drm/amdgpu/pm: Check the validity of overdrive power limit Check the validity of the overdrive power limit before using it.
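A minimal self-contained sketch of the additional check this patch layers on top of the previous one (stand-in names; the real code reads the per-ASIC overdrive capability tables): the OD percentages are only taken from the overdrive table when the power-limit OD capability is actually reported.

    /* Illustrative stand-ins only. */
    struct fake_od_limits { unsigned int max_pct, min_pct; };

    static void fake_get_od_percent_checked(const struct fake_od_limits *od,
                                            int od_enabled, int power_limit_cap,
                                            unsigned int *upper, unsigned int *lower)
    {
            *upper = 0;
            *lower = 0;
            if (!od || !power_limit_cap)    /* OD power limit unsupported: keep 0/0 */
                    return;
            if (od_enabled)
                    *upper = od->max_pct;
            *lower = od->min_pct;
    }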
Fixes: 7968e9748fbb ("drm/amdgpu/pm: Fix the power1_min_cap value") Signed-off-by: Ma Jun Suggested-by: Lazar Lijo Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 11 +++++---- .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 9 ++++---- .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 23 +++++++++++-------- .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 10 ++++---- .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 10 ++++---- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index a406372e79d8..40ba7227cca5 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1285,6 +1285,7 @@ static int arcturus_get_power_limit(struct smu_context *smu, { struct smu_11_0_powerplay_table *powerplay_table = (struct smu_11_0_powerplay_table *)smu->smu_table.power_play_table; + struct smu_11_0_overdrive_table *od_settings = smu->od_settings; PPTable_t *pptable = smu->smu_table.driver_pptable; uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; @@ -1304,12 +1305,14 @@ static int arcturus_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (powerplay_table) { - if (smu->od_enabled) + if (smu->od_enabled && + od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) { od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - else + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } else if (od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) { od_percent_upper = 0; - - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index 65bba5fc2335..836b1df79928 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -2358,12 +2358,13 @@ static int navi10_get_power_limit(struct smu_context *smu, if (powerplay_table) { if (smu->od_enabled && - navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_POWER_LIMIT)) + navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_POWER_LIMIT)) { od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - else + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } else if (navi10_od_feature_is_supported(od_settings, SMU_11_0_ODCAP_POWER_LIMIT)) { od_percent_upper = 0; - - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); + } } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 395718b48131..1f18b61884f3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -617,6 +617,12 @@ static uint32_t sienna_cichlid_get_throttler_status_locked(struct smu_context *s return throttler_status; } +static bool sienna_cichlid_is_od_feature_supported(struct smu_11_0_7_overdrive_table *od_table, + enum SMU_11_0_7_ODFEATURE_CAP cap) +{ + return od_table->cap[cap]; +} + static int sienna_cichlid_get_power_limit(struct smu_context *smu, uint32_t *current_power_limit, uint32_t *default_power_limit, @@ -625,6 +631,7 @@ static int sienna_cichlid_get_power_limit(struct smu_context *smu, { struct smu_11_0_7_powerplay_table *powerplay_table = (struct smu_11_0_7_powerplay_table *)smu->smu_table.power_play_table; + struct smu_11_0_7_overdrive_table *od_settings = smu->od_settings; uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; uint16_t *table_member; @@ -641,12 +648,14 @@ static int sienna_cichlid_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (powerplay_table) { - if (smu->od_enabled) + if (smu->od_enabled && + sienna_cichlid_is_od_feature_supported(od_settings, SMU_11_0_7_ODCAP_POWER_LIMIT)) { od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); - else + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + } else if ((sienna_cichlid_is_od_feature_supported(od_settings, SMU_11_0_7_ODCAP_POWER_LIMIT))) { od_percent_upper = 0; - - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_7_ODSETTING_POWERPERCENTAGE]); + } } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", @@ -1252,12 +1261,6 @@ static bool sienna_cichlid_is_support_fine_grained_dpm(struct smu_context *smu, return dpm_desc->SnapToDiscrete == 0; } -static bool sienna_cichlid_is_od_feature_supported(struct smu_11_0_7_overdrive_table *od_table, - enum SMU_11_0_7_ODFEATURE_CAP cap) -{ - return od_table->cap[cap]; -} - static void sienna_cichlid_get_od_setting_range(struct smu_11_0_7_overdrive_table *od_table, enum SMU_11_0_7_ODSETTING_ID setting, uint32_t *min, uint32_t *max) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 7873f024d429..9c03296f92cd 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2370,12 +2370,14 @@ static int smu_v13_0_0_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (powerplay_table) { - if (smu->od_enabled) + if (smu->od_enabled && + smu_v13_0_0_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT)) { od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); - else + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + } else if (smu_v13_0_0_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT)) { od_percent_upper = 0; - - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_0_ODSETTING_POWERPERCENTAGE]); + } } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 8abf0a772e6b..7318964f1f14 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2334,12 +2334,14 @@ static int smu_v13_0_7_get_power_limit(struct smu_context *smu, *default_power_limit = power_limit; if (powerplay_table) { - if (smu->od_enabled) + if (smu->od_enabled && + (smu_v13_0_7_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT))) { od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); - else + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + } else if (smu_v13_0_7_is_od_feature_supported(smu, PP_OD_FEATURE_PPT_BIT)) { od_percent_upper = 0; - - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_13_0_7_ODSETTING_POWERPERCENTAGE]); + } } dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", From 43bda3e782fb54dd13e0b9f2c0f77940b84a0a0b Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Wed, 13 Mar 2024 16:11:26 +0800 Subject: [PATCH 338/496] drm/amdgpu: correct the KGQ fallback message Fix the KGQ fallback function name, as this will help differentiate the failure in the KCQ enablement. Signed-off-by: Prike Liang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index f8b48fd93108..55d5508987ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -687,7 +687,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id) r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); if (r) - DRM_ERROR("KCQ enable failed\n"); + DRM_ERROR("KGQ enable failed\n"); return r; } From 9b3fec307f50ae62bd20281c277e9510c631000b Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 15 Mar 2024 15:10:05 +0800 Subject: [PATCH 339/496] drm/amdgpu: Bypass display ta if display hw is not available Do not load/invoke display TA if display hardware is not available. 
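For reference, a simplified self-contained sketch of the bypass pattern this change applies to each display-related TA (HDCP, DTM, secure display); the names are stand-ins, not the real PSP interfaces. Initialization returns early when no display hardware is present, and invoke becomes a no-op when the context was never initialized.

    /* Illustrative stand-ins only. */
    struct fake_ta_ctx { int initialized; };

    static int fake_ta_init(struct fake_ta_ctx *ctx, int has_display_hw)
    {
            if (!has_display_hw)    /* DMU harvested: skip the display TA */
                    return 0;
            ctx->initialized = 1;
            return 0;
    }

    static int fake_ta_invoke(struct fake_ta_ctx *ctx)
    {
            if (!ctx->initialized)  /* never loaded, nothing to invoke */
                    return 0;
            /* ... the real code would submit the TA command here ... */
            return 0;
    }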
Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 3c2b1413058b..94b310fdb719 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1830,6 +1830,10 @@ static int psp_hdcp_initialize(struct psp_context *psp) if (amdgpu_sriov_vf(psp->adev)) return 0; + /* bypass hdcp initialization if dmu is harvested */ + if (!amdgpu_device_has_display_hardware(psp->adev)) + return 0; + if (!psp->hdcp_context.context.bin_desc.size_bytes || !psp->hdcp_context.context.bin_desc.start_addr) { dev_info(psp->adev->dev, "HDCP: optional hdcp ta ucode is not available\n"); @@ -1862,6 +1866,9 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id) if (amdgpu_sriov_vf(psp->adev)) return 0; + if (!psp->hdcp_context.context.initialized) + return 0; + return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context); } @@ -1897,6 +1904,10 @@ static int psp_dtm_initialize(struct psp_context *psp) if (amdgpu_sriov_vf(psp->adev)) return 0; + /* bypass dtm initialization if dmu is harvested */ + if (!amdgpu_device_has_display_hardware(psp->adev)) + return 0; + if (!psp->dtm_context.context.bin_desc.size_bytes || !psp->dtm_context.context.bin_desc.start_addr) { dev_info(psp->adev->dev, "DTM: optional dtm ta ucode is not available\n"); @@ -1929,6 +1940,9 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id) if (amdgpu_sriov_vf(psp->adev)) return 0; + if (!psp->dtm_context.context.initialized) + return 0; + return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context); } @@ -2063,6 +2077,10 @@ static int psp_securedisplay_initialize(struct psp_context *psp) if (amdgpu_sriov_vf(psp->adev)) return 0; + /* bypass securedisplay initialization if dmu is harvested */ + if (!amdgpu_device_has_display_hardware(psp->adev)) + return 0; + if (!psp->securedisplay_context.context.bin_desc.size_bytes || !psp->securedisplay_context.context.bin_desc.start_addr) { dev_info(psp->adev->dev, "SECUREDISPLAY: securedisplay ta ucode is not available\n"); From 26fbcb3da77efc77bd7327b7916338d773cca484 Mon Sep 17 00:00:00 2001 From: Sohaib Nadeem Date: Wed, 14 Feb 2024 13:51:16 -0500 Subject: [PATCH 340/496] drm/amd/display: Override min required DCFCLK in dml1_validate [WHY]: Increasing min DCFCLK addresses underflow issues that occur when phantom pipe is turned on for some Sub-Viewport configs [HOW]: dcn32_override_min_req_dcfclk is added to override DCFCLK value in dml1_validate when subviewport is being used. 
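A small self-contained sketch of the override described above; the 400 MHz floor matches the MIN_SUBVP_DCFCLK_KHZ value introduced in the diff below, everything else is a stand-in. When sub-viewport is in use, the computed DCFCLK is raised to the minimum if it came out lower.

    /* Illustrative stand-in only. */
    #define FAKE_MIN_SUBVP_DCFCLK_KHZ 400000

    static int fake_override_min_req_dcfclk(int subvp_in_use, int dcfclk_khz)
    {
            if (subvp_in_use && dcfclk_khz <= FAKE_MIN_SUBVP_DCFCLK_KHZ)
                    return FAKE_MIN_SUBVP_DCFCLK_KHZ;  /* avoid underflow with phantom pipes */
            return dcfclk_khz;
    }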
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Alex Hung Signed-off-by: Sohaib Nadeem Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c | 6 ++++++ .../gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c | 1 + .../gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h | 3 +++ 3 files changed, 10 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c index 87760600e154..f98def6c8c2d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c +++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_resource_helpers.c @@ -782,3 +782,9 @@ void dcn32_update_dml_pipes_odm_policy_based_on_context(struct dc *dc, struct dc pipe_cnt++; } } + +void dcn32_override_min_req_dcfclk(struct dc *dc, struct dc_state *context) +{ + if (dcn32_subvp_in_use(dc, context) && context->bw_ctx.bw.dcn.clk.dcfclk_khz <= MIN_SUBVP_DCFCLK_KHZ) + context->bw_ctx.bw.dcn.clk.dcfclk_khz = MIN_SUBVP_DCFCLK_KHZ; +} diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index 3f3951f3ba98..f844f57ecc49 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -1771,6 +1771,7 @@ static bool dml1_validate(struct dc *dc, struct dc_state *context, bool fast_val dc->res_pool->funcs->calculate_wm_and_dlg(dc, context, pipes, pipe_cnt, vlevel); dcn32_override_min_req_memclk(dc, context); + dcn32_override_min_req_dcfclk(dc, context); BW_VAL_TRACE_END_WATERMARKS(); diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h index 0c87b0fabba7..2258c5c7212d 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.h @@ -42,6 +42,7 @@ #define SUBVP_ACTIVE_MARGIN_LIST_LEN 2 #define DCN3_2_MAX_SUBVP_PIXEL_RATE_MHZ 1800 #define DCN3_2_VMIN_DISPCLK_HZ 717000000 +#define MIN_SUBVP_DCFCLK_KHZ 400000 #define TO_DCN32_RES_POOL(pool)\ container_of(pool, struct dcn32_resource_pool, base) @@ -181,6 +182,8 @@ bool dcn32_subvp_vblank_admissable(struct dc *dc, struct dc_state *context, int void dcn32_update_dml_pipes_odm_policy_based_on_context(struct dc *dc, struct dc_state *context, display_e2e_pipe_params_st *pipes); +void dcn32_override_min_req_dcfclk(struct dc *dc, struct dc_state *context); + /* definitions for run time init of reg offsets */ /* CLK SRC */ From 7fb19d9510937121a1f285894cffd30bc96572e3 Mon Sep 17 00:00:00 2001 From: Josip Pavic Date: Fri, 9 Feb 2024 16:05:18 -0500 Subject: [PATCH 341/496] drm/amd/display: Allow dirty rects to be sent to dmub when abm is active [WHY] It's beneficial for ABM to know when new frame data are available. [HOW] Add new condition to allow dirty rects to be sent to DMUB when ABM is active. ABM will use this as a signal that a new frame has arrived. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Anthony Koo Acked-by: Alex Hung Signed-off-by: Josip Pavic Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 5211c1c0f3c0..613d09c42f3b 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -3270,6 +3270,9 @@ static bool dc_dmub_should_send_dirty_rect_cmd(struct dc *dc, struct dc_stream_s if (stream->link->replay_settings.config.replay_supported) return true; + if (stream->ctx->dce_version >= DCN_VERSION_3_5 && stream->abm_level) + return true; + return false; } From 04a59c54757567f19dff4571ff7338476ec0f604 Mon Sep 17 00:00:00 2001 From: Ryan Lin Date: Wed, 21 Feb 2024 19:10:27 +0800 Subject: [PATCH 342/496] drm/amd/display: Add monitor patch for specific eDP [WHY] Some eDP panels' ext caps don't write initial values. The value of dpcd_addr (0x317) can be random and the backlight control interface will be incorrect. [HOW] Add new panel patches to remove sink ext caps. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org # 6.5.x Cc: Tsung-hua Lin Cc: Chris Chi Reviewed-by: Wayne Lin Acked-by: Alex Hung Signed-off-by: Ryan Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index 85b7f58a7f35..c27063305a13 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -67,6 +67,8 @@ static void apply_edid_quirks(struct edid *edid, struct dc_edid_caps *edid_caps) /* Workaround for some monitors that do not clear DPCD 0x317 if FreeSync is unsupported */ case drm_edid_encode_panel_id('A', 'U', 'O', 0xA7AB): case drm_edid_encode_panel_id('A', 'U', 'O', 0xE69B): + case drm_edid_encode_panel_id('B', 'O', 'E', 0x092A): + case drm_edid_encode_panel_id('L', 'G', 'D', 0x06D1): DRM_DEBUG_DRIVER("Clearing DPCD 0x317 on monitor with panel id %X\n", panel_id); edid_caps->panel_patch.remove_sink_ext_caps = true; break; @@ -120,6 +122,8 @@ enum dc_edid_status dm_helpers_parse_edid_caps( edid_caps->edid_hdmi = connector->display_info.is_hdmi; + apply_edid_quirks(edid_buf, edid_caps); + sad_count = drm_edid_to_sad((struct edid *) edid->raw_edid, &sads); if (sad_count <= 0) return result; @@ -146,8 +150,6 @@ enum dc_edid_status dm_helpers_parse_edid_caps( else edid_caps->speaker_flags = DEFAULT_SPEAKER_LOCATION; - apply_edid_quirks(edid_buf, edid_caps); - kfree(sads); kfree(sadb); From 4f5b8d78ca43fcc695ba16c83ebfabbfe09506d6 Mon Sep 17 00:00:00 2001 From: Dillon Varone Date: Wed, 21 Feb 2024 13:21:20 -0500 Subject: [PATCH 343/496] drm/amd/display: Init DPPCLK from SMU on dcn32 [WHY & HOW] DPPCLK ranges should be obtained from the SMU when available. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Dhere Acked-by: Alex Hung Signed-off-by: Dillon Varone Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c | 14 ++++++++++ .../drm/amd/display/dc/dml2/dml2_wrapper.c | 28 +++++++++++++------ .../drm/amd/display/dc/dml2/dml2_wrapper.h | 3 ++ .../dc/resource/dcn32/dcn32_resource.c | 2 ++ .../dc/resource/dcn321/dcn321_resource.c | 2 ++ 5 files changed, 41 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c index 668f05c8654e..bec252e1dd27 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c @@ -216,6 +216,16 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base) if (clk_mgr_base->bw_params->dc_mode_limit.dispclk_mhz > 1950) clk_mgr_base->bw_params->dc_mode_limit.dispclk_mhz = 1950; + /* DPPCLK */ + dcn32_init_single_clock(clk_mgr, PPCLK_DPPCLK, + &clk_mgr_base->bw_params->clk_table.entries[0].dppclk_mhz, + &num_entries_per_clk->num_dppclk_levels); + num_levels = num_entries_per_clk->num_dppclk_levels; + clk_mgr_base->bw_params->dc_mode_limit.dppclk_mhz = dcn30_smu_get_dc_mode_max_dpm_freq(clk_mgr, PPCLK_DPPCLK); + //HW recommends limit of 1950 MHz in display clock for all DCN3.2.x + if (clk_mgr_base->bw_params->dc_mode_limit.dppclk_mhz > 1950) + clk_mgr_base->bw_params->dc_mode_limit.dppclk_mhz = 1950; + if (num_entries_per_clk->num_dcfclk_levels && num_entries_per_clk->num_dtbclk_levels && num_entries_per_clk->num_dispclk_levels) @@ -240,6 +250,10 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base) = khz_to_mhz_ceil(clk_mgr_base->ctx->dc->debug.min_dpp_clk_khz); } + for (i = 0; i < num_levels; i++) + if (clk_mgr_base->bw_params->clk_table.entries[i].dppclk_mhz > 1950) + clk_mgr_base->bw_params->clk_table.entries[i].dppclk_mhz = 1950; + /* Get UCLK, update bounding box */ clk_mgr_base->funcs->get_memclk_states_from_smu(clk_mgr_base); diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c index 2a58a7687bdb..72cca367062e 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c @@ -703,13 +703,8 @@ static inline struct dml2_context *dml2_allocate_memory(void) return (struct dml2_context *) kzalloc(sizeof(struct dml2_context), GFP_KERNEL); } -bool dml2_create(const struct dc *in_dc, const struct dml2_configuration_options *config, struct dml2_context **dml2) +static void dml2_init(const struct dc *in_dc, const struct dml2_configuration_options *config, struct dml2_context **dml2) { - // Allocate Mode Lib Ctx - *dml2 = dml2_allocate_memory(); - - if (!(*dml2)) - return false; // Store config options (*dml2)->config = *config; @@ -737,9 +732,18 @@ bool dml2_create(const struct dc *in_dc, const struct dml2_configuration_options initialize_dml2_soc_bbox(*dml2, in_dc, &(*dml2)->v20.dml_core_ctx.soc); initialize_dml2_soc_states(*dml2, in_dc, &(*dml2)->v20.dml_core_ctx.soc, &(*dml2)->v20.dml_core_ctx.states); +} + +bool dml2_create(const struct dc *in_dc, const struct dml2_configuration_options *config, struct dml2_context **dml2) +{ + // Allocate Mode Lib Ctx + *dml2 = dml2_allocate_memory(); + + if (!(*dml2)) + return false; + + dml2_init(in_dc, config, dml2); - /*Initialize DML20 instance which calls dml2_core_create, and 
core_dcn3_populate_informative*/ - //dml2_initialize_instance(&(*dml_ctx)->v20.dml_init); return true; } @@ -779,3 +783,11 @@ bool dml2_create_copy(struct dml2_context **dst_dml2, return true; } + +void dml2_reinit(const struct dc *in_dc, + const struct dml2_configuration_options *config, + struct dml2_context **dml2) +{ + + dml2_init(in_dc, config, dml2); +} diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.h b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.h index ee0eb184eb6d..cc662d682fd4 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.h +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.h @@ -214,6 +214,9 @@ void dml2_copy(struct dml2_context *dst_dml2, struct dml2_context *src_dml2); bool dml2_create_copy(struct dml2_context **dst_dml2, struct dml2_context *src_dml2); +void dml2_reinit(const struct dc *in_dc, + const struct dml2_configuration_options *config, + struct dml2_context **dml2); /* * dml2_validate - Determines if a display configuration is supported or not. diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c index f844f57ecc49..ce1754cc1f46 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn32/dcn32_resource.c @@ -1931,6 +1931,8 @@ static void dcn32_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *bw { DC_FP_START(); dcn32_update_bw_bounding_box_fpu(dc, bw_params); + if (dc->debug.using_dml2 && dc->current_state && dc->current_state->bw_ctx.dml2) + dml2_reinit(dc, &dc->dml2_options, &dc->current_state->bw_ctx.dml2); DC_FP_END(); } diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c index b356fed1726d..296a0a8e7145 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn321/dcn321_resource.c @@ -1581,6 +1581,8 @@ static void dcn321_update_bw_bounding_box(struct dc *dc, struct clk_bw_params *b { DC_FP_START(); dcn321_update_bw_bounding_box_fpu(dc, bw_params); + if (dc->debug.using_dml2 && dc->current_state && dc->current_state->bw_ctx.dml2) + dml2_reinit(dc, &dc->dml2_options, &dc->current_state->bw_ctx.dml2); DC_FP_END(); } From 86e9523fb0efce27095d3086473c739cce720d01 Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Wed, 21 Feb 2024 16:55:04 -0500 Subject: [PATCH 344/496] drm/amd/display: Update odm when ODM combine is changed on an otg master pipe with no plane [WHY] When committing an update with ODM combine change when the plane is removing or already removed, we fail to detect odm change in pipe update flags. This has caused mismatch between new dc state and the actual hardware state, because we missed odm programming. [HOW] - Detect odm change even for otg master pipe without a plane. - Update odm config before calling program pipes for pipe with planes. The commit also updates blank pattern programming when odm is changed without plane. This is because number of OPP is changed when ODM combine is changed. Blank pattern is per OPP so we will need to reprogram OPP based on the new pipe topology. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Dillon Varone Acked-by: Alex Hung Signed-off-by: Wenjing Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 41 ++++++++++--------- .../amd/display/dc/hwss/dcn32/dcn32_hwseq.c | 7 ++++ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index c55d5155ecb9..40098d9f70cb 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -1498,6 +1498,11 @@ static void dcn20_detect_pipe_changes(struct dc_state *old_state, return; } + if (resource_is_pipe_type(new_pipe, OTG_MASTER) && + resource_is_odm_topology_changed(new_pipe, old_pipe)) + /* Detect odm changes */ + new_pipe->update_flags.bits.odm = 1; + /* Exit on unchanged, unused pipe */ if (!old_pipe->plane_state && !new_pipe->plane_state) return; @@ -1551,10 +1556,6 @@ static void dcn20_detect_pipe_changes(struct dc_state *old_state, /* Detect top pipe only changes */ if (resource_is_pipe_type(new_pipe, OTG_MASTER)) { - /* Detect odm changes */ - if (resource_is_odm_topology_changed(new_pipe, old_pipe)) - new_pipe->update_flags.bits.odm = 1; - /* Detect global sync changes */ if (old_pipe->pipe_dlg_param.vready_offset != new_pipe->pipe_dlg_param.vready_offset || old_pipe->pipe_dlg_param.vstartup_start != new_pipe->pipe_dlg_param.vstartup_start @@ -1999,19 +2000,20 @@ void dcn20_program_front_end_for_ctx( DC_LOGGER_INIT(dc->ctx->logger); unsigned int prev_hubp_count = 0; unsigned int hubp_count = 0; + struct pipe_ctx *pipe; if (resource_is_pipe_topology_changed(dc->current_state, context)) resource_log_pipe_topology_update(dc, context); if (dc->hwss.program_triplebuffer != NULL && dc->debug.enable_tri_buf) { for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i]; + pipe = &context->res_ctx.pipe_ctx[i]; - if (!pipe_ctx->top_pipe && !pipe_ctx->prev_odm_pipe && pipe_ctx->plane_state) { - ASSERT(!pipe_ctx->plane_state->triplebuffer_flips); + if (!pipe->top_pipe && !pipe->prev_odm_pipe && pipe->plane_state) { + ASSERT(!pipe->plane_state->triplebuffer_flips); /*turn off triple buffer for full update*/ dc->hwss.program_triplebuffer( - dc, pipe_ctx, pipe_ctx->plane_state->triplebuffer_flips); + dc, pipe, pipe->plane_state->triplebuffer_flips); } } } @@ -2085,12 +2087,22 @@ void dcn20_program_front_end_for_ctx( DC_LOG_DC("Reset mpcc for pipe %d\n", dc->current_state->res_ctx.pipe_ctx[i].pipe_idx); } + /* update ODM for blanked OTG master pipes */ + for (i = 0; i < dc->res_pool->pipe_count; i++) { + pipe = &context->res_ctx.pipe_ctx[i]; + if (resource_is_pipe_type(pipe, OTG_MASTER) && + !resource_is_pipe_type(pipe, DPP_PIPE) && + pipe->update_flags.bits.odm && + hws->funcs.update_odm) + hws->funcs.update_odm(dc, context, pipe); + } + /* * Program all updated pipes, order matters for mpcc setup. 
Start with * top pipe and program all pipes that follow in order */ for (i = 0; i < dc->res_pool->pipe_count; i++) { - struct pipe_ctx *pipe = &context->res_ctx.pipe_ctx[i]; + pipe = &context->res_ctx.pipe_ctx[i]; if (pipe->plane_state && !pipe->top_pipe) { while (pipe) { @@ -2129,17 +2141,6 @@ void dcn20_program_front_end_for_ctx( context->stream_status[0].plane_count > 1) { pipe->plane_res.hubp->funcs->hubp_wait_pipe_read_start(pipe->plane_res.hubp); } - - /* when dynamic ODM is active, pipes must be reconfigured when all planes are - * disabled, as some transitions will leave software and hardware state - * mismatched. - */ - if (dc->debug.enable_single_display_2to1_odm_policy && - pipe->stream && - pipe->update_flags.bits.disable && - !pipe->prev_odm_pipe && - hws->funcs.update_odm) - hws->funcs.update_odm(dc, context, pipe); } } diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c index aa36d7a56ca8..b890db0bfc46 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c @@ -1156,6 +1156,13 @@ void dcn32_update_odm(struct dc *dc, struct dc_state *context, struct pipe_ctx * dsc->funcs->dsc_disconnect(dsc); } } + + if (!resource_is_pipe_type(pipe_ctx, DPP_PIPE)) + /* + * blank pattern is generated by OPP, reprogram blank pattern + * due to OPP count change + */ + dc->hwseq->funcs.blank_pixel_data(dc, pipe_ctx, true); } unsigned int dcn32_calculate_dccg_k1_k2_values(struct pipe_ctx *pipe_ctx, unsigned int *k1_div, unsigned int *k2_div) From 3d066f9547dd58329b526db44f42c487a7974703 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 21 Feb 2024 12:27:31 -0500 Subject: [PATCH 345/496] drm/amd/display: Fix idle check for shared firmware state [WHY] We still had an instance of get_idle_state checking the PMFW scratch register instead of the actual idle allow signal. [HOW] Replace it with the SW state check for whether we had allowed idle through notify_idle. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Duncan Ma Acked-by: Alex Hung Signed-off-by: Nicholas Kazlauskas Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 613d09c42f3b..958552a8605f 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -4847,22 +4847,16 @@ void dc_exit_ips_for_hw_access(struct dc *dc) bool dc_dmub_is_ips_idle_state(struct dc *dc) { - uint32_t idle_state = 0; - if (dc->debug.disable_idle_power_optimizations) return false; if (!dc->caps.ips_support || (dc->config.disable_ips == DMUB_IPS_DISABLE_ALL)) return false; - if (dc->hwss.get_idle_state) - idle_state = dc->hwss.get_idle_state(dc); + if (!dc->ctx->dmub_srv) + return false; - if (!(idle_state & DMUB_IPS1_ALLOW_MASK) || - !(idle_state & DMUB_IPS2_ALLOW_MASK)) - return true; - - return false; + return dc->ctx->dmub_srv->idle_allowed; } /* set min and max memory clock to lowest and highest DPM level, respectively */ From 8e054b0f1e71531762b8ded7f66c1b4af734671b Mon Sep 17 00:00:00 2001 From: ChunTao Tso Date: Tue, 20 Feb 2024 17:08:39 +0800 Subject: [PATCH 346/496] drm/amd/display: Amend coasting vtotal for replay low hz [WHY] The original coasting vtotal is 2 bytes, and it need to be amended to 4 bytes because low hz case. [HOW] Amend coasting vtotal from 2 bytes to 4 bytes. Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Alex Hung Signed-off-by: ChunTao Tso Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dc_types.h | 4 ++-- drivers/gpu/drm/amd/display/dc/inc/link.h | 4 ++-- .../display/dc/link/protocols/link_edp_panel_control.c | 4 ++-- .../display/dc/link/protocols/link_edp_panel_control.h | 4 ++-- drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h | 8 ++++++++ drivers/gpu/drm/amd/display/modules/power/power_helpers.c | 2 +- drivers/gpu/drm/amd/display/modules/power/power_helpers.h | 2 +- 7 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h index 9900dda2eef5..be2ac5c442a4 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_types.h @@ -1085,9 +1085,9 @@ struct replay_settings { /* SMU optimization is enabled */ bool replay_smu_opt_enable; /* Current Coasting vtotal */ - uint16_t coasting_vtotal; + uint32_t coasting_vtotal; /* Coasting vtotal table */ - uint16_t coasting_vtotal_table[PR_COASTING_TYPE_NUM]; + uint32_t coasting_vtotal_table[PR_COASTING_TYPE_NUM]; /* Maximum link off frame count */ enum replay_link_off_frame_count_level link_off_frame_count_level; /* Replay pseudo vtotal for abm + ips on full screen video which can improve ips residency */ diff --git a/drivers/gpu/drm/amd/display/dc/inc/link.h b/drivers/gpu/drm/amd/display/dc/inc/link.h index 26fe81f213da..bf29fc58ea6a 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/link.h +++ b/drivers/gpu/drm/amd/display/dc/inc/link.h @@ -285,12 +285,12 @@ struct link_service { enum replay_FW_Message_type msg, union dmub_replay_cmd_set *cmd_data); bool (*edp_set_coasting_vtotal)( - struct dc_link *link, uint16_t coasting_vtotal); + struct dc_link *link, uint32_t coasting_vtotal); bool (*edp_replay_residency)(const struct dc_link *link, unsigned 
int *residency, const bool is_start, const bool is_alpm); bool (*edp_set_replay_power_opt_and_coasting_vtotal)(struct dc_link *link, - const unsigned int *power_opts, uint16_t coasting_vtotal); + const unsigned int *power_opts, uint32_t coasting_vtotal); bool (*edp_wait_for_t12)(struct dc_link *link); bool (*edp_is_ilr_optimization_required)(struct dc_link *link, diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c index acfbbc638cc6..3baa2bdd6dd6 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.c @@ -1034,7 +1034,7 @@ bool edp_send_replay_cmd(struct dc_link *link, return true; } -bool edp_set_coasting_vtotal(struct dc_link *link, uint16_t coasting_vtotal) +bool edp_set_coasting_vtotal(struct dc_link *link, uint32_t coasting_vtotal) { struct dc *dc = link->ctx->dc; struct dmub_replay *replay = dc->res_pool->replay; @@ -1073,7 +1073,7 @@ bool edp_replay_residency(const struct dc_link *link, } bool edp_set_replay_power_opt_and_coasting_vtotal(struct dc_link *link, - const unsigned int *power_opts, uint16_t coasting_vtotal) + const unsigned int *power_opts, uint32_t coasting_vtotal) { struct dc *dc = link->ctx->dc; struct dmub_replay *replay = dc->res_pool->replay; diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.h index 34e521af7bb4..a158c6234d42 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.h +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_edp_panel_control.h @@ -59,12 +59,12 @@ bool edp_setup_replay(struct dc_link *link, bool edp_send_replay_cmd(struct dc_link *link, enum replay_FW_Message_type msg, union dmub_replay_cmd_set *cmd_data); -bool edp_set_coasting_vtotal(struct dc_link *link, uint16_t coasting_vtotal); +bool edp_set_coasting_vtotal(struct dc_link *link, uint32_t coasting_vtotal); bool edp_replay_residency(const struct dc_link *link, unsigned int *residency, const bool is_start, const bool is_alpm); bool edp_get_replay_state(const struct dc_link *link, uint64_t *state); bool edp_set_replay_power_opt_and_coasting_vtotal(struct dc_link *link, - const unsigned int *power_opts, uint16_t coasting_vtotal); + const unsigned int *power_opts, uint32_t coasting_vtotal); bool edp_wait_for_t12(struct dc_link *link); bool edp_is_ilr_optimization_required(struct dc_link *link, struct dc_crtc_timing *crtc_timing); diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h index a529e369b2ac..af3fe8bb0728 100644 --- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h +++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h @@ -3238,6 +3238,14 @@ struct dmub_cmd_replay_set_coasting_vtotal_data { * Currently the support is only for 0 or 1 */ uint8_t panel_inst; + /** + * 16-bit value dicated by driver that indicates the coasting vtotal high byte part. + */ + uint16_t coasting_vtotal_high; + /** + * Explicit padding to 4 byte boundary. 
+ */ + uint8_t pad[2]; }; /** diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c index e304e8435fb8..2a3698fd2dc2 100644 --- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.c +++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.c @@ -975,7 +975,7 @@ bool psr_su_set_dsc_slice_height(struct dc *dc, struct dc_link *link, void set_replay_coasting_vtotal(struct dc_link *link, enum replay_coasting_vtotal_type type, - uint16_t vtotal) + uint32_t vtotal) { link->replay_settings.coasting_vtotal_table[type] = vtotal; } diff --git a/drivers/gpu/drm/amd/display/modules/power/power_helpers.h b/drivers/gpu/drm/amd/display/modules/power/power_helpers.h index bef4815e1703..ff7e6f3cd6be 100644 --- a/drivers/gpu/drm/amd/display/modules/power/power_helpers.h +++ b/drivers/gpu/drm/amd/display/modules/power/power_helpers.h @@ -56,7 +56,7 @@ bool dmub_init_abm_config(struct resource_pool *res_pool, void init_replay_config(struct dc_link *link, struct replay_config *pr_config); void set_replay_coasting_vtotal(struct dc_link *link, enum replay_coasting_vtotal_type type, - uint16_t vtotal); + uint32_t vtotal); void set_replay_ips_full_screen_video_src_vtotal(struct dc_link *link, uint16_t vtotal); void calculate_replay_link_off_frame_count(struct dc_link *link, uint16_t vtotal, uint16_t htotal); From 94040c2cbb1a872ff779da06bf034ccfee0f9cba Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 23 Feb 2024 15:17:39 -0500 Subject: [PATCH 347/496] drm/amd/display: Lock all enabled otg pipes even with no planes [WHY] On DCN32 we support dynamic ODM even when OTG is blanked. When ODM configuration is dynamically changed and the OTG is on blank pattern, we will need to reprogram OPP's test pattern based on new ODM configuration. Therefore we need to lock the OTG pipe to avoid temporary corruption when we are reprogramming OPP blank patterns. [HOW] Add a new interdependent update lock implementation to lock all enabled OTG pipes even when there is no plane on the OTG for DCN32. 
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Alex Hung Signed-off-by: Wenjing Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../amd/display/dc/hwss/dcn32/dcn32_hwseq.c | 23 +++++++++++++++++++ .../amd/display/dc/hwss/dcn32/dcn32_hwseq.h | 2 ++ .../amd/display/dc/hwss/dcn32/dcn32_init.c | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c index b890db0bfc46..c0b526cf1786 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c
@@ -1785,3 +1785,26 @@ void dcn32_prepare_bandwidth(struct dc *dc,
 context->bw_ctx.bw.dcn.clk.p_state_change_support = p_state_change_support;
 }
 }
+
+void dcn32_interdependent_update_lock(struct dc *dc,
+ struct dc_state *context, bool lock)
+{
+ unsigned int i;
+ struct pipe_ctx *pipe;
+ struct timing_generator *tg;
+
+ for (i = 0; i < dc->res_pool->pipe_count; i++) {
+ pipe = &context->res_ctx.pipe_ctx[i];
+ tg = pipe->stream_res.tg;
+
+ if (!resource_is_pipe_type(pipe, OTG_MASTER) ||
+ !tg->funcs->is_tg_enabled(tg) ||
+ dc_state_get_pipe_subvp_type(context, pipe) == SUBVP_PHANTOM)
+ continue;
+
+ if (lock)
+ dc->hwss.pipe_control_lock(dc, pipe, true);
+ else
+ dc->hwss.pipe_control_lock(dc, pipe, false);
+ }
+}
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h index 069e20bc87c0..f55c11fc56ec 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.h
@@ -129,4 +129,6 @@ bool dcn32_is_pipe_topology_transition_seamless(struct dc *dc,
 void dcn32_prepare_bandwidth(struct dc *dc,
 struct dc_state *context);
+void dcn32_interdependent_update_lock(struct dc *dc,
+ struct dc_state *context, bool lock);
 #endif /* __DC_HWSS_DCN32_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c index 2b073123d3ed..67d661dbd5b7 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_init.c
@@ -58,7 +58,7 @@ static const struct hw_sequencer_funcs dcn32_funcs = {
 .disable_plane = dcn20_disable_plane,
 .disable_pixel_data = dcn20_disable_pixel_data,
 .pipe_control_lock = dcn20_pipe_control_lock,
- .interdependent_update_lock = dcn10_lock_all_pipes,
+ .interdependent_update_lock = dcn32_interdependent_update_lock,
 .cursor_lock = dcn10_cursor_lock,
 .prepare_bandwidth = dcn32_prepare_bandwidth,
 .optimize_bandwidth = dcn20_optimize_bandwidth,

From 2d7f3d1a5866705be2393150e1ffdf67030ab88d Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Fri, 23 Feb 2024 15:38:40 -0500 Subject: [PATCH 348/496] drm/amd/display: Implement wait_for_odm_update_pending_complete

[WHY] ODM update is double buffered. We need to wait for the ODM update to be completed before optimizing bandwidth or programming new updates.

[HOW] Implement the wait_for_odm_update_pending_complete function to wait for: 1. odm configuration update is no longer pending in timing generator. 2. no pending dpg pattern update for each active OPP.
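Both conditions come down to polling a double-buffer pending flag until it clears, bounded by a timeout. A small user-space analogue of that wait pattern is sketched below; the 100 us delay and 1000-iteration budget mirror the existing dcn20_wait_for_blank_complete() loop in the hunk that follows, but the function itself is illustrative only, not DC code.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool pending;	/* stand-in for the hardware pending status bit */

static bool read_pending(void)
{
	return pending;	/* in the driver this would be a register read */
}

/* Poll until the double-buffer pending flag clears or the budget runs out. */
static bool wait_pending_clear(unsigned int delay_us, unsigned int max_tries)
{
	for (unsigned int i = 0; i < max_tries; i++) {
		if (!read_pending())
			return true;	/* update latched; safe to reprogram */
		usleep(delay_us);
	}
	return false;			/* timed out; flag still pending */
}

int main(void)
{
	pending = false;
	printf("cleared: %d\n", wait_pending_clear(100, 1000));
	return 0;
}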
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Alvin Lee Acked-by: Alex Hung Signed-off-by: Wenjing Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc.c | 56 ++++++++++++++++++- .../gpu/drm/amd/display/dc/dcn10/dcn10_opp.c | 1 + .../gpu/drm/amd/display/dc/dcn20/dcn20_opp.c | 14 +++++ .../gpu/drm/amd/display/dc/dcn20/dcn20_opp.h | 2 + .../drm/amd/display/dc/dcn201/dcn201_opp.c | 1 + .../amd/display/dc/hwss/dcn20/dcn20_hwseq.c | 4 +- drivers/gpu/drm/amd/display/dc/inc/hw/opp.h | 3 + .../amd/display/dc/inc/hw/timing_generator.h | 1 + .../amd/display/dc/optc/dcn10/dcn10_optc.h | 3 +- .../amd/display/dc/optc/dcn32/dcn32_optc.c | 8 +++ .../amd/display/dc/optc/dcn32/dcn32_optc.h | 1 + 11 files changed, 90 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 958552a8605f..e7dc128f6284 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -1302,6 +1302,54 @@ static void disable_vbios_mode_if_required( } } +/** + * wait_for_blank_complete - wait for all active OPPs to finish pending blank + * pattern updates + * + * @dc: [in] dc reference + * @context: [in] hardware context in use + */ +static void wait_for_blank_complete(struct dc *dc, + struct dc_state *context) +{ + struct pipe_ctx *opp_head; + struct dce_hwseq *hws = dc->hwseq; + int i; + + if (!hws->funcs.wait_for_blank_complete) + return; + + for (i = 0; i < MAX_PIPES; i++) { + opp_head = &context->res_ctx.pipe_ctx[i]; + + if (!resource_is_pipe_type(opp_head, OPP_HEAD) || + dc_state_get_pipe_subvp_type(context, opp_head) == SUBVP_PHANTOM) + continue; + + hws->funcs.wait_for_blank_complete(opp_head->stream_res.opp); + } +} + +static void wait_for_odm_update_pending_complete(struct dc *dc, struct dc_state *context) +{ + struct pipe_ctx *otg_master; + struct timing_generator *tg; + int i; + + for (i = 0; i < MAX_PIPES; i++) { + otg_master = &context->res_ctx.pipe_ctx[i]; + if (!resource_is_pipe_type(otg_master, OTG_MASTER) || + dc_state_get_pipe_subvp_type(context, otg_master) == SUBVP_PHANTOM) + continue; + tg = otg_master->stream_res.tg; + if (tg->funcs->wait_odm_doublebuffer_pending_clear) + tg->funcs->wait_odm_doublebuffer_pending_clear(tg); + } + + /* ODM update may require to reprogram blank pattern for each OPP */ + wait_for_blank_complete(dc, context); +} + static void wait_for_no_pipes_pending(struct dc *dc, struct dc_state *context) { int i; @@ -1993,6 +2041,11 @@ static enum dc_status dc_commit_state_no_check(struct dc *dc, struct dc_state *c context->stream_count == 0) { /* Must wait for no flips to be pending before doing optimize bw */ wait_for_no_pipes_pending(dc, context); + /* + * optimized dispclk depends on ODM setup. Need to wait for ODM + * update pending complete before optimizing bandwidth. 
+ */ + wait_for_odm_update_pending_complete(dc, context); /* pplib is notified if disp_num changed */ dc->hwss.optimize_bandwidth(dc, context); /* Need to do otg sync again as otg could be out of sync due to otg @@ -3496,7 +3549,7 @@ static void commit_planes_for_stream_fast(struct dc *dc, top_pipe_to_program->stream->update_flags.raw = 0; } -static void wait_for_outstanding_hw_updates(struct dc *dc, const struct dc_state *dc_context) +static void wait_for_outstanding_hw_updates(struct dc *dc, struct dc_state *dc_context) { /* * This function calls HWSS to wait for any potentially double buffered @@ -3534,6 +3587,7 @@ static void wait_for_outstanding_hw_updates(struct dc *dc, const struct dc_state } } } + wait_for_odm_update_pending_complete(dc, dc_context); } static void commit_planes_for_stream(struct dc *dc, diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_opp.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_opp.c index 48a40dcc7050..5838a11efd00 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_opp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_opp.c @@ -384,6 +384,7 @@ static const struct opp_funcs dcn10_opp_funcs = { .opp_set_disp_pattern_generator = NULL, .opp_program_dpg_dimensions = NULL, .dpg_is_blanked = NULL, + .dpg_is_pending = NULL, .opp_destroy = opp1_destroy }; diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.c b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.c index 0784d0198661..fbf1b6370eb2 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.c @@ -337,6 +337,19 @@ bool opp2_dpg_is_blanked(struct output_pixel_processor *opp) (double_buffer_pending == 0); } +bool opp2_dpg_is_pending(struct output_pixel_processor *opp) +{ + struct dcn20_opp *oppn20 = TO_DCN20_OPP(opp); + uint32_t double_buffer_pending; + uint32_t dpg_en; + + REG_GET(DPG_CONTROL, DPG_EN, &dpg_en); + + REG_GET(DPG_STATUS, DPG_DOUBLE_BUFFER_PENDING, &double_buffer_pending); + + return (dpg_en == 1 && double_buffer_pending == 1); +} + void opp2_program_left_edge_extra_pixel ( struct output_pixel_processor *opp, bool count) @@ -363,6 +376,7 @@ static struct opp_funcs dcn20_opp_funcs = { .opp_set_disp_pattern_generator = opp2_set_disp_pattern_generator, .opp_program_dpg_dimensions = opp2_program_dpg_dimensions, .dpg_is_blanked = opp2_dpg_is_blanked, + .dpg_is_pending = opp2_dpg_is_pending, .opp_dpg_set_blank_color = opp2_dpg_set_blank_color, .opp_destroy = opp1_destroy, .opp_program_left_edge_extra_pixel = opp2_program_left_edge_extra_pixel, diff --git a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.h b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.h index 3ab221bdd27d..8f186abd558d 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.h +++ b/drivers/gpu/drm/amd/display/dc/dcn20/dcn20_opp.h @@ -159,6 +159,8 @@ void opp2_program_dpg_dimensions( bool opp2_dpg_is_blanked(struct output_pixel_processor *opp); +bool opp2_dpg_is_pending(struct output_pixel_processor *opp); + void opp2_dpg_set_blank_color( struct output_pixel_processor *opp, const struct tg_color *color); diff --git a/drivers/gpu/drm/amd/display/dc/dcn201/dcn201_opp.c b/drivers/gpu/drm/amd/display/dc/dcn201/dcn201_opp.c index 8e77db46a409..6a71ba3dfc63 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn201/dcn201_opp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn201/dcn201_opp.c @@ -50,6 +50,7 @@ static struct opp_funcs dcn201_opp_funcs = { .opp_set_disp_pattern_generator = opp2_set_disp_pattern_generator, .opp_program_dpg_dimensions = opp2_program_dpg_dimensions, 
.dpg_is_blanked = opp2_dpg_is_blanked, + .dpg_is_pending = opp2_dpg_is_pending, .opp_dpg_set_blank_color = opp2_dpg_set_blank_color, .opp_destroy = opp1_destroy, .opp_program_left_edge_extra_pixel = opp2_program_left_edge_extra_pixel, diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c index 40098d9f70cb..8b3536c380b8 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn20/dcn20_hwseq.c @@ -2452,7 +2452,7 @@ bool dcn20_wait_for_blank_complete( int counter; for (counter = 0; counter < 1000; counter++) { - if (opp->funcs->dpg_is_blanked(opp)) + if (!opp->funcs->dpg_is_pending(opp)) break; udelay(100); @@ -2463,7 +2463,7 @@ bool dcn20_wait_for_blank_complete( return false; } - return true; + return opp->funcs->dpg_is_blanked(opp); } bool dcn20_dmdata_status_done(struct pipe_ctx *pipe_ctx) diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h index aee5372e292c..d89c92370d5b 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/opp.h @@ -337,6 +337,9 @@ struct opp_funcs { bool (*dpg_is_blanked)( struct output_pixel_processor *opp); + bool (*dpg_is_pending)(struct output_pixel_processor *opp); + + void (*opp_dpg_set_blank_color)( struct output_pixel_processor *opp, const struct tg_color *color); diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h b/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h index d98d72f35be5..ffad8fe16c54 100644 --- a/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h +++ b/drivers/gpu/drm/amd/display/dc/inc/hw/timing_generator.h @@ -331,6 +331,7 @@ struct timing_generator_funcs { void (*init_odm)(struct timing_generator *tg); void (*wait_drr_doublebuffer_pending_clear)(struct timing_generator *tg); + void (*wait_odm_doublebuffer_pending_clear)(struct timing_generator *tg); }; #endif diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.h b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.h index ab81594a7fad..6c2e84d3967f 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.h +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.h @@ -557,7 +557,8 @@ struct dcn_optc_registers { type OTG_CRC_DATA_STREAM_SPLIT_MODE;\ type OTG_CRC_DATA_FORMAT;\ type OTG_V_TOTAL_LAST_USED_BY_DRR;\ - type OTG_DRR_TIMING_DBUF_UPDATE_PENDING; + type OTG_DRR_TIMING_DBUF_UPDATE_PENDING;\ + type OTG_H_TIMING_DIV_MODE_DB_UPDATE_PENDING; #define TG_REG_FIELD_LIST_DCN3_2(type) \ type OTG_H_TIMING_DIV_MODE_MANUAL; diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c index 823493543325..f07a4c7e48bc 100644 --- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c +++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.c @@ -122,6 +122,13 @@ void optc32_get_odm_combine_segments(struct timing_generator *tg, int *odm_combi } } +void optc32_wait_odm_doublebuffer_pending_clear(struct timing_generator *tg) +{ + struct optc *optc1 = DCN10TG_FROM_TG(tg); + + REG_WAIT(OTG_DOUBLE_BUFFER_CONTROL, OTG_H_TIMING_DIV_MODE_DB_UPDATE_PENDING, 0, 2, 50000); +} + void optc32_set_h_timing_div_manual_mode(struct timing_generator *optc, bool manual_mode) { struct optc *optc1 = DCN10TG_FROM_TG(optc); @@ -345,6 +352,7 @@ static struct timing_generator_funcs dcn32_tg_funcs = { .set_odm_bypass = optc32_set_odm_bypass, .set_odm_combine = 
optc32_set_odm_combine, .get_odm_combine_segments = optc32_get_odm_combine_segments,
+ .wait_odm_doublebuffer_pending_clear = optc32_wait_odm_doublebuffer_pending_clear,
 .set_h_timing_div_manual_mode = optc32_set_h_timing_div_manual_mode,
 .get_optc_source = optc2_get_optc_source,
 .set_out_mux = optc3_set_out_mux,
diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.h b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.h index 8ce3b178cab0..0c2c14695561 100644
--- a/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.h
+++ b/drivers/gpu/drm/amd/display/dc/optc/dcn32/dcn32_optc.h
@@ -183,5 +183,6 @@ void optc32_set_h_timing_div_manual_mode(struct timing_generator *optc, bool man
 void optc32_get_odm_combine_segments(struct timing_generator *tg, int *odm_combine_segments);
 void optc32_set_odm_bypass(struct timing_generator *optc, const struct dc_crtc_timing *dc_crtc_timing);
+void optc32_wait_odm_doublebuffer_pending_clear(struct timing_generator *tg);
 #endif /* __DC_OPTC_DCN32_H__ */

From e64b3f55e458ce7e2087a0051f47edabf74545e7 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Wed, 14 Feb 2024 13:29:51 -0700 Subject: [PATCH 349/496] drm/amd/display: Return the correct HDCP error code

[WHY & HOW] If the display is null when creating an HDCP session, return a proper error code.

Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Acked-by: Alex Hung Signed-off-by: Rodrigo Siqueira Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c | 3 +++ 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c index 8c137d7c032e..7c9805705fd3 100644
--- a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
@@ -513,6 +513,9 @@ enum mod_hdcp_status mod_hdcp_hdcp2_create_session(struct mod_hdcp *hdcp)
 hdcp_cmd = (struct ta_hdcp_shared_memory *)psp->hdcp_context.context.mem_context.shared_buf;
 memset(hdcp_cmd, 0, sizeof(struct ta_hdcp_shared_memory));
+ if (!display)
+ return MOD_HDCP_STATUS_DISPLAY_NOT_FOUND;
+
 hdcp_cmd->in_msg.hdcp2_create_session_v2.display_handle = display->index;
 if (hdcp->connection.link.adjust.hdcp2.force_type == MOD_HDCP_FORCE_TYPE_0)

From 334b56cea5d9df5989be6cf1a5898114fa70ad98 Mon Sep 17 00:00:00 2001 From: Allen Pan Date: Fri, 23 Feb 2024 18:20:16 -0500 Subject: [PATCH 350/496] drm/amd/display: Add a dc_state NULL check in dc_state_release

[How] Check whether state is NULL before releasing it.

Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Charlene Liu Acked-by: Alex Hung Signed-off-by: Allen Pan Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_state.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c index 180ac47868c2..5cc7f8da209c 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
@@ -334,7 +334,8 @@ static void dc_state_free(struct kref *kref)
 void dc_state_release(struct dc_state *state)
 {
- kref_put(&state->refcount, dc_state_free);
+ if (state != NULL)
+ kref_put(&state->refcount, dc_state_free);
 }
 /*
 * dc_state_add_stream() - Add a new dc_stream_state to a dc_state.
From 03c6284df179de3a4a6e0684764b1c71d2a405e2 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Tue, 19 Mar 2024 15:24:03 +0800 Subject: [PATCH 351/496] Revert "drm/amd/amdgpu: Fix potential ioremap() memory leaks in amdgpu_device_init()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

This patch causes the following iounmap error and calltrace: iounmap: bad address 00000000d0b3631f

The original patch was unjustified because amdgpu_device_fini_sw() will always clean up the rmmio mapping.

This reverts commit eb4f139888f636614dab3bcce97ff61cefc4b3a7.

Signed-off-by: Ma Jun Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1e9454e6e4cb..5dc24c971b41 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4040,10 +4040,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 * early on during init and before calling to RREG32.
 */
 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
- if (!adev->reset_domain) {
- r = -ENOMEM;
- goto unmap_memory;
- }
+ if (!adev->reset_domain)
+ return -ENOMEM;
 /* detect hw virtualization here */
 amdgpu_detect_virtualization(adev);
@@ -4053,7 +4051,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 r = amdgpu_device_get_job_timeout_settings(adev);
 if (r) {
 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
- goto unmap_memory;
+ return r;
 }
 amdgpu_device_set_mcbp(adev);
@@ -4061,12 +4059,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 /* early init functions */
 r = amdgpu_device_ip_early_init(adev);
 if (r)
- goto unmap_memory;
+ return r;
 /* Get rid of things like offb */
 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
 if (r)
- goto unmap_memory;
+ return r;
 /* Enable TMZ based on IP_VERSION */
 amdgpu_gmc_tmz_set(adev);
@@ -4076,7 +4074,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 if (adev->gmc.xgmi.supported) {
 r = adev->gfxhub.funcs->get_xgmi_info(adev);
 if (r)
- goto unmap_memory;
+ return r;
 }
 /* enable PCIE atomic ops */
@@ -4345,8 +4343,6 @@ release_ras_con:
 failed:
 amdgpu_vf_error_trans_all(adev);
-unmap_memory:
- iounmap(adev->rmmio);
 return r;
 }

From 1b7eec6bf360145bbca959a6c036e885dc5cf8f5 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Mon, 18 Mar 2024 18:31:30 +0800 Subject: [PATCH 352/496] Revert "drm/amdgpu/vpe: don't emit cond exec command under collaborate mode"

Ready now. Remove this workaround.

This reverts commit d40f6213b52c161fd4634933acbc32103a283363.
Signed-off-by: Lang Yu Tested-by: Alan Liu Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 3 --- 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 70c5cc80ecdc..7a65a2b128ec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -575,9 +575,6 @@ static unsigned int vpe_ring_init_cond_exec(struct amdgpu_ring *ring,
 {
 unsigned int ret;
- if (ring->adev->vpe.collaborate_mode)
- return ~0;
-
 amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_COND_EXE, 0));
 amdgpu_ring_write(ring, lower_32_bits(addr));
 amdgpu_ring_write(ring, upper_32_bits(addr));

From cf8c498694a443e28dc1222f3ab94677114a4724 Mon Sep 17 00:00:00 2001 From: Wenjing Liu Date: Mon, 4 Mar 2024 11:20:27 -0500 Subject: [PATCH 353/496] drm/amd/display: Revert Remove pixle rate limit for subvp

This reverts commit 340383c734f8 ("drm/amd/display: Remove pixle rate limit for subvp")

[why] The original commit causes a regression when subvp is applied on ODM required 8k60hz timing. The display shows black screen on boot. The issue can be recovered with hotplug. It also causes MPO to fail. We will temporarily revert this commit and investigate the root cause further.

Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Dhere Reviewed-by: Martin Leung Acked-by: Wayne Lin Signed-off-by: Wenjing Liu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 1 + 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c index b49e1dc9d8ba..a0a65e099104 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
@@ -623,6 +623,7 @@ static bool dcn32_assign_subvp_pipe(struct dc *dc,
 * - Not TMZ surface
 */
 if (pipe->plane_state && !pipe->top_pipe && !dcn32_is_center_timing(pipe) &&
+ !(pipe->stream->timing.pix_clk_100hz / 10000 > DCN3_2_MAX_SUBVP_PIXEL_RATE_MHZ) &&
 (!dcn32_is_psr_capable(pipe) || (context->stream_count == 1 && dc->caps.dmub_caps.subvp_psr)) &&
 dc_state_get_pipe_subvp_type(context, pipe) == SUBVP_NONE &&
 (refresh_rate < 120 || dcn32_allow_subvp_high_refresh_rate(dc, context, pipe)) &&

From 69e3be6893a7e668660b05a966bead82bbddb01d Mon Sep 17 00:00:00 2001 From: Leo Ma Date: Fri, 28 Jul 2023 08:35:07 -0400 Subject: [PATCH 354/496] drm/amd/display: Fix noise issue on HDMI AV mute

[Why] When mode switching is triggered there is momentary noise visible on some HDMI TVs or displays.

[How] Wait for 2 frames to make sure we have enough time to send out AV mute and sink receives a full frame.
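As a rough sense of the added latency, the two-frame wait amounts to tens of milliseconds at common refresh rates; the quick calculation below is purely illustrative and assumes nothing beyond the sample refresh rates listed.

#include <stdio.h>

int main(void)
{
	const double rates_hz[] = { 60.0, 120.0, 144.0 };

	/* two frame periods, in milliseconds, per refresh rate */
	for (int i = 0; i < 3; i++)
		printf("%6.1f Hz: two frames ~= %5.2f ms\n",
		       rates_hz[i], 2.0 * 1000.0 / rates_hz[i]);
	return 0;
}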
Cc: Mario Limonciello Cc: Alex Deucher Cc: stable@vger.kernel.org Reviewed-by: Wenjing Liu Acked-by: Wayne Lin Signed-off-by: Leo Ma Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c index 7e6b7f2a6dc9..8bc3d01537bb 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c @@ -812,10 +812,20 @@ void dcn30_set_avmute(struct pipe_ctx *pipe_ctx, bool enable) if (pipe_ctx == NULL) return; - if (dc_is_hdmi_signal(pipe_ctx->stream->signal) && pipe_ctx->stream_res.stream_enc != NULL) + if (dc_is_hdmi_signal(pipe_ctx->stream->signal) && pipe_ctx->stream_res.stream_enc != NULL) { pipe_ctx->stream_res.stream_enc->funcs->set_avmute( pipe_ctx->stream_res.stream_enc, enable); + + /* Wait for two frame to make sure AV mute is sent out */ + if (enable) { + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VBLANK); + pipe_ctx->stream_res.tg->funcs->wait_for_state(pipe_ctx->stream_res.tg, CRTC_STATE_VACTIVE); + } + } } void dcn30_update_info_frame(struct pipe_ctx *pipe_ctx) From ad550dbe8ae4ba833371a018265c1c3ae88559f0 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Fri, 15 Mar 2024 16:55:39 +0800 Subject: [PATCH 355/496] drm/amdgpu: drop setting buffer funcs in sdma442 To fix the entity rq NULL issue. This setting has been moved to upper level. 
Fixes: b70438004a14 ("drm/amdgpu: move buffer funcs setting up a level") Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index eaa4f5f49949..103dc9c7325f 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -431,16 +431,11 @@ static void sdma_v4_4_2_inst_gfx_stop(struct amdgpu_device *adev, struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES]; u32 doorbell_offset, doorbell; u32 rb_cntl, ib_cntl; - int i, unset = 0; + int i; for_each_inst(i, inst_mask) { sdma[i] = &adev->sdma.instance[i].ring; - if ((adev->mman.buffer_funcs_ring == sdma[i]) && unset != 1) { - amdgpu_ttm_set_buffer_funcs_status(adev, false); - unset = 1; - } - rb_cntl = RREG32_SDMA(i, regSDMA_GFX_RB_CNTL); rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_GFX_RB_CNTL, RB_ENABLE, 0); WREG32_SDMA(i, regSDMA_GFX_RB_CNTL, rb_cntl); @@ -487,20 +482,10 @@ static void sdma_v4_4_2_inst_rlc_stop(struct amdgpu_device *adev, static void sdma_v4_4_2_inst_page_stop(struct amdgpu_device *adev, uint32_t inst_mask) { - struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES]; u32 rb_cntl, ib_cntl; int i; - bool unset = false; for_each_inst(i, inst_mask) { - sdma[i] = &adev->sdma.instance[i].page; - - if ((adev->mman.buffer_funcs_ring == sdma[i]) && - (!unset)) { - amdgpu_ttm_set_buffer_funcs_status(adev, false); - unset = true; - } - rb_cntl = RREG32_SDMA(i, regSDMA_PAGE_RB_CNTL); rb_cntl = REG_SET_FIELD(rb_cntl, SDMA_PAGE_RB_CNTL, RB_ENABLE, 0); @@ -950,13 +935,7 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev, r = amdgpu_ring_test_helper(page); if (r) return r; - - if (adev->mman.buffer_funcs_ring == page) - amdgpu_ttm_set_buffer_funcs_status(adev, true); } - - if (adev->mman.buffer_funcs_ring == ring) - amdgpu_ttm_set_buffer_funcs_status(adev, true); } return r; From bc55c344b06f7e6f99eb92d393ff0a84c1532514 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Tue, 19 Mar 2024 11:02:29 +0800 Subject: [PATCH 356/496] drm/amdgpu/pm: Don't use OD table on Arcturus OD is not supported on Arcturus, so the OD table should not be used. 
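For context, the code being dropped scaled the firmware PPT limit by the overdrive percentages (max/min = limit * (100 +/- od%) / 100), so without OD support both bounds simply collapse to the firmware limit. A standalone sketch of that arithmetic, with made-up example values rather than real Arcturus numbers:

#include <stdio.h>

int main(void)
{
	unsigned int power_limit = 300;		/* W, example firmware limit */
	unsigned int od_percent_upper = 20;	/* example OD headroom */
	unsigned int od_percent_lower = 10;	/* example OD floor */

	unsigned int max_limit = power_limit * (100 + od_percent_upper) / 100;
	unsigned int min_limit = power_limit * (100 - od_percent_lower) / 100;

	printf("with OD: max=%u W min=%u W; without OD: both %u W\n",
	       max_limit, min_limit, power_limit);
	return 0;
}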
Signed-off-by: Ma Jun Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 33 +++---------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 40ba7227cca5..0c2d04f978ac 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1283,11 +1283,8 @@ static int arcturus_get_power_limit(struct smu_context *smu, uint32_t *max_power_limit, uint32_t *min_power_limit) { - struct smu_11_0_powerplay_table *powerplay_table = - (struct smu_11_0_powerplay_table *)smu->smu_table.power_play_table; - struct smu_11_0_overdrive_table *od_settings = smu->od_settings; PPTable_t *pptable = smu->smu_table.driver_pptable; - uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0; + uint32_t power_limit; if (smu_v11_0_get_current_power_limit(smu, &power_limit)) { /* the last hope to figure out the ppt limit */ @@ -1303,30 +1300,10 @@ static int arcturus_get_power_limit(struct smu_context *smu, *current_power_limit = power_limit; if (default_power_limit) *default_power_limit = power_limit; - - if (powerplay_table) { - if (smu->od_enabled && - od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) { - od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - } else if (od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) { - od_percent_upper = 0; - od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]); - } - } - - dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n", - od_percent_upper, od_percent_lower, power_limit); - - if (max_power_limit) { - *max_power_limit = power_limit * (100 + od_percent_upper); - *max_power_limit /= 100; - } - - if (min_power_limit) { - *min_power_limit = power_limit * (100 - od_percent_lower); - *min_power_limit /= 100; - } + if (max_power_limit) + *max_power_limit = power_limit; + if (min_power_limit) + *min_power_limit = power_limit; return 0; } From 6e7132ed3c07bd8a6ce3db4bb307ef2852b322dc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 20 Mar 2024 18:43:11 +0100 Subject: [PATCH 357/496] dm snapshot: fix lockup in dm_exception_table_exit There was reported lockup when we exit a snapshot with many exceptions. Fix this by adding "cond_resched" to the loop that frees the exceptions. 
Reported-by: John Pittman Cc: stable@vger.kernel.org Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index bf7a574499a3..0ace06d1bee3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -684,8 +684,10 @@ static void dm_exception_table_exit(struct dm_exception_table *et, for (i = 0; i < size; i++) { slot = et->table + i; - hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) + hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) { kmem_cache_free(mem, ex); + cond_resched(); + } } kvfree(et->table); From a9ad73295cc1e3af0253eee7d08943b2419444c4 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 11 Mar 2024 19:31:44 +0000 Subject: [PATCH 358/496] riscv: Fix syscall wrapper for >word-size arguments The current syscall wrapper macros break 64-bit arguments on rv32 because they only guarantee the first N input registers are passed to syscalls that accept N arguments. According to the calling convention, values twice the word size reside in register pairs and as a result, syscall arguments don't always have a direct register mapping on rv32. Instead of using `__MAP(x,__SC_LONG,__VA_ARGS__)` to declare the type of the `__se(_compat)_sys_*` functions on rv32, change the function declarations to accept `ulong` arguments and alias them to the actual syscall implementations, similarly to the existing macros in include/linux/syscalls.h. This matches previous behavior and ensures registers are passed to syscalls as-is, no matter which argument types they expect. Fixes: 08d0ce30e0e4 ("riscv: Implement syscall wrappers") Reported-by: Khem Raj Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20240311193143.2981310-2-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/syscall_wrapper.h | 53 +++++++++++++++++------- 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index eeec04b7dae6..980094c2e976 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -12,26 +12,52 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); -#define SC_RISCV_REGS_TO_ARGS(x, ...) \ - __MAP(x,__SC_ARGS \ - ,,regs->orig_a0,,regs->a1,,regs->a2 \ +#ifdef CONFIG_64BIT + +#define __SYSCALL_SE_DEFINEx(x, prefix, name, ...) \ + static long __se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static long __se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) + +#define SC_RISCV_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,regs->orig_a0,,regs->a1,,regs->a2 \ ,,regs->a3,,regs->a4,,regs->a5,,regs->a6) +#else +/* + * Use type aliasing to ensure registers a0-a6 are correctly passed to the syscall + * implementation when >word-size arguments are used. + */ +#define __SYSCALL_SE_DEFINEx(x, prefix, name, ...) \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ + static long __se_##prefix##name(ulong, ulong, ulong, ulong, ulong, ulong, \ + ulong) \ + __attribute__((alias(__stringify(___se_##prefix##name)))); \ + __diag_pop(); \ + static long noinline ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static long ___se_##prefix##name(__MAP(x,__SC_LONG,__VA_ARGS__)) + +#define SC_RISCV_REGS_TO_ARGS(x, ...) 
\ + regs->orig_a0,regs->a1,regs->a2,regs->a3,regs->a4,regs->a5,regs->a6 + +#endif /* CONFIG_64BIT */ + #ifdef CONFIG_COMPAT #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__riscv_compat_sys##name, ERRNO); \ - static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + __SYSCALL_SE_DEFINEx(x, compat_sys, name, __VA_ARGS__) \ + { \ + return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ + } \ asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs) \ { \ return __se_compat_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \ } \ - static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ - } \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #define COMPAT_SYSCALL_DEFINE0(sname) \ @@ -51,19 +77,18 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long __riscv_sys##name(const struct pt_regs *regs); \ ALLOW_ERROR_INJECTION(__riscv_sys##name, ERRNO); \ - static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - asmlinkage long __riscv_sys##name(const struct pt_regs *regs) \ - { \ - return __se_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \ - } \ - static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + __SYSCALL_SE_DEFINEx(x, sys, name, __VA_ARGS__) \ { \ long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ __MAP(x,__SC_TEST,__VA_ARGS__); \ __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ return ret; \ } \ + asmlinkage long __riscv_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \ + } \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #define SYSCALL_DEFINE0(sname) \ From f7cee094fb3b370e56b3c8aac89038de818d7aec Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Tue, 19 Mar 2024 17:05:55 -0700 Subject: [PATCH 359/496] MAINTAINER: Include linux-arm-msm for Qualcomm RTC patches Add Qualcomm RTC driver to the linux-arm-msm list, to ensure that members of the Qualcomm community gets Cc'ed, to assist with reviews etc. Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240319-maintainer-msm-add-rtc-v1-1-3a4f7d41b4d4@quicinc.com Signed-off-by: Alexandre Belloni --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index eeba9c259449..0c60a61f035a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2594,6 +2594,7 @@ F: drivers/pci/controller/dwc/pcie-qcom.c F: drivers/phy/qualcomm/ F: drivers/power/*/msm* F: drivers/reset/reset-qcom-* +F: drivers/rtc/rtc-pm8xxx.c F: drivers/spi/spi-geni-qcom.c F: drivers/spi/spi-qcom-qspi.c F: drivers/spi/spi-qup.c From b0f269728ccd1d3cabcb6f3a5b610147d98a5dd6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 21 Mar 2024 00:43:12 +0900 Subject: [PATCH 360/496] x86/config: Fix warning for 'make ARCH=x86_64 tinyconfig' Kconfig emits a warning for the following command: $ make ARCH=x86_64 tinyconfig ... 
.config:1380:warning: override: UNWINDER_GUESS changes choice state When X86_64=y, the unwinder is exclusively selected from the following three options: - UNWINDER_ORC - UNWINDER_FRAME_POINTER - UNWINDER_GUESS However, arch/x86/configs/tiny.config only specifies the values of the last two. UNWINDER_ORC must be explicitly disabled. Signed-off-by: Masahiro Yamada Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240320154313.612342-1-masahiroy@kernel.org --- arch/x86/configs/tiny.config | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 66c9e2aab16c..be3ee4294903 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1,5 +1,6 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +# CONFIG_UNWINDER_ORC is not set CONFIG_UNWINDER_GUESS=y # CONFIG_UNWINDER_FRAME_POINTER is not set From a51cd6bf8e10793103c5870ff9e4db295a843604 Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Thu, 21 Mar 2024 09:18:09 +0100 Subject: [PATCH 361/496] arm64: bpf: fix 32bit unconditional bswap In case when is64 == 1 in emit(A64_REV32(is64, dst, dst), ctx) the generated insn reverses byte order for both high and low 32-bit words, resuling in an incorrect swap as indicated by the jit test: [ 9757.262607] test_bpf: #312 BSWAP 16: 0x0123456789abcdef -> 0xefcd jited:1 8 PASS [ 9757.264435] test_bpf: #313 BSWAP 32: 0x0123456789abcdef -> 0xefcdab89 jited:1 ret 1460850314 != -271733879 (0x5712ce8a != 0xefcdab89)FAIL (1 times) [ 9757.266260] test_bpf: #314 BSWAP 64: 0x0123456789abcdef -> 0x67452301 jited:1 8 PASS [ 9757.268000] test_bpf: #315 BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89 jited:1 8 PASS [ 9757.269686] test_bpf: #316 BSWAP 16: 0xfedcba9876543210 -> 0x1032 jited:1 8 PASS [ 9757.271380] test_bpf: #317 BSWAP 32: 0xfedcba9876543210 -> 0x10325476 jited:1 ret -1460850316 != 271733878 (0xa8ed3174 != 0x10325476)FAIL (1 times) [ 9757.273022] test_bpf: #318 BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe jited:1 7 PASS [ 9757.274721] test_bpf: #319 BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476 jited:1 9 PASS Fix this by forcing 32bit variant of rev32. Fixes: 1104247f3f979 ("bpf, arm64: Support unconditional bswap") Signed-off-by: Artem Savkov Tested-by: Puranjay Mohan Acked-by: Puranjay Mohan Acked-by: Xu Kuohai Message-ID: <20240321081809.158803-1-asavkov@redhat.com> Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 48b19a233299..122021f9bdfc 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -943,7 +943,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_UXTH(is64, dst, dst), ctx); break; case 32: - emit(A64_REV32(is64, dst, dst), ctx); + emit(A64_REV32(0, dst, dst), ctx); /* upper 32 bits already cleared */ break; case 64: From a20ad45008a7c82f1184dc6dee280096009ece55 Mon Sep 17 00:00:00 2001 From: Fei Shao Date: Thu, 21 Mar 2024 15:08:57 +0800 Subject: [PATCH 362/496] spi: spi-mt65xx: Fix NULL pointer access in interrupt handler The TX buffer in spi_transfer can be a NULL pointer, so the interrupt handler may end up writing to the invalid memory and cause crashes. Add a check to trans->tx_buf before using it. 
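As a general illustration of the guard (fill_tx_fifo() is a hypothetical helper, not the mt65xx code; the real fix is in the diff below), an RX-only spi_transfer legitimately has tx_buf == NULL, so any TX FIFO fill path must be skipped for it:

  #include <linux/io.h>
  #include <linux/spi/spi.h>

  static void fill_tx_fifo(void __iomem *fifo, const struct spi_transfer *t,
                           unsigned int offset, unsigned int words)
  {
          /* RX-only transfers carry no TX payload at all */
          if (!t->tx_buf)
                  return;

          iowrite32_rep(fifo, t->tx_buf + offset, words);
  }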
Fixes: 1ce24864bff4 ("spi: mediatek: Only do dma for 4-byte aligned buffers") Signed-off-by: Fei Shao Reviewed-by: AngeloGioacchino Del Regno Link: https://msgid.link/r/20240321070942.1587146-2-fshao@chromium.org Signed-off-by: Mark Brown --- drivers/spi/spi-mt65xx.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/spi/spi-mt65xx.c b/drivers/spi/spi-mt65xx.c index 8d4633b353ee..e4cb22fe0075 100644 --- a/drivers/spi/spi-mt65xx.c +++ b/drivers/spi/spi-mt65xx.c @@ -788,17 +788,19 @@ static irqreturn_t mtk_spi_interrupt(int irq, void *dev_id) mdata->xfer_len = min(MTK_SPI_MAX_FIFO_SIZE, len); mtk_spi_setup_packet(host); - cnt = mdata->xfer_len / 4; - iowrite32_rep(mdata->base + SPI_TX_DATA_REG, - trans->tx_buf + mdata->num_xfered, cnt); + if (trans->tx_buf) { + cnt = mdata->xfer_len / 4; + iowrite32_rep(mdata->base + SPI_TX_DATA_REG, + trans->tx_buf + mdata->num_xfered, cnt); - remainder = mdata->xfer_len % 4; - if (remainder > 0) { - reg_val = 0; - memcpy(®_val, - trans->tx_buf + (cnt * 4) + mdata->num_xfered, - remainder); - writel(reg_val, mdata->base + SPI_TX_DATA_REG); + remainder = mdata->xfer_len % 4; + if (remainder > 0) { + reg_val = 0; + memcpy(®_val, + trans->tx_buf + (cnt * 4) + mdata->num_xfered, + remainder); + writel(reg_val, mdata->base + SPI_TX_DATA_REG); + } } mtk_spi_enable_transfer(host); From 2aea94ac14d1e0a8ae9e34febebe208213ba72f7 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Wed, 20 Mar 2024 11:26:07 -0700 Subject: [PATCH 363/496] exec: Fix NOMMU linux_binprm::exec in transfer_args_to_stack() In NOMMU kernel the value of linux_binprm::p is the offset inside the temporary program arguments array maintained in separate pages in the linux_binprm::page. linux_binprm::exec being a copy of linux_binprm::p thus must be adjusted when that array is copied to the user stack. Without that adjustment the value passed by the NOMMU kernel to the ELF program in the AT_EXECFN entry of the aux array doesn't make any sense and it may break programs that try to access memory pointed to by that entry. Adjust linux_binprm::exec before the successful return from the transfer_args_to_stack(). Cc: Fixes: b6a2fea39318 ("mm: variable length argument support") Fixes: 5edc2a5123a7 ("binfmt_elf_fdpic: wire up AT_EXECFD, AT_EXECFN, AT_SECURE") Signed-off-by: Max Filippov Link: https://lore.kernel.org/r/20240320182607.1472887-1-jcmvbkbc@gmail.com Signed-off-by: Kees Cook --- fs/exec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/exec.c b/fs/exec.c index e7d9d6ad980b..f66639820580 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -895,6 +895,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm, goto out; } + bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: From b4d78cfeb30476239cf08f4f40afc095c173d6e3 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 21 Mar 2024 17:48:45 +0100 Subject: [PATCH 364/496] dm-integrity: align the outgoing bio in integrity_recheck It is possible to set up dm-integrity with smaller sector size than the logical sector size of the underlying device. In this situation, dm-integrity guarantees that the outgoing bios have the same alignment as incoming bios (so, if you create a filesystem with 4k block size, dm-integrity would send 4k-aligned bios to the underlying device). This guarantee was broken when integrity_recheck was implemented. integrity_recheck sends bio that is aligned to ic->sectors_per_block. 
So if we set up integrity with 512-byte sector size on a device with logical block size 4k, we would be sending unaligned bio. This triggered a bug in one of our internal tests. This commit fixes it by determining the actual alignment of the incoming bio and then makes sure that the outgoing bio in integrity_recheck has the same alignment. Fixes: c88f5e553fe3 ("dm-integrity: recheck the integrity tag after a failure") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 3329e1e93524..37b9f8f1ae1a 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1699,7 +1699,6 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks struct bio_vec bv; sector_t sector, logical_sector, area, offset; struct page *page; - void *buffer; get_area_and_offset(ic, dio->range.logical_sector, &area, &offset); dio->metadata_block = get_metadata_sector_and_offset(ic, area, offset, @@ -1708,13 +1707,14 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks logical_sector = dio->range.logical_sector; page = mempool_alloc(&ic->recheck_pool, GFP_NOIO); - buffer = page_to_virt(page); __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos = 0; do { + sector_t alignment; char *mem; + char *buffer = page_to_virt(page); int r; struct dm_io_request io_req; struct dm_io_region io_loc; @@ -1727,6 +1727,14 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks io_loc.sector = sector; io_loc.count = ic->sectors_per_block; + /* Align the bio to logical block size */ + alignment = dio->range.logical_sector | bio_sectors(bio) | (PAGE_SIZE >> SECTOR_SHIFT); + alignment &= -alignment; + io_loc.sector = round_down(io_loc.sector, alignment); + io_loc.count += sector - io_loc.sector; + buffer += (sector - io_loc.sector) << SECTOR_SHIFT; + io_loc.count = round_up(io_loc.count, alignment); + r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT); if (unlikely(r)) { dio->bi_status = errno_to_blk_status(r); From 1e1c4bd16e385f0b1b39920ce3aa16bb33fcdb27 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Mar 2024 17:19:49 +0800 Subject: [PATCH 365/496] nvme: remove redundant BUILD_BUG_ON check Remove redundant BUILD_BUG_ON check of struct nvme_dsm_range, it's already checked in nvme_init_ctrl(). Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fb55b6ac0385..212005933782 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1803,9 +1803,6 @@ static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) { struct nvme_ctrl *ctrl = ns->ctrl; - BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < - NVME_DSM_MAX_RANGES); - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) lim->max_hw_discard_sectors = nvme_lba_to_sect(ns->head, ctrl->dmrsl); From 910934da9444dbb102294796481ab05e4419d311 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Mar 2024 19:08:21 +0800 Subject: [PATCH 366/496] nvmet-rdma: remove NVMET_RDMA_REQ_INVALIDATE_RKEY flag We can simply use invalidate_rkey to check instead of adding a flag. 
Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f2bb9d95ecf4..5b8c63e74639 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -53,7 +53,6 @@ struct nvmet_rdma_cmd { enum { NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), - NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), }; struct nvmet_rdma_rsp { @@ -722,7 +721,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct ib_send_wr *first_wr; - if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + if (rsp->invalidate_rkey) { rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; } else { @@ -905,10 +904,8 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, goto error_out; rsp->n_rdma += ret; - if (invalidate) { + if (invalidate) rsp->invalidate_rkey = key; - rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; - } return 0; @@ -1047,6 +1044,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) rsp->req.cmd = cmd->nvme_cmd; rsp->req.port = queue->port; rsp->n_rdma = 0; + rsp->invalidate_rkey = 0; if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { unsigned long flags; From ddb2ffdc474a3000887dc776b971d04bde29decc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 21 Mar 2024 13:01:58 -0300 Subject: [PATCH 367/496] libbpf: Define MFD_CLOEXEC if not available Since its going directly to the syscall to avoid not having memfd_create() available in some systems, do the same for its MFD_CLOEXEC flags, defining it if not available. This fixes the build in those systems, noticed while building perf on a set of build containers. Fixes: 9fa5e1a180aa639f ("libbpf: Call memfd_create() syscall directly") Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/ZfxZ9nCyKvwmpKkE@x1 --- tools/lib/bpf/libbpf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1e3e697b98bc..a2061fcd612d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1650,6 +1650,10 @@ static int sys_memfd_create(const char *name, unsigned flags) return syscall(__NR_memfd_create, name, flags); } +#ifndef MFD_CLOEXEC +#define MFD_CLOEXEC 0x0001U +#endif + static int create_placeholder_fd(void) { int fd; From 763865fed8641920791580901a7dd1f100aa9452 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Tue, 19 Mar 2024 17:23:33 +0800 Subject: [PATCH 368/496] fbdev: panel-tpo-td043mtea1: Convert sprintf() to sysfs_emit() Per filesystems/sysfs.rst, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. coccinelle complains that there are still a couple of functions that use snprintf(). Convert them to sysfs_emit(). 
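For illustration of the recommended interface only (value_show() and struct foo_data are hypothetical, not the panel driver code, which follows in the diff below), a sysfs show() callback formats its output with sysfs_emit(), which bounds-checks against PAGE_SIZE internally:

  #include <linux/device.h>
  #include <linux/sysfs.h>

  struct foo_data {
          unsigned int value;
  };

  static ssize_t value_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
  {
          struct foo_data *d = dev_get_drvdata(dev);

          return sysfs_emit(buf, "%u\n", d->value);
  }
  static DEVICE_ATTR_RO(value);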
CC: Helge Deller CC: linux-omap@vger.kernel.org CC: linux-fbdev@vger.kernel.org CC: dri-devel@lists.freedesktop.org Signed-off-by: Li Zhijian Signed-off-by: Helge Deller --- .../omap2/omapfb/displays/panel-tpo-td043mtea1.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-tpo-td043mtea1.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-tpo-td043mtea1.c index 477789cff8e0..d487941853e6 100644 --- a/drivers/video/fbdev/omap2/omapfb/displays/panel-tpo-td043mtea1.c +++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-tpo-td043mtea1.c @@ -225,17 +225,12 @@ static ssize_t tpo_td043_gamma_show(struct device *dev, { struct panel_drv_data *ddata = dev_get_drvdata(dev); ssize_t len = 0; - int ret; int i; - for (i = 0; i < ARRAY_SIZE(ddata->gamma); i++) { - ret = snprintf(buf + len, PAGE_SIZE - len, "%u ", - ddata->gamma[i]); - if (ret < 0) - return ret; - len += ret; - } - buf[len - 1] = '\n'; + for (i = 0; i < ARRAY_SIZE(ddata->gamma); i++) + len += sysfs_emit_at(buf, len, "%u ", ddata->gamma[i]); + if (len) + buf[len - 1] = '\n'; return len; } From a26979377bf34534ce5ee2712d2a46157ec61498 Mon Sep 17 00:00:00 2001 From: Mukesh Kumar Chaurasiya Date: Wed, 20 Mar 2024 23:08:16 +0530 Subject: [PATCH 369/496] sched/doc: Update documentation for base_slice_ns and CONFIG_HZ relation The tunable base_slice_ns is dependent on CONFIG_HZ (i.e. TICK_NSEC) for any significant performance improvement. The reason being the scheduler tick is not frequent enough to force preemption when base_slice expires in case of: base_slice_ns < TICK_NSEC The below data is of stress-ng: Number of CPU: 1 Stressor threads: 4 Time: 30sec On CONFIG_HZ=1000 | base_slice | avg-run (msec) | context-switches | | ---------- | -------------- | ---------------- | | 3ms | 2.914 | 10342 | | 6ms | 4.857 | 6196 | | 9ms | 6.754 | 4482 | | 12ms | 7.872 | 3802 | | 22ms | 11.294 | 2710 | | 32ms | 13.425 | 2284 | On CONFIG_HZ=100 | base_slice | avg-run (msec) | context-switches | | ---------- | -------------- | ---------------- | | 3ms | 9.144 | 3337 | | 6ms | 9.113 | 3301 | | 9ms | 8.991 | 3315 | | 12ms | 12.935 | 2328 | | 22ms | 16.031 | 1915 | | 32ms | 18.608 | 1622 | base_slice: the value of base_slice in ms avg-run (msec): average time of the stressor threads got on cpu before it got preempted context-switches: number of context switches for the stress-ng process Signed-off-by: Mukesh Kumar Chaurasiya Signed-off-by: Ingo Molnar Cc: Randy Dunlap Link: https://lore.kernel.org/r/20240320173815.927637-2-mchauras@linux.ibm.com --- Documentation/scheduler/sched-design-CFS.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst index 6cffffe26500..e030876fbd68 100644 --- a/Documentation/scheduler/sched-design-CFS.rst +++ b/Documentation/scheduler/sched-design-CFS.rst @@ -100,6 +100,9 @@ which can be used to tune the scheduler from "desktop" (i.e., low latencies) to "server" (i.e., good batching) workloads. It defaults to a setting suitable for desktop workloads. SCHED_BATCH is handled by the CFS scheduler module too. +In case CONFIG_HZ results in base_slice_ns < TICK_NSEC, the value of +base_slice_ns will have little to no impact on the workloads. 
+ Due to its design, the CFS scheduler is not prone to any of the "attacks" that exist today against the heuristics of the stock scheduler: fiftyp.c, thud.c, chew.c, ring-test.c, massive_intr.c all work fine and do not impact From 5248f4097308c1cdcf163314a6ea3c8c88c98cd9 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Thu, 21 Mar 2024 20:04:08 +0000 Subject: [PATCH 370/496] binfmt: replace deprecated strncpy strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. There is a _nearly_ identical implementation of fill_psinfo present in binfmt_elf.c -- except that one uses get_task_comm over strncpy(). Let's mirror that in binfmt_elf_fdpic.c Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 Cc: Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240321-strncpy-fs-binfmt_elf_fdpic-c-v2-1-0b6daec6cc56@google.com Signed-off-by: Kees Cook --- fs/binfmt_elf_fdpic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1920ed69279b..3314249e8674 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1359,7 +1359,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } From fc7f27cda843ce294c71767d42b9d8abd015d7cb Mon Sep 17 00:00:00 2001 From: Dave Young Date: Fri, 22 Mar 2024 13:15:08 +0800 Subject: [PATCH 371/496] x86/kexec: Do not update E820 kexec table for setup_data crashkernel reservation failed on a Thinkpad t440s laptop recently. Actually the memblock reservation succeeded, but later insert_resource() failed. Test steps: kexec load -> /* make sure add crashkernel param eg. crashkernel=160M */ kexec reboot -> dmesg|grep "crashkernel reserved"; crashkernel memory range like below reserved successfully: 0x00000000d0000000 - 0x00000000da000000 But no such "Crash kernel" region in /proc/iomem The background story: Currently the E820 code reserves setup_data regions for both the current kernel and the kexec kernel, and it inserts them into the resources list. Before the kexec kernel reboots nobody passes the old setup_data, and kexec only passes fresh SETUP_EFI/SETUP_IMA/SETUP_RNG_SEED if needed. Thus the old setup data memory is not used at all. Due to old kernel updates the kexec e820 table as well so kexec kernel sees them as E820_TYPE_RESERVED_KERN regions, and later the old setup_data regions are inserted into resources list in the kexec kernel by e820__reserve_resources(). Note, due to no setup_data is passed in for those old regions they are not early reserved (by function early_reserve_memory), and the crashkernel memblock reservation will just treat them as usable memory and it could reserve the crashkernel region which overlaps with the old setup_data regions. And just like the bug I noticed here, kdump insert_resource failed because e820__reserve_resources has added the overlapped chunks in /proc/iomem already. Finally, looking at the code, the old setup_data regions are not used at all as no setup_data is passed in by the kexec boot loader. 
Although something like SETUP_PCI etc could be needed, kexec should pass the info as new setup_data so that kexec kernel can take care of them. This should be taken care of in other separate patches if needed. Thus drop the useless buggy code here. Signed-off-by: Dave Young Signed-off-by: Ingo Molnar Cc: Jiri Bohac Cc: Eric DeVolder Cc: Baoquan He Cc: Ard Biesheuvel Cc: Kees Cook Cc: "Kirill A. Shutemov" Link: https://lore.kernel.org/r/Zf0T3HCG-790K-pZ@darkstar.users.ipa.redhat.com --- arch/x86/kernel/e820.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b66f540de054..6f1b379e3b38 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1016,17 +1016,6 @@ void __init e820__reserve_setup_data(void) e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - /* - * SETUP_EFI, SETUP_IMA and SETUP_RNG_SEED are supplied by - * kexec and do not need to be reserved. - */ - if (data->type != SETUP_EFI && - data->type != SETUP_IMA && - data->type != SETUP_RNG_SEED) - e820__range_update_kexec(pa_data, - sizeof(*data) + data->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - if (data->type == SETUP_INDIRECT) { len += data->len; early_memunmap(data, sizeof(*data)); @@ -1038,12 +1027,9 @@ void __init e820__reserve_setup_data(void) indirect = (struct setup_indirect *)data->data; - if (indirect->type != SETUP_INDIRECT) { + if (indirect->type != SETUP_INDIRECT) e820__range_update(indirect->addr, indirect->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - e820__range_update_kexec(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } } pa_data = pa_next; @@ -1051,7 +1037,6 @@ void __init e820__reserve_setup_data(void) } e820__update_table(e820_table); - e820__update_table(e820_table_kexec); pr_info("extended physical RAM map:\n"); e820__print_table("reserve setup_data"); From d24b03535e5eb82e025219c2f632b485409c898f Mon Sep 17 00:00:00 2001 From: Ryosuke Yasuoka Date: Wed, 20 Mar 2024 09:54:10 +0900 Subject: [PATCH 372/496] nfc: nci: Fix uninit-value in nci_dev_up and nci_ntf_packet syzbot reported the following uninit-value access issue [1][2]: nci_rx_work() parses and processes received packet. When the payload length is zero, each message type handler reads uninitialized payload and KMSAN detects this issue. The receipt of a packet with a zero-size payload is considered unexpected, and therefore, such packets should be silently discarded. This patch resolved this issue by checking payload size before calling each message type handler codes. Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reported-and-tested-by: syzbot+7ea9413ea6749baf5574@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+29b5ca705d2e0f4a44d2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7ea9413ea6749baf5574 [1] Closes: https://syzkaller.appspot.com/bug?extid=29b5ca705d2e0f4a44d2 [2] Signed-off-by: Ryosuke Yasuoka Reviewed-by: Jeremy Cline Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. 
Miller --- net/nfc/nci/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index cdad47b140fa..0d26c8ec9993 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1516,6 +1516,11 @@ static void nci_rx_work(struct work_struct *work) nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); + if (!nci_plen(skb->data)) { + kfree_skb(skb); + break; + } + /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: From e3f269ed0accbb22aa8f25d2daffa23c3fccd407 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 14 Mar 2024 14:26:56 +0000 Subject: [PATCH 373/496] x86/pm: Work around false positive kmemleak report in msr_build_context() Since: 7ee18d677989 ("x86/power: Make restore_processor_context() sane") kmemleak reports this issue: unreferenced object 0xf68241e0 (size 32): comm "swapper/0", pid 1, jiffies 4294668610 (age 68.432s) hex dump (first 32 bytes): 00 cc cc cc 29 10 01 c0 00 00 00 00 00 00 00 00 ....)........... 00 42 82 f6 cc cc cc cc cc cc cc cc cc cc cc cc .B.............. backtrace: [<461c1d50>] __kmem_cache_alloc_node+0x106/0x260 [] __kmalloc+0x54/0x160 [] msr_build_context.constprop.0+0x35/0x100 [<46635aff>] pm_check_save_msr+0x63/0x80 [<6b6bb938>] do_one_initcall+0x41/0x1f0 [<3f3add60>] kernel_init_freeable+0x199/0x1e8 [<3b538fde>] kernel_init+0x1a/0x110 [<938ae2b2>] ret_from_fork+0x1c/0x28 Which is a false positive. Reproducer: - Run rsync of whole kernel tree (multiple times if needed). - start a kmemleak scan - Note this is just an example: a lot of our internal tests hit these. The root cause is similar to the fix in: b0b592cf0836 x86/pm: Fix false positive kmemleak report in msr_build_context() ie. the alignment within the packed struct saved_context which has everything unaligned as there is only "u16 gs;" at start of struct where in the past there were four u16 there thus aligning everything afterwards. The issue is with the fact that Kmemleak only searches for pointers that are aligned (see how pointers are scanned in kmemleak.c) so when the struct members are not aligned it doesn't see them. Testing: We run a lot of tests with our CI, and after applying this fix we do not see any kmemleak issues any more whilst without it we see hundreds of the above report. From a single, simple test run consisting of 416 individual test cases on kernel 5.10 x86 with kmemleak enabled we got 20 failures due to this, which is quite a lot. With this fix applied we get zero kmemleak related failures. Fixes: 7ee18d677989 ("x86/power: Make restore_processor_context() sane") Signed-off-by: Anton Altaparmakov Signed-off-by: Ingo Molnar Acked-by: "Rafael J. Wysocki" Cc: stable@vger.kernel.org Cc: Linus Torvalds Link: https://lore.kernel.org/r/20240314142656.17699-1-anton@tuxera.com --- arch/x86/include/asm/suspend_32.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index a800abb1a992..d8416b3bf832 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -12,11 +12,6 @@ /* image of the saved processor state */ struct saved_context { - /* - * On x86_32, all segment registers except gs are saved at kernel - * entry in pt_regs. 
- */ - u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; struct saved_msrs saved_msrs; @@ -27,6 +22,11 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + /* + * On x86_32, all segment registers except gs are saved at kernel + * entry in pt_regs. + */ + u16 gs; bool misc_enable_saved; } __attribute__((packed)); From 4e51653d5d871f40f1bd5cf95cc7f2d8b33d063b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 15 Mar 2024 00:17:30 +0900 Subject: [PATCH 374/496] kprobes/x86: Use copy_from_kernel_nofault() to read from unsafe address Read from an unsafe address with copy_from_kernel_nofault() in arch_adjust_kprobe_addr() because this function is used before checking the address is in text or not. Syzcaller bot found a bug and reported the case if user specifies inaccessible data area, arch_adjust_kprobe_addr() will cause a kernel panic. [ mingo: Clarified the comment. ] Fixes: cc66bb914578 ("x86/ibt,kprobes: Cure sym+0 equals fentry woes") Reported-by: Qiang Zhang Tested-by: Jinghao Jia Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/171042945004.154897.2221804961882915806.stgit@devnote2 --- arch/x86/kernel/kprobes/core.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 091b3ab76a18..d0e49bd7c6f3 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,7 +373,16 @@ out: kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - if (is_endbr(*(u32 *)addr)) { + u32 insn; + + /* + * Since 'addr' is not guaranteed to be safe to access, use + * copy_from_kernel_nofault() to read the instruction: + */ + if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) + return NULL; + + if (is_endbr(insn)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; From 203a6763ab699da0568fd2b76303d03bb121abd4 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 13 Mar 2024 16:32:27 -0700 Subject: [PATCH 375/496] Revert "crypto: pkcs7 - remove sha1 support" This reverts commit 16ab7cb5825fc3425c16ad2c6e53d827f382d7c6 because it broke iwd. iwd uses the KEYCTL_PKEY_* UAPIs via its dependency libell, and apparently it is relying on SHA-1 signature support. These UAPIs are fairly obscure, and their documentation does not mention which algorithms they support. iwd really should be using a properly supported userspace crypto library instead. Regardless, since something broke we have to revert the change. It may be possible that some parts of this commit can be reinstated without breaking iwd (e.g. probably the removal of MODULE_SIG_SHA1), but for now this just does a full revert to get things working again. 
Reported-by: Karel Balej Closes: https://lore.kernel.org/r/CZSHRUIJ4RKL.34T4EASV5DNJM@matfyz.cz Cc: Dimitri John Ledkov Signed-off-by: Eric Biggers Tested-by: Karel Balej Signed-off-by: Herbert Xu --- crypto/asymmetric_keys/mscode_parser.c | 3 + crypto/asymmetric_keys/pkcs7_parser.c | 4 ++ crypto/asymmetric_keys/public_key.c | 3 +- crypto/asymmetric_keys/signature.c | 2 +- crypto/asymmetric_keys/x509_cert_parser.c | 8 +++ crypto/testmgr.h | 80 +++++++++++++++++++++++ include/linux/oid_registry.h | 4 ++ kernel/module/Kconfig | 5 ++ 8 files changed, 107 insertions(+), 2 deletions(-) diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c index 05402ef8964e..8aecbe4637f3 100644 --- a/crypto/asymmetric_keys/mscode_parser.c +++ b/crypto/asymmetric_keys/mscode_parser.c @@ -75,6 +75,9 @@ int mscode_note_digest_algo(void *context, size_t hdrlen, oid = look_up_OID(value, vlen); switch (oid) { + case OID_sha1: + ctx->digest_algo = "sha1"; + break; case OID_sha256: ctx->digest_algo = "sha256"; break; diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 5b08c50722d0..231ad7b3789d 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -227,6 +227,9 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, struct pkcs7_parse_context *ctx = context; switch (ctx->last_oid) { + case OID_sha1: + ctx->sinfo->sig->hash_algo = "sha1"; + break; case OID_sha256: ctx->sinfo->sig->hash_algo = "sha256"; break; @@ -278,6 +281,7 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "rsa"; ctx->sinfo->sig->encoding = "pkcs1"; break; + case OID_id_ecdsa_with_sha1: case OID_id_ecdsa_with_sha224: case OID_id_ecdsa_with_sha256: case OID_id_ecdsa_with_sha384: diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index e5f22691febd..e314fd57e6f8 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -115,7 +115,8 @@ software_key_determine_akcipher(const struct public_key *pkey, */ if (!hash_algo) return -EINVAL; - if (strcmp(hash_algo, "sha224") != 0 && + if (strcmp(hash_algo, "sha1") != 0 && + strcmp(hash_algo, "sha224") != 0 && strcmp(hash_algo, "sha256") != 0 && strcmp(hash_algo, "sha384") != 0 && strcmp(hash_algo, "sha512") != 0 && diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c index 398983be77e8..2deff81f8af5 100644 --- a/crypto/asymmetric_keys/signature.c +++ b/crypto/asymmetric_keys/signature.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(decrypt_blob); * Sign the specified data blob using the private key specified by params->key. * The signature is wrapped in an encoding if params->encoding is specified * (eg. "pkcs1"). If the encoding needs to know the digest type, this can be - * passed through params->hash_algo (eg. "sha512"). + * passed through params->hash_algo (eg. "sha1"). * * Returns the length of the data placed in the signature buffer or an error. 
*/ diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 487204d39426..bb0bffa271b5 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -198,6 +198,10 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, default: return -ENOPKG; /* Unsupported combination */ + case OID_sha1WithRSAEncryption: + ctx->cert->sig->hash_algo = "sha1"; + goto rsa_pkcs1; + case OID_sha256WithRSAEncryption: ctx->cert->sig->hash_algo = "sha256"; goto rsa_pkcs1; @@ -214,6 +218,10 @@ int x509_note_sig_algo(void *context, size_t hdrlen, unsigned char tag, ctx->cert->sig->hash_algo = "sha224"; goto rsa_pkcs1; + case OID_id_ecdsa_with_sha1: + ctx->cert->sig->hash_algo = "sha1"; + goto ecdsa; + case OID_id_rsassa_pkcs1_v1_5_with_sha3_256: ctx->cert->sig->hash_algo = "sha3-256"; goto rsa_pkcs1; diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 986f331a5fc2..12e1c892f366 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -653,6 +653,30 @@ static const struct akcipher_testvec rsa_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = { { .key = + "\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1" + "\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68" + "\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08" + "\x98", + .key_len = 49, + .params = + "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48" + "\xce\x3d\x03\x01\x01", + .param_len = 21, + .m = + "\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae" + "\x63\x85\xe7\x82", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91" + "\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10" + "\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86" + "\x80\x6f\xa5\x79\x77\xda\xd0", + .c_size = 55, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { + .key = "\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd" "\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75" "\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee" @@ -756,6 +780,32 @@ static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = { { .key = + "\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41" + "\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9" + "\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc" + "\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8" + "\xaf", + .key_len = 65, + .params = + "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48" + "\xce\x3d\x03\x01\x07", + .param_len = 21, + .m = + "\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c" + "\x0b\xde\x6a\x42", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7" + "\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a" + "\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d" + "\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7" + "\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad", + .c_size = 72, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { + .key = "\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9" "\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0" 
"\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88" @@ -866,6 +916,36 @@ static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = { static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = { { + .key = /* secp384r1(sha1) */ + "\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1" + "\x5a\xa5\x24\xf1\x95\x06\x9e\x28\xfb\xc4\xb9\xbe\x5a\x0d\xd9\x9f" + "\xf3\xd1\x4d\x2d\x07\x99\xbd\xda\xa7\x66\xec\xbb\xea\xba\x79\x42" + "\xc9\x34\x89\x6a\xe7\x0b\xc3\xf2\xfe\x32\x30\xbe\xba\xf9\xdf\x7e" + "\x4b\x6a\x07\x8e\x26\x66\x3f\x1d\xec\xa2\x57\x91\x51\xdd\x17\x0e" + "\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3" + "\xf1", + .key_len = 97, + .params = + "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04" + "\x00\x22", + .param_len = 18, + .m = + "\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22" + "\x3a\x69\xc1\x93", + .m_size = 20, + .algo = OID_id_ecdsa_with_sha1, + .c = + "\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07" + "\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1" + "\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e" + "\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0" + "\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88" + "\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26" + "\x79\x12\x2a\xb7\xc5\x15\x92\xc5", + .c_size = 104, + .public_key_vec = true, + .siggen_sigver_test = true, + }, { .key = /* secp384r1(sha224) */ "\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0" "\xd9\x98\x8d\x92\x2a\xab\x9b\x11\xcb\x48\x18\xa1\xa9\x0d\xd5\x18" diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 3921fbed0b28..51421fdbb0ba 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -17,10 +17,12 @@ * build_OID_registry.pl to generate the data for look_up_OID(). */ enum OID { + OID_id_dsa_with_sha1, /* 1.2.840.10030.4.3 */ OID_id_dsa, /* 1.2.840.10040.4.1 */ OID_id_ecPublicKey, /* 1.2.840.10045.2.1 */ OID_id_prime192v1, /* 1.2.840.10045.3.1.1 */ OID_id_prime256v1, /* 1.2.840.10045.3.1.7 */ + OID_id_ecdsa_with_sha1, /* 1.2.840.10045.4.1 */ OID_id_ecdsa_with_sha224, /* 1.2.840.10045.4.3.1 */ OID_id_ecdsa_with_sha256, /* 1.2.840.10045.4.3.2 */ OID_id_ecdsa_with_sha384, /* 1.2.840.10045.4.3.3 */ @@ -28,6 +30,7 @@ enum OID { /* PKCS#1 {iso(1) member-body(2) us(840) rsadsi(113549) pkcs(1) pkcs-1(1)} */ OID_rsaEncryption, /* 1.2.840.113549.1.1.1 */ + OID_sha1WithRSAEncryption, /* 1.2.840.113549.1.1.5 */ OID_sha256WithRSAEncryption, /* 1.2.840.113549.1.1.11 */ OID_sha384WithRSAEncryption, /* 1.2.840.113549.1.1.12 */ OID_sha512WithRSAEncryption, /* 1.2.840.113549.1.1.13 */ @@ -64,6 +67,7 @@ enum OID { OID_PKU2U, /* 1.3.5.1.5.2.7 */ OID_Scram, /* 1.3.6.1.5.5.14 */ OID_certAuthInfoAccess, /* 1.3.6.1.5.5.7.1.1 */ + OID_sha1, /* 1.3.14.3.2.26 */ OID_id_ansip384r1, /* 1.3.132.0.34 */ OID_sha256, /* 2.16.840.1.101.3.4.2.1 */ OID_sha384, /* 2.16.840.1.101.3.4.2.2 */ diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 0ea1b2970a23..28db5b7589eb 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -236,6 +236,10 @@ choice possible to load a signed module containing the algorithm to check the signature on that module. 
+config MODULE_SIG_SHA1 + bool "Sign modules with SHA-1" + select CRYPTO_SHA1 + config MODULE_SIG_SHA256 bool "Sign modules with SHA-256" select CRYPTO_SHA256 @@ -265,6 +269,7 @@ endchoice config MODULE_SIG_HASH string depends on MODULE_SIG || IMA_APPRAISE_MODSIG + default "sha1" if MODULE_SIG_SHA1 default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 From 5a7e89d3315d1be86aff8a8bf849023cda6547f7 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Thu, 21 Mar 2024 16:08:45 -0500 Subject: [PATCH 376/496] crypto: iaa - Fix nr_cpus < nr_iaa case If nr_cpus < nr_iaa, the calculated cpus_per_iaa will be 0, which causes a divide-by-0 in rebalance_wq_table(). Make sure cpus_per_iaa is 1 in that case, and also in the nr_iaa == 0 case, even though cpus_per_iaa is never used if nr_iaa == 0, for paranoia. Cc: # v6.8+ Reported-by: Jerry Snitselaar Signed-off-by: Tom Zanussi Signed-off-by: Herbert Xu --- drivers/crypto/intel/iaa/iaa_crypto_main.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c index 1cd304de5388..b2191ade9011 100644 --- a/drivers/crypto/intel/iaa/iaa_crypto_main.c +++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c @@ -806,6 +806,8 @@ static int save_iaa_wq(struct idxd_wq *wq) return -EINVAL; cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa; + if (!cpus_per_iaa) + cpus_per_iaa = 1; out: return 0; } @@ -821,10 +823,12 @@ static void remove_iaa_wq(struct idxd_wq *wq) } } - if (nr_iaa) + if (nr_iaa) { cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa; - else - cpus_per_iaa = 0; + if (!cpus_per_iaa) + cpus_per_iaa = 1; + } else + cpus_per_iaa = 1; } static int wq_table_add_wqs(int iaa, int cpu) From 3cb4a4827596abc82e55b80364f509d0fefc3051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?KONDO=20KAZUMA=28=E8=BF=91=E8=97=A4=E3=80=80=E5=92=8C?= =?UTF-8?q?=E7=9C=9F=29?= Date: Fri, 22 Mar 2024 10:47:02 +0000 Subject: [PATCH 377/496] efi/libstub: fix efi_random_alloc() to allocate memory at alloc_min or higher address Following warning is sometimes observed while booting my servers: [ 3.594838] DMA: preallocated 4096 KiB GFP_KERNEL pool for atomic allocations [ 3.602918] swapper/0: page allocation failure: order:10, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0-1 ... [ 3.851862] DMA: preallocated 1024 KiB GFP_KERNEL|GFP_DMA pool for atomic allocation If 'nokaslr' boot option is set, the warning always happens. On x86, ZONE_DMA is small zone at the first 16MB of physical address space. When this problem happens, most of that space seems to be used by decompressed kernel. Thereby, there is not enough space at DMA_ZONE to meet the request of DMA pool allocation. The commit 2f77465b05b1 ("x86/efistub: Avoid placing the kernel below LOAD_PHYSICAL_ADDR") tried to fix this problem by introducing lower bound of allocation. But the fix is not complete. efi_random_alloc() allocates pages by following steps. 1. Count total available slots ('total_slots') 2. Select a slot ('target_slot') to allocate randomly 3. Calculate a starting address ('target') to be included target_slot 4. Allocate pages, which starting address is 'target' In step 1, 'alloc_min' is used to offset the starting address of memory chunk. But in step 3 'alloc_min' is not considered at all. As the result, 'target' can be miscalculated and become lower than 'alloc_min'. 
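A worked example with hypothetical numbers: with alloc_min = 16 MiB, md->phys_addr = 1 MiB, align = 2 MiB and target_slot = 0, step 3 as implemented computes target = round_up(1 MiB, 2 MiB) + 0 * 2 MiB = 2 MiB, which lies below alloc_min; taking the lower bound into account first gives round_up(max(1 MiB, 16 MiB), 2 MiB) = 16 MiB, as intended.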
When KASLR is disabled, 'target_slot' is always 0 and the problem happens everytime if the EFI memory map of the system meets the condition. Fix this problem by calculating 'target' considering 'alloc_min'. Cc: linux-efi@vger.kernel.org Cc: Tom Englund Cc: linux-kernel@vger.kernel.org Fixes: 2f77465b05b1 ("x86/efistub: Avoid placing the kernel below LOAD_PHYSICAL_ADDR") Signed-off-by: Kazuma Kondo Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/randomalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index 4e96a855fdf4..7e1852859550 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -120,7 +120,7 @@ efi_status_t efi_random_alloc(unsigned long size, continue; } - target = round_up(md->phys_addr, align) + target_slot * align; + target = round_up(max(md->phys_addr, alloc_min), align) + target_slot * align; pages = size / EFI_PAGE_SIZE; status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, From d8e45f2929b94099913eb66c3ebb18b5063e9421 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Mar 2024 15:51:36 -0800 Subject: [PATCH 378/496] overflow: Change DEFINE_FLEX to take __counted_by member The norm should be flexible array structures with __counted_by annotations, so DEFINE_FLEX() is updated to expect that. Rename the non-annotated version to DEFINE_RAW_FLEX(), and update the few existing users. Additionally add selftests for the macros. Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/20240306235128.it.933-kees@kernel.org Reviewed-by: Przemek Kitszel Signed-off-by: Kees Cook --- drivers/net/ethernet/intel/ice/ice_base.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_common.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_ddp.c | 8 +++---- drivers/net/ethernet/intel/ice/ice_lag.c | 6 ++--- drivers/net/ethernet/intel/ice/ice_sched.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_switch.c | 10 ++++----- include/linux/overflow.h | 25 +++++++++++++++++---- lib/overflow_kunit.c | 19 ++++++++++++++++ 8 files changed, 58 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index d2fd315556a3..a545a7917e4f 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -956,7 +956,7 @@ ice_vsi_cfg_txq(struct ice_vsi *vsi, struct ice_tx_ring *ring, int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_tx_ring **tx_rings, u16 q_idx) { - DEFINE_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1); + DEFINE_RAW_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1); if (q_idx >= vsi->alloc_txq || !tx_rings || !tx_rings[q_idx]) return -EINVAL; @@ -978,7 +978,7 @@ int ice_vsi_cfg_single_txq(struct ice_vsi *vsi, struct ice_tx_ring **tx_rings, static int ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_tx_ring **rings, u16 count) { - DEFINE_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1); + DEFINE_RAW_FLEX(struct ice_aqc_add_tx_qgrp, qg_buf, txqs, 1); int err = 0; u16 q_idx; diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 4d8111aeb0ff..db4b2844e1f7 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -4695,7 +4695,7 @@ ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, enum ice_disq_rst_src rst_src, u16 vmvf_num, struct ice_sq_cd *cd) { - 
DEFINE_FLEX(struct ice_aqc_dis_txq_item, qg_list, q_id, 1); + DEFINE_RAW_FLEX(struct ice_aqc_dis_txq_item, qg_list, q_id, 1); u16 i, buf_size = __struct_size(qg_list); struct ice_q_ctx *q_ctx; int status = -ENOENT; @@ -4917,7 +4917,7 @@ int ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, u16 *q_id) { - DEFINE_FLEX(struct ice_aqc_dis_txq_item, qg_list, q_id, 1); + DEFINE_RAW_FLEX(struct ice_aqc_dis_txq_item, qg_list, q_id, 1); u16 qg_size = __struct_size(qg_list); struct ice_hw *hw; int status = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index 7532d11ad7f3..fc91c4d41186 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -1938,8 +1938,8 @@ static enum ice_ddp_state ice_init_pkg_info(struct ice_hw *hw, */ static enum ice_ddp_state ice_get_pkg_info(struct ice_hw *hw) { - DEFINE_FLEX(struct ice_aqc_get_pkg_info_resp, pkg_info, pkg_info, - ICE_PKG_CNT); + DEFINE_RAW_FLEX(struct ice_aqc_get_pkg_info_resp, pkg_info, pkg_info, + ICE_PKG_CNT); u16 size = __struct_size(pkg_info); u32 i; @@ -1990,8 +1990,8 @@ static enum ice_ddp_state ice_chk_pkg_compat(struct ice_hw *hw, struct ice_pkg_hdr *ospkg, struct ice_seg **seg) { - DEFINE_FLEX(struct ice_aqc_get_pkg_info_resp, pkg, pkg_info, - ICE_PKG_CNT); + DEFINE_RAW_FLEX(struct ice_aqc_get_pkg_info_resp, pkg, pkg_info, + ICE_PKG_CNT); u16 size = __struct_size(pkg); enum ice_ddp_state state; u32 i; diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 467372d541d2..f97128b69f87 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -491,7 +491,7 @@ static void ice_lag_move_vf_node_tc(struct ice_lag *lag, u8 oldport, u8 newport, u16 vsi_num, u8 tc) { - DEFINE_FLEX(struct ice_aqc_move_elem, buf, teid, 1); + DEFINE_RAW_FLEX(struct ice_aqc_move_elem, buf, teid, 1); struct device *dev = ice_pf_to_dev(lag->pf); u16 numq, valq, num_moved, qbuf_size; u16 buf_size = __struct_size(buf); @@ -849,7 +849,7 @@ static void ice_lag_reclaim_vf_tc(struct ice_lag *lag, struct ice_hw *src_hw, u16 vsi_num, u8 tc) { - DEFINE_FLEX(struct ice_aqc_move_elem, buf, teid, 1); + DEFINE_RAW_FLEX(struct ice_aqc_move_elem, buf, teid, 1); struct device *dev = ice_pf_to_dev(lag->pf); u16 numq, valq, num_moved, qbuf_size; u16 buf_size = __struct_size(buf); @@ -1873,7 +1873,7 @@ static void ice_lag_move_vf_nodes_tc_sync(struct ice_lag *lag, struct ice_hw *dest_hw, u16 vsi_num, u8 tc) { - DEFINE_FLEX(struct ice_aqc_move_elem, buf, teid, 1); + DEFINE_RAW_FLEX(struct ice_aqc_move_elem, buf, teid, 1); struct device *dev = ice_pf_to_dev(lag->pf); u16 numq, valq, num_moved, qbuf_size; u16 buf_size = __struct_size(buf); diff --git a/drivers/net/ethernet/intel/ice/ice_sched.c b/drivers/net/ethernet/intel/ice/ice_sched.c index d174a4eeb899..a1525992d14b 100644 --- a/drivers/net/ethernet/intel/ice/ice_sched.c +++ b/drivers/net/ethernet/intel/ice/ice_sched.c @@ -237,7 +237,7 @@ static int ice_sched_remove_elems(struct ice_hw *hw, struct ice_sched_node *parent, u32 node_teid) { - DEFINE_FLEX(struct ice_aqc_delete_elem, buf, teid, 1); + DEFINE_RAW_FLEX(struct ice_aqc_delete_elem, buf, teid, 1); u16 buf_size = __struct_size(buf); u16 num_groups_removed = 0; int status; @@ -2219,7 +2219,7 @@ int ice_sched_move_nodes(struct ice_port_info *pi, struct ice_sched_node *parent, u16 num_items, u32 *list) { - DEFINE_FLEX(struct ice_aqc_move_elem, buf, teid, 1); + 
DEFINE_RAW_FLEX(struct ice_aqc_move_elem, buf, teid, 1); u16 buf_len = __struct_size(buf); struct ice_sched_node *node; u16 i, grps_movd = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index f84bab80ca42..d4baae8c3b72 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -1812,7 +1812,7 @@ ice_aq_alloc_free_vsi_list(struct ice_hw *hw, u16 *vsi_list_id, enum ice_sw_lkup_type lkup_type, enum ice_adminq_opc opc) { - DEFINE_FLEX(struct ice_aqc_alloc_free_res_elem, sw_buf, elem, 1); + DEFINE_RAW_FLEX(struct ice_aqc_alloc_free_res_elem, sw_buf, elem, 1); u16 buf_len = __struct_size(sw_buf); struct ice_aqc_res_elem *vsi_ele; int status; @@ -2081,7 +2081,7 @@ ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, */ int ice_alloc_recipe(struct ice_hw *hw, u16 *rid) { - DEFINE_FLEX(struct ice_aqc_alloc_free_res_elem, sw_buf, elem, 1); + DEFINE_RAW_FLEX(struct ice_aqc_alloc_free_res_elem, sw_buf, elem, 1); u16 buf_len = __struct_size(sw_buf); int status; @@ -4418,7 +4418,7 @@ int ice_alloc_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, u16 *counter_id) { - DEFINE_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); + DEFINE_RAW_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); u16 buf_len = __struct_size(buf); int status; @@ -4446,7 +4446,7 @@ int ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, u16 counter_id) { - DEFINE_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); + DEFINE_RAW_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); u16 buf_len = __struct_size(buf); int status; @@ -4476,7 +4476,7 @@ ice_free_res_cntr(struct ice_hw *hw, u8 type, u8 alloc_shared, u16 num_items, */ int ice_share_res(struct ice_hw *hw, u16 type, u8 shared, u16 res_id) { - DEFINE_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); + DEFINE_RAW_FLEX(struct ice_aqc_alloc_free_res_elem, buf, elem, 1); u16 buf_len = __struct_size(buf); u16 res_type; int status; diff --git a/include/linux/overflow.h b/include/linux/overflow.h index aa691f2119b0..0c7e3dcfe867 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -398,7 +398,7 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * @count: Number of elements in the array; must be compile-time const. * @initializer: initializer expression (could be empty for no init). */ -#define _DEFINE_FLEX(type, name, member, count, initializer) \ +#define _DEFINE_FLEX(type, name, member, count, initializer...) \ _Static_assert(__builtin_constant_p(count), \ "onstack flex array members require compile-time const count"); \ union { \ @@ -408,8 +408,8 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) type *name = (type *)&name##_u /** - * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing - * flexible array member. + * DEFINE_RAW_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member, when it does not have a __counted_by annotation. * * @type: structure type name, including "struct" keyword. * @name: Name for a variable to define. @@ -420,7 +420,24 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) * flexible array member. * Use __struct_size(@name) to get compile-time size of it afterwards. 
*/ -#define DEFINE_FLEX(type, name, member, count) \ +#define DEFINE_RAW_FLEX(type, name, member, count) \ _DEFINE_FLEX(type, name, member, count, = {}) +/** + * DEFINE_FLEX() - Define an on-stack instance of structure with a trailing + * flexible array member. + * + * @TYPE: structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @MEMBER: Name of the array member. + * @COUNTER: Name of the __counted_by member. + * @COUNT: Number of elements in the array; must be compile-time const. + * + * Define a zeroed, on-stack, instance of @TYPE structure with a trailing + * flexible array member. + * Use __struct_size(@NAME) to get compile-time size of it afterwards. + */ +#define DEFINE_FLEX(TYPE, NAME, MEMBER, COUNTER, COUNT) \ + _DEFINE_FLEX(TYPE, NAME, MEMBER, COUNT, = { .obj.COUNTER = COUNT, }) + #endif /* __LINUX_OVERFLOW_H */ diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c index 65e8a72a83bf..4ef31b0bb74d 100644 --- a/lib/overflow_kunit.c +++ b/lib/overflow_kunit.c @@ -1172,6 +1172,24 @@ static void castable_to_type_test(struct kunit *test) #undef TEST_CASTABLE_TO_TYPE } +struct foo { + int a; + u32 counter; + s16 array[] __counted_by(counter); +}; + +static void DEFINE_FLEX_test(struct kunit *test) +{ + DEFINE_RAW_FLEX(struct foo, two, array, 2); + DEFINE_FLEX(struct foo, eight, array, counter, 8); + DEFINE_FLEX(struct foo, empty, array, counter, 0); + + KUNIT_EXPECT_EQ(test, __struct_size(two), + sizeof(struct foo) + sizeof(s16) + sizeof(s16)); + KUNIT_EXPECT_EQ(test, __struct_size(eight), 24); + KUNIT_EXPECT_EQ(test, __struct_size(empty), sizeof(struct foo)); +} + static struct kunit_case overflow_test_cases[] = { KUNIT_CASE(u8_u8__u8_overflow_test), KUNIT_CASE(s8_s8__s8_overflow_test), @@ -1194,6 +1212,7 @@ static struct kunit_case overflow_test_cases[] = { KUNIT_CASE(overflows_type_test), KUNIT_CASE(same_type_test), KUNIT_CASE(castable_to_type_test), + KUNIT_CASE(DEFINE_FLEX_test), {} }; From 231dc3f0c936db142ef3fa922f1ab751dd532d70 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 21 Mar 2024 13:18:17 -0700 Subject: [PATCH 379/496] lkdtm/bugs: Improve warning message for compilers without counted_by support The current message for telling the user that their compiler does not support the counted_by attribute in the FAM_BOUNDS test does not make much sense either grammatically or semantically. Fix it to make it correct in both aspects. Signed-off-by: Nathan Chancellor Reviewed-by: Gustavo A. R. 
Silva Link: https://lore.kernel.org/r/20240321-lkdtm-improve-lack-of-counted_by-msg-v1-1-0fbf7481a29c@kernel.org Signed-off-by: Kees Cook --- drivers/misc/lkdtm/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c index b92767d6bdd2..5178c02b21eb 100644 --- a/drivers/misc/lkdtm/bugs.c +++ b/drivers/misc/lkdtm/bugs.c @@ -417,7 +417,7 @@ static void lkdtm_FAM_BOUNDS(void) pr_err("FAIL: survived access of invalid flexible array member index!\n"); if (!__has_attribute(__counted_by__)) - pr_warn("This is expected since this %s was built a compiler supporting __counted_by\n", + pr_warn("This is expected since this %s was built with a compiler that does not support __counted_by\n", lkdtm_kernel_info); else if (IS_ENABLED(CONFIG_UBSAN_BOUNDS)) pr_expected_config(CONFIG_UBSAN_TRAP); From f6c8f5e8694c7a78c94e408b628afa6255cc428a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 20 Mar 2024 19:02:14 -0700 Subject: [PATCH 380/496] tools: ynl: fix setting presence bits in simple nests When we set members of simple nested structures in requests we need to set "presence" bits for all the nesting layers below. This has nothing to do with the presence type of the last layer. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/20240321020214.1250202-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/ynl-gen-c.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 6b7eb2d2aaf1..a451cbfbd781 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -228,8 +228,11 @@ class Type(SpecAttr): presence = '' for i in range(0, len(ref)): presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}" - if self.presence_type() == 'bit': - code.append(presence + ' = 1;') + # Every layer below last is a nest, so we know it uses bit presence + # last layer is "self" and may be a complex type + if i == len(ref) - 1 and self.presence_type() != 'bit': + continue + code.append(presence + ' = 1;') code += self._setter_lines(ri, member, presence) func_name = f"{op_prefix(ri, direction, deref=deref)}_set_{'_'.join(ref)}" From 9145e2249ed68af99fdbbbf6111aaf0e2dfb72d0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Mar 2024 17:42:18 +0300 Subject: [PATCH 381/496] nexthop: fix uninitialized variable in nla_put_nh_group_stats() The "*hw_stats_used" value needs to be set on the success paths to prevent an uninitialized variable bug in the caller, nla_put_nh_group_stats(). 
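For illustration only, a minimal userspace sketch of this bug class; the names here are made up for the example and are not the nexthop code. An out-parameter has to be written on every path that returns success, including the early-exit one, or the caller reads an indeterminate value:

    #include <stdbool.h>
    #include <stdio.h>

    /* sketch: 'have_listeners' stands in for the notifier-list check */
    static int grp_hw_stats_update(bool have_listeners, bool *hw_stats_used)
    {
            if (!have_listeners) {
                    *hw_stats_used = false; /* the fix: set it before the early return */
                    return 0;
            }

            /* ... query the hardware ... */
            *hw_stats_used = true;
            return 0;
    }

    int main(void)
    {
            bool used;

            if (!grp_hw_stats_update(false, &used))
                    printf("hw stats used: %d\n", used);
            return 0;
    }
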
Fixes: 5072ae00aea4 ("net: nexthop: Expose nexthop group HW stats to user space") Signed-off-by: Dan Carpenter Reviewed-by: Jiri Pirko Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/f08ac289-d57f-4a1a-830f-cf9a0563cb9c@moroto.mountain Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 74928a9d1aa4..535856b0f0ed 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -768,8 +768,10 @@ static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) struct net *net = nh->net; int err; - if (nexthop_notifiers_is_empty(net)) + if (nexthop_notifiers_is_empty(net)) { + *hw_stats_used = false; return 0; + } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) From c04f7dfe6ec2a3a20a8578d5f67a436ae36e2a2a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 21 Mar 2024 19:30:42 +0200 Subject: [PATCH 382/496] ipv6: Fix address dump when IPv6 is disabled on an interface Cited commit started returning an error when user space requests to dump the interface's IPv6 addresses and IPv6 is disabled on the interface. Restore the previous behavior and do not return an error. Before cited commit: # ip address show dev dummy1 2: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 1a:52:02:5a:c2:6e brd ff:ff:ff:ff:ff:ff inet6 fe80::1852:2ff:fe5a:c26e/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 1000 # ip address show dev dummy1 2: dummy1: mtu 1000 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 1a:52:02:5a:c2:6e brd ff:ff:ff:ff:ff:ff After cited commit: # ip address show dev dummy1 2: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 1e:9b:94:00:ac:e8 brd ff:ff:ff:ff:ff:ff inet6 fe80::1c9b:94ff:fe00:ace8/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 1000 # ip address show dev dummy1 RTNETLINK answers: No such device Dump terminated With this patch: # ip address show dev dummy1 2: dummy1: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 42:35:fc:53:66:cf brd ff:ff:ff:ff:ff:ff inet6 fe80::4035:fcff:fe53:66cf/64 scope link proto kernel_ll valid_lft forever preferred_lft forever # ip link set dev dummy1 mtu 1000 # ip address show dev dummy1 2: dummy1: mtu 1000 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 42:35:fc:53:66:cf brd ff:ff:ff:ff:ff:ff Fixes: 9cc4cc329d30 ("ipv6: use xa_array iterator to implement inet6_dump_addr()") Reported-by: Gal Pressman Closes: https://lore.kernel.org/netdev/7e261328-42eb-411d-b1b4-ad884eeaae4d@linux.dev/ Tested-by: Gal Pressman Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240321173042.2151756-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 247bd4d8ee45..92db9b474f2b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5416,10 +5416,11 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, err = 0; if (fillargs.ifindex) { - err = -ENODEV; dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); - if (!dev) + if (!dev) { + err = -ENODEV; goto done; + } idev = __in6_dev_get(dev); if (idev) err = in6_dump_addrs(idev, skb, cb, From f7f5d1808b1b66935a24dd796dd1a0612ca9c147 Mon Sep 17 00:00:00 2001 From: 
Puranjay Mohan Date: Thu, 21 Mar 2024 15:39:39 +0000 Subject: [PATCH 383/496] bpf: verifier: fix addr_space_cast from as(1) to as(0) The verifier currently converts addr_space_cast from as(1) to as(0) that is: BPF_ALU64 | BPF_MOV | BPF_X with off=1 and imm=1 to BPF_ALU | BPF_MOV | BPF_X with imm=1 (32-bit mov) Because of this imm=1, the JITs that have bpf_jit_needs_zext() == true, interpret the converted instruction as BPF_ZEXT_REG(DST) which is a special form of mov32, used for doing explicit zero extension on dst. These JITs will just zero extend the dst reg and will not move the src to dst before the zext. Fix do_misc_fixups() to set imm=0 when converting addr_space_cast to a normal mov32. The JITs that have bpf_jit_needs_zext() == true rely on the verifier to emit zext instructions. Mark dst_reg as subreg when doing cast from as(1) to as(0) so the verifier emits a zext instruction after the mov. Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240321153939.113996-1-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dd3b99d1bb9..2cd7e0e79283 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14054,8 +14054,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (insn->imm) { /* off == BPF_ADDR_SPACE_CAST */ mark_reg_unknown(env, regs, insn->dst_reg); - if (insn->imm == 1) /* cast from as(1) to as(0) */ + if (insn->imm == 1) { /* cast from as(1) to as(0) */ dst_reg->type = PTR_TO_ARENA; + /* PTR_TO_ARENA is 32-bit */ + dst_reg->subreg_def = env->insn_idx + 1; + } } else if (insn->off == 0) { /* case: R1 = R2 * copy register state to dest reg @@ -19609,8 +19612,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (((struct bpf_map *)env->prog->aux->arena)->map_flags & BPF_F_NO_USER_CONV)) { /* convert to 32-bit mov that clears upper 32-bit */ insn->code = BPF_ALU | BPF_MOV | BPF_X; - /* clear off, so it's a normal 'wX = wY' from JIT pov */ + /* clear off and imm, so it's a normal 'wX = wY' from JIT pov */ insn->off = 0; + insn->imm = 0; } /* cast from as(0) to as(1) should be handled by JIT */ goto next_insn; } From fa3550dca8f02ec312727653a94115ef3ab68445 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 22 Mar 2024 13:35:52 +0000 Subject: [PATCH 384/496] selftests/bpf: verifier_arena: fix mmap address for arm64 The arena_list selftest uses (1ull << 32) in the mmap address computation for arm64. Use the same in the verifier_arena selftest. This makes the selftest pass for arm64 on the CI[1]. 
[1] https://github.com/kernel-patches/bpf/pull/6622 Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240322133552.70681-1-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/verifier_arena.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/verifier_arena.c b/tools/testing/selftests/bpf/progs/verifier_arena.c index 969bc091060b..93144ae6df74 100644 --- a/tools/testing/selftests/bpf/progs/verifier_arena.c +++ b/tools/testing/selftests/bpf/progs/verifier_arena.c @@ -12,7 +12,11 @@ struct { __uint(type, BPF_MAP_TYPE_ARENA); __uint(map_flags, BPF_F_MMAPABLE); __uint(max_entries, 2); /* arena of two pages close to 32-bit boundary*/ - __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, (1ull << 32) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */ +#else + __ulong(map_extra, (1ull << 44) | (~0u - __PAGE_SIZE * 2 + 1)); /* start of mmap() region */ +#endif } arena SEC(".maps"); SEC("syscall") From 122fdbd2a030a95128737fc77e47df15a8f170c3 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Fri, 22 Mar 2024 15:35:18 +0000 Subject: [PATCH 385/496] bpf: verifier: reject addr_space_cast insn without arena The verifier allows using the addr_space_cast instruction in a program that doesn't have an associated arena. This was caught in the form an invalid memory access in do_misc_fixups() when while converting addr_space_cast to a normal 32-bit mov, env->prog->aux->arena was dereferenced to check for BPF_F_NO_USER_CONV flag. Reject programs that include the addr_space_cast instruction but don't have an associated arena. root@rv-tester:~# ./reproducer Unable to handle kernel access to user memory without uaccess routines at virtual address 0000000000000030 Oops [#1] [] do_misc_fixups+0x43c/0x1168 [] bpf_check+0xda8/0x22b6 [] bpf_prog_load+0x486/0x8dc [] __sys_bpf+0xbd8/0x214e [] __riscv_sys_bpf+0x22/0x2a [] do_trap_ecall_u+0x102/0x17c [] ret_from_exception+0x0/0x64 Fixes: 6082b6c328b5 ("bpf: Recognize addr_space_cast instruction in the verifier.") Reported-by: xingwei lee Reported-by: yue sun Closes: https://lore.kernel.org/bpf/CABOYnLz09O1+2gGVJuCxd_24a-7UueXzV-Ff+Fr+h5EKFDiYCQ@mail.gmail.com/ Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240322153518.11555-1-puranjay12@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cd7e0e79283..0bfc0050db28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14022,6 +14022,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) verbose(env, "addr_space_cast insn can only convert between address space 1 and 0\n"); return -EINVAL; } + if (!env->prog->aux->arena) { + verbose(env, "addr_space_cast insn can only be used in a program that has an associated arena\n"); + return -EINVAL; + } } else { if ((insn->off != 0 && insn->off != 8 && insn->off != 16 && insn->off != 32) || insn->imm) { From c90399fbd74a0713d5972a6d931e4a9918621e88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2024 19:56:35 +0100 Subject: [PATCH 386/496] x86/cpu: Ensure that CPU info updates are propagated on UP The boot sequence evaluates CPUID information twice: 1) During early boot 2) When finalizing the early setup right before mitigations are selected and alternatives are patched. 
In both cases the evaluation is stored in boot_cpu_data, but on UP the copying of boot_cpu_data to the per CPU info of the boot CPU happens between #1 and #2. So any update which happens in #2 is never propagated to the per CPU info instance. Consolidate the whole logic and copy boot_cpu_data right before applying alternatives as that's the point where boot_cpu_data is in it's final state and not supposed to change anymore. This also removes the voodoo mb() from smp_prepare_cpus_common() which had absolutely no purpose. Fixes: 71eb4893cfaf ("x86/percpu: Cure per CPU madness on UP") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240322185305.127642785@linutronix.de --- arch/x86/kernel/cpu/common.c | 9 +++++++++ arch/x86/kernel/setup.c | 10 ---------- arch/x86/kernel/smpboot.c | 32 +++++--------------------------- 3 files changed, 14 insertions(+), 37 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ba8cf5e9ce56..5c1e6d6be267 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2307,6 +2307,8 @@ void arch_smt_update(void) void __init arch_cpu_finalize_init(void) { + struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); + identify_boot_cpu(); select_idle_routine(); @@ -2345,6 +2347,13 @@ void __init arch_cpu_finalize_init(void) fpu__init_system(); fpu__init_cpu(); + /* + * Ensure that access to the per CPU representation has the initial + * boot CPU configuration. + */ + *c = boot_cpu_data; + c->initialized = true; + alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3e1e96efadfe..ef206500ed6f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1206,16 +1206,6 @@ void __init i386_reserve_resources(void) #endif /* CONFIG_X86_32 */ -#ifndef CONFIG_SMP -void __init smp_prepare_boot_cpu(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - *c = boot_cpu_data; - c->initialized = true; -} -#endif - static struct notifier_block kernel_offset_notifier = { .notifier_call = dump_kernel_offset }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fe355c89f6c1..76bb65045c64 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -313,14 +313,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -static void __init smp_store_boot_cpu_info(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - *c = boot_cpu_data; - c->initialized = true; -} - /* * The bootstrap kernel entry code has set these up. 
Save them for * a given CPU @@ -1039,29 +1031,15 @@ static __init void disable_smp(void) cpumask_set_cpu(0, topology_die_cpumask(0)); } -static void __init smp_cpu_index_default(void) -{ - int i; - struct cpuinfo_x86 *c; - - for_each_possible_cpu(i) { - c = &cpu_data(i); - /* mark all to hotplug */ - c->cpu_index = nr_cpu_ids; - } -} - void __init smp_prepare_cpus_common(void) { unsigned int i; - smp_cpu_index_default(); - - /* - * Setup boot CPU information - */ - smp_store_boot_cpu_info(); /* Final full version of the data */ - mb(); + /* Mark all except the boot CPU as hotpluggable */ + for_each_possible_cpu(i) { + if (i) + per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids; + } for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); From 7af541cee1e0eb48c6eb439bc6309175339fa96f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2024 19:56:36 +0100 Subject: [PATCH 387/496] x86/topology: Don't evaluate logical IDs during early boot The local APICs have not yet been enumerated so the logical ID evaluation from the topology bitmaps does not work and would return an error code. Skip the evaluation during the early boot CPUID evaluation and only apply it on the final run. Fixes: 380414be78bf ("x86/cpu/topology: Use topology logical mapping mechanism") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240322185305.186943142@linutronix.de --- arch/x86/kernel/cpu/topology_common.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index a50ae8d63d1c..9a6069e7133c 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -140,7 +140,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) } } -static void topo_set_ids(struct topo_scan *tscan) +static void topo_set_ids(struct topo_scan *tscan, bool early) { struct cpuinfo_x86 *c = tscan->c; u32 apicid = c->topo.apicid; @@ -148,8 +148,10 @@ static void topo_set_ids(struct topo_scan *tscan) c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN); c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN); - c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); - c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + } /* Package relative core ID */ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >> @@ -187,7 +189,7 @@ void cpu_parse_topology(struct cpuinfo_x86 *c) tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]); } - topo_set_ids(&tscan); + topo_set_ids(&tscan, false); } void __init cpu_init_topology(struct cpuinfo_x86 *c) @@ -208,7 +210,7 @@ void __init cpu_init_topology(struct cpuinfo_x86 *c) x86_topo_system.dom_size[dom] = 1U << sft; } - topo_set_ids(&tscan); + topo_set_ids(&tscan, true); /* * AMD systems have Nodes per package which cannot be mapped to From 5e25eb25dae9fa0700bbe42aff0e2f105fcd096a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2024 19:56:38 +0100 Subject: [PATCH 388/496] x86/topology: Handle the !APIC case gracefully If there is no local APIC enumerated and registered then the topology bitmaps are empty. Therefore, topology_init_possible_cpus() will die with a division by zero exception. 
Prevent this by registering a fake APIC id to populate the topology bitmap. This also allows to use all topology query interfaces unconditionally. It does not affect the actual APIC code because either the local APIC address was not registered or no local APIC could be detected. Fixes: f1f758a80516 ("x86/topology: Add a mechanism to track topology via APIC IDs") Reported-by: Guenter Roeck Reported-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240322185305.242709302@linutronix.de --- arch/x86/kernel/cpu/topology.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 3259b1d4fefe..aaca8d235dc2 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -415,6 +415,17 @@ void __init topology_init_possible_cpus(void) unsigned int total = assigned + disabled; u32 apicid, firstid; + /* + * If there was no APIC registered, then fake one so that the + * topology bitmap is populated. That ensures that the code below + * is valid and the various query interfaces can be used + * unconditionally. This does not affect the actual APIC code in + * any way because either the local APIC address has not been + * registered or the local APIC was disabled on the command line. + */ + if (topo_info.boot_cpu_apic_id == BAD_APICID) + topology_register_boot_apic(0); + if (!restrict_to_up()) { if (WARN_ON_ONCE(assigned > nr_cpu_ids)) { disabled += assigned - nr_cpu_ids; From f2208aa12c27bfada3c15c550c03ca81d42dcac2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2024 19:56:39 +0100 Subject: [PATCH 389/496] x86/mpparse: Register APIC address only once The APIC address is registered twice. First during the early detection and afterwards when actually scanning the table for APIC IDs. The APIC and topology core warn about the second attempt. Restrict it to the early detection call. Fixes: 81287ad65da5 ("x86/apic: Sanitize APIC address setup") Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240322185305.297774848@linutronix.de --- arch/x86/kernel/mpparse.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 1ccd30c8246f..e89171b0347a 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -197,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (!smp_check_mpc(mpc, oem, str)) return 0; - /* Initialize the lapic mapping */ - if (!acpi_lapic) - register_lapic_address(mpc->lapic); - - if (early) + if (early) { + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); return 1; + } /* Now process the configuration blocks. */ while (count < mpc->length) { From a8ed59a3a8de2648e69dd5936f5771ac4c92d085 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 22 Mar 2024 11:20:15 -0700 Subject: [PATCH 390/496] Documentation/x86: Document that resctrl bandwidth control units are MiB The memory bandwidth software controller uses 2^20 units rather than 10^6. See mbm_bw_count() which computes bandwidth using the "SZ_1M" Linux define for 0x00100000. Update the documentation to use MiB when describing this feature. It's too late to fix the mount option "mba_MBps" as that is now an established user interface. 
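As a rough worked example of the unit difference being documented (plain userspace arithmetic, not resctrl code): 1 MiB is 2^20 = 1048576 bytes while 1 MB is 10^6 bytes, roughly a 4.9% gap, so a limit expressed in MiB/s is slightly larger than the same number read as MB/s:

    #include <stdio.h>

    int main(void)
    {
            const double MB  = 1000.0 * 1000.0;     /* 10^6 */
            const double MiB = 1024.0 * 1024.0;     /* 2^20, what SZ_1M expands to */

            printf("1 MiB = %.0f bytes, 1 MB = %.0f bytes\n", MiB, MB);
            printf("a 1000 MiB/s limit is %.1f MB/s\n", 1000.0 * MiB / MB);
            return 0;
    }
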
Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240322182016.196544-1-tony.luck@intel.com --- Documentation/arch/x86/resctrl.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/arch/x86/resctrl.rst b/Documentation/arch/x86/resctrl.rst index a6279df64a9d..3712d81cb50c 100644 --- a/Documentation/arch/x86/resctrl.rst +++ b/Documentation/arch/x86/resctrl.rst @@ -45,7 +45,7 @@ mount options are: Enable code/data prioritization in L2 cache allocations. "mba_MBps": Enable the MBA Software Controller(mba_sc) to specify MBA - bandwidth in MBps + bandwidth in MiBps "debug": Make debug files accessible. Available debug files are annotated with "Available only with debug option". @@ -526,7 +526,7 @@ threads start using more cores in an rdtgroup, the actual bandwidth may increase or vary although user specified bandwidth percentage is same. In order to mitigate this and make the interface more user friendly, -resctrl added support for specifying the bandwidth in MBps as well. The +resctrl added support for specifying the bandwidth in MiBps as well. The kernel underneath would use a software feedback mechanism or a "Software Controller(mba_sc)" which reads the actual bandwidth using MBM counters and adjust the memory bandwidth percentages to ensure:: @@ -573,13 +573,13 @@ Memory b/w domain is L3 cache. MB:=bandwidth0;=bandwidth1;... -Memory bandwidth Allocation specified in MBps +Memory bandwidth Allocation specified in MiBps --------------------------------------------- Memory bandwidth domain is L3 cache. :: - MB:=bw_MBps0;=bw_MBps1;... + MB:=bw_MiBps0;=bw_MiBps1;... Slow Memory Bandwidth Allocation (SMBA) --------------------------------------- From 10e4b5166df9ff7a2d5316138ca668b42d004422 Mon Sep 17 00:00:00 2001 From: Adamos Ttofari Date: Fri, 22 Mar 2024 16:04:39 -0700 Subject: [PATCH 391/496] x86/fpu: Keep xfd_state in sync with MSR_IA32_XFD Commit 672365477ae8 ("x86/fpu: Update XFD state where required") and commit 8bf26758ca96 ("x86/fpu: Add XFD state to fpstate") introduced a per CPU variable xfd_state to keep the MSR_IA32_XFD value cached, in order to avoid unnecessary writes to the MSR. On CPU hotplug MSR_IA32_XFD is reset to the init_fpstate.xfd, which wipes out any stale state. But the per CPU cached xfd value is not reset, which brings them out of sync. As a consequence a subsequent xfd_update_state() might fail to update the MSR which in turn can result in XRSTOR raising a #NM in kernel space, which crashes the kernel. To fix this, introduce xfd_set_state() to write xfd_state together with MSR_IA32_XFD, and use it in all places that set MSR_IA32_XFD. Fixes: 672365477ae8 ("x86/fpu: Update XFD state where required") Signed-off-by: Adamos Ttofari Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240322230439.456571-1-chang.seok.bae@intel.com Closes: https://lore.kernel.org/lkml/20230511152818.13839-1-attofari@amazon.de --- arch/x86/kernel/fpu/xstate.c | 5 +++-- arch/x86/kernel/fpu/xstate.h | 14 ++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 117e74c44e75..33a214b1a4ce 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -178,10 +178,11 @@ void fpu__init_cpu_xstate(void) * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. 
Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures - * that any stale state is wiped out from XFD. + * that any stale state is wiped out from XFD. Reset the per CPU + * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) - wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); + xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 3518fb26d06b..19ca623ffa2a 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -148,20 +148,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #endif #ifdef CONFIG_X86_64 +static inline void xfd_set_state(u64 xfd) +{ + wrmsrl(MSR_IA32_XFD, xfd); + __this_cpu_write(xfd_state, xfd); +} + static inline void xfd_update_state(struct fpstate *fpstate) { if (fpu_state_size_dynamic()) { u64 xfd = fpstate->xfd; - if (__this_cpu_read(xfd_state) != xfd) { - wrmsrl(MSR_IA32_XFD, xfd); - __this_cpu_write(xfd_state, xfd); - } + if (__this_cpu_read(xfd_state) != xfd) + xfd_set_state(xfd); } } extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu); #else +static inline void xfd_set_state(u64 xfd) { } + static inline void xfd_update_state(struct fpstate *fpstate) { } static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) { From 8a8a9c9047d1089598bdb010ec44d7f14b4f9203 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 22 Mar 2024 09:17:25 -0700 Subject: [PATCH 392/496] x86/cpu: Add model number for another Intel Arrow Lake mobile processor This one is the regular laptop CPU. Signed-off-by: Tony Luck Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20240322161725.195614-1-tony.luck@intel.com --- arch/x86/include/asm/intel-family.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index b65e9c46b922..d0941f4c2724 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -127,6 +127,7 @@ #define INTEL_FAM6_ARROWLAKE_H 0xC5 #define INTEL_FAM6_ARROWLAKE 0xC6 +#define INTEL_FAM6_ARROWLAKE_U 0xB5 #define INTEL_FAM6_LUNARLAKE_M 0xBD From 4d0d7e7852752ea56375ac8645f0239e21ca2b50 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 22 Mar 2024 10:41:06 -0500 Subject: [PATCH 393/496] x86/boot/64: Apply encryption mask to 5-level pagetable update When running with 5-level page tables, the kernel mapping PGD entry is updated to point to the P4D table. The assignment uses _PAGE_TABLE_NOENC, which, when SME is active (mem_encrypt=on), results in a page table entry without the encryption mask set, causing the system to crash on boot. Change the assignment to use _PAGE_TABLE instead of _PAGE_TABLE_NOENC so that the encryption mask is set for the PGD entry. 
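A conceptual sketch of why the _NOENC variant matters under SME; the bit positions and macro values below are purely illustrative stand-ins, not the real x86 page-table definitions. The encrypted variant simply carries the memory-encryption mask in addition to the normal table bits:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_PRESENT            (1ULL << 0)
    #define PAGE_RW                 (1ULL << 1)
    #define SME_ME_MASK             (1ULL << 47)    /* placeholder for the SME C-bit */

    #define PAGE_TABLE_NOENC        (PAGE_PRESENT | PAGE_RW)
    #define PAGE_TABLE              (PAGE_TABLE_NOENC | SME_ME_MASK)

    int main(void)
    {
            uint64_t p4d_phys = 0x1234000ULL;       /* made-up physical address */

            /* without the mask the entry describes an unencrypted mapping */
            printf("noenc entry: %#llx\n", (unsigned long long)(p4d_phys | PAGE_TABLE_NOENC));
            printf("enc entry:   %#llx\n", (unsigned long long)(p4d_phys | PAGE_TABLE));
            return 0;
    }
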
Fixes: 533568e06b15 ("x86/boot/64: Use RIP_REL_REF() to access early_top_pgt[]") Signed-off-by: Tom Lendacky Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/8f20345cda7dbba2cf748b286e1bc00816fe649a.1711122067.git.thomas.lendacky@amd.com --- arch/x86/kernel/head64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 212e8e06aeba..7d2eb035b6a3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -175,7 +175,7 @@ unsigned long __head __startup_64(unsigned long physaddr, p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE_NOENC; + pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; } RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; From 9843231c97267d72be38a0409f5097987bc2cfa4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Fri, 22 Mar 2024 10:41:07 -0500 Subject: [PATCH 394/496] x86/boot/64: Move 5-level paging global variable assignments back Commit 63bed9660420 ("x86/startup_64: Defer assignment of 5-level paging global variables") moved assignment of 5-level global variables to later in the boot in order to avoid having to use RIP relative addressing in order to set them. However, when running with 5-level paging and SME active (mem_encrypt=on), the variables are needed as part of the page table setup needed to encrypt the kernel (using pgd_none(), p4d_offset(), etc.). Since the variables haven't been set, the page table manipulation is done as if 4-level paging is active, causing the system to crash on boot. While only a subset of the assignments that were moved need to be set early, move all of the assignments back into check_la57_support() so that these assignments aren't spread between two locations. Instead of just reverting the fix, this uses the new RIP_REL_REF() macro when assigning the variables. 
Fixes: 63bed9660420 ("x86/startup_64: Defer assignment of 5-level paging global variables") Signed-off-by: Tom Lendacky Signed-off-by: Ingo Molnar Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/2ca419f4d0de719926fd82353f6751f717590a86.1711122067.git.thomas.lendacky@amd.com --- arch/x86/kernel/head64.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 7d2eb035b6a3..a817ed0724d1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -81,6 +81,13 @@ static inline bool check_la57_support(void) if (!(native_read_cr4() & X86_CR4_LA57)) return false; + RIP_REL_REF(__pgtable_l5_enabled) = 1; + RIP_REL_REF(pgdir_shift) = 48; + RIP_REL_REF(ptrs_per_p4d) = 512; + RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; + RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; + RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; + return true; } @@ -431,15 +438,6 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); - if (check_la57_support()) { - __pgtable_l5_enabled = 1; - pgdir_shift = 48; - ptrs_per_p4d = 512; - page_offset_base = __PAGE_OFFSET_BASE_L5; - vmalloc_base = __VMALLOC_BASE_L5; - vmemmap_base = __VMEMMAP_BASE_L5; - } - cr4_init_shadow(); /* Kill off the identity-map trampoline */ From cefcd4fe2e3aaf792c14c9e56dab89e3d7a65d02 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Mar 2024 17:03:58 +0200 Subject: [PATCH 395/496] x86/efistub: Call mixed mode boot services on the firmware's stack Normally, the EFI stub calls into the EFI boot services using the stack that was live when the stub was entered. According to the UEFI spec, this stack needs to be at least 128k in size - this might seem large but all asynchronous processing and event handling in EFI runs from the same stack and so quite a lot of space may be used in practice. In mixed mode, the situation is a bit different: the bootloader calls the 32-bit EFI stub entry point, which calls the decompressor's 32-bit entry point, where the boot stack is set up, using a fixed allocation of 16k. This stack is still in use when the EFI stub is started in 64-bit mode, and so all calls back into the EFI firmware will be using the decompressor's limited boot stack. Due to the placement of the boot stack right after the boot heap, any stack overruns have gone unnoticed. However, commit 5c4feadb0011983b ("x86/decompressor: Move global symbol references to C code") moved the definition of the boot heap into C code, and now the boot stack is placed right at the base of BSS, where any overruns will corrupt the end of the .data section. While it would be possible to work around this by increasing the size of the boot stack, doing so would affect all x86 systems, and mixed mode systems are a tiny (and shrinking) fraction of the x86 installed base. So instead, record the firmware stack pointer value when entering from the 32-bit firmware, and switch to this stack every time a EFI boot service call is made. 
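As a loose userspace analogy of the approach, using POSIX ucontext rather than anything EFI-specific (sizes and names are only for illustration): record a large stack up front and switch to it whenever the deep callee runs, instead of letting it run on the caller's small stack.

    #include <stdio.h>
    #include <stdlib.h>
    #include <ucontext.h>

    #define SERVICE_STACK_SIZE (128 * 1024)         /* the spec-sized stack */

    static ucontext_t caller_ctx, service_ctx;

    static void boot_service_call(void)
    {
            volatile char big[64 * 1024];           /* would not fit on a 16k stack */

            big[sizeof(big) - 1] = 0;
            printf("ran on the %u-byte service stack\n", (unsigned)SERVICE_STACK_SIZE);
    }

    int main(void)
    {
            void *stack = malloc(SERVICE_STACK_SIZE);

            getcontext(&service_ctx);
            service_ctx.uc_stack.ss_sp = stack;
            service_ctx.uc_stack.ss_size = SERVICE_STACK_SIZE;
            service_ctx.uc_link = &caller_ctx;      /* come back here afterwards */
            makecontext(&service_ctx, boot_service_call, 0);

            swapcontext(&caller_ctx, &service_ctx); /* switch stacks for the call */
            free(stack);
            return 0;
    }
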
Cc: # v6.1+ Signed-off-by: Ard Biesheuvel --- arch/x86/boot/compressed/efi_mixed.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/boot/compressed/efi_mixed.S b/arch/x86/boot/compressed/efi_mixed.S index f4e22ef774ab..719e939050cb 100644 --- a/arch/x86/boot/compressed/efi_mixed.S +++ b/arch/x86/boot/compressed/efi_mixed.S @@ -49,6 +49,11 @@ SYM_FUNC_START(startup_64_mixed_mode) lea efi32_boot_args(%rip), %rdx mov 0(%rdx), %edi mov 4(%rdx), %esi + + /* Switch to the firmware's stack */ + movl efi32_boot_sp(%rip), %esp + andl $~7, %esp + #ifdef CONFIG_EFI_HANDOVER_PROTOCOL mov 8(%rdx), %edx // saved bootparams pointer test %edx, %edx @@ -254,6 +259,9 @@ SYM_FUNC_START_LOCAL(efi32_entry) /* Store firmware IDT descriptor */ sidtl (efi32_boot_idt - 1b)(%ebx) + /* Store firmware stack pointer */ + movl %esp, (efi32_boot_sp - 1b)(%ebx) + /* Store boot arguments */ leal (efi32_boot_args - 1b)(%ebx), %ebx movl %ecx, 0(%ebx) @@ -318,5 +326,6 @@ SYM_DATA_END(efi32_boot_idt) SYM_DATA_LOCAL(efi32_boot_cs, .word 0) SYM_DATA_LOCAL(efi32_boot_ds, .word 0) +SYM_DATA_LOCAL(efi32_boot_sp, .long 0) SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0) SYM_DATA(efi_is64, .byte 1) From df7ecce842b846a04d087ba85fdb79a90e26a1b0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 22 Mar 2024 17:01:45 +0100 Subject: [PATCH 396/496] x86/efistub: Don't clear BSS twice in mixed mode Clearing BSS should only be done once, at the very beginning. efi_pe_entry() is the entrypoint from the firmware, which may not clear BSS and so it is done explicitly. However, efi_pe_entry() is also used as an entrypoint by the mixed mode startup code, in which case BSS will already have been cleared, and doing it again at this point will corrupt global variables holding the firmware's GDT/IDT and segment selectors. So make the memset() conditional on whether the EFI stub is running in native mode. Fixes: b3810c5a2cc4a666 ("x86/efistub: Clear decompressor BSS in native EFI entrypoint") Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/x86-stub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 2096ae09438e..1edf93e63897 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -476,7 +476,8 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_status_t status; char *cmdline_ptr; - memset(_bss, 0, _ebss - _bss); + if (efi_is_native()) + memset(_bss, 0, _ebss - _bss); efi_system_table = sys_table_arg; From 62b71cd73d41ddac6b1760402bbe8c4932e23531 Mon Sep 17 00:00:00 2001 From: Oleksandr Tymoshenko Date: Sat, 23 Mar 2024 06:33:33 +0000 Subject: [PATCH 397/496] efi: fix panic in kdump kernel Check if get_next_variable() is actually valid pointer before calling it. In kdump kernel this method is set to NULL that causes panic during the kexec-ed kernel boot. Tested with QEMU and OVMF firmware. 
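A minimal sketch of the pattern, with invented types rather than the real EFI runtime services table: an optional callback has to be checked for NULL before the call, since an environment such as a kdump kernel may leave it unset.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct var_ops {
            int (*get_next_variable)(unsigned long *name_size, char *name);
    };

    static bool generic_ops_supported(const struct var_ops *ops)
    {
            unsigned long name_size = 64;
            char name[64] = "";

            if (!ops->get_next_variable)            /* the added guard */
                    return false;

            return ops->get_next_variable(&name_size, name) == 0;
    }

    int main(void)
    {
            struct var_ops empty = { .get_next_variable = NULL };

            printf("supported: %d\n", generic_ops_supported(&empty));
            return 0;
    }
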
Fixes: bad267f9e18f ("efi: verify that variable services are supported") Signed-off-by: Oleksandr Tymoshenko Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8859fb0b006d..fdf07dd6f459 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -203,6 +203,8 @@ static bool generic_ops_supported(void) name_size = sizeof(name); + if (!efi.get_next_variable) + return false; status = efi.get_next_variable(&name_size, &name, &guid); if (status == EFI_UNSUPPORTED) return false; From 4cece764965020c22cff7665b18a012006359095 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Mar 2024 14:10:05 -0700 Subject: [PATCH 398/496] Linux 6.9-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 5e09b53b4850..763b6792d3d5 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 -PATCHLEVEL = 8 +PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* From 11763a8598f888dec631a8a903f7ada32181001f Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Fri, 2 Feb 2024 20:15:31 +0800 Subject: [PATCH 399/496] fs/9p: fix uaf in in v9fs_stat2inode_dotl The incorrect logical order of accessing the st object code in v9fs_fid_iget_dotl is causing this uaf. Fixes: 724a08450f74 ("fs/9p: simplify iget to remove unnecessary paths") Reported-and-tested-by: syzbot+7a3d75905ea1a830dbe5@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Tested-by: Breno Leitao Reviewed-by: Dominique Martinet Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index ef9db3e03506..2b313fe7003e 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -78,11 +78,11 @@ struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid) retval = v9fs_init_inode(v9ses, inode, &fid->qid, st->st_mode, new_decode_dev(st->st_rdev)); + v9fs_stat2inode_dotl(st, inode, 0); kfree(st); if (retval) goto error; - v9fs_stat2inode_dotl(st, inode, 0); v9fs_set_netfs_context(inode); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); From 10211b4a23cf4a3df5c11a10e5b3d371f16a906f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Feb 2024 22:22:50 +0000 Subject: [PATCH 400/496] fs/9p: remove redundant pointer v9ses Pointer v9ses is being assigned the value from the return of inlined function v9fs_inode2v9ses (which just returns inode->i_sb->s_fs_info). The pointer is not used after the assignment, so the variable is redundant and can be removed. 
Cleans up clang scan warnings such as: fs/9p/vfs_inode_dotl.c:300:28: warning: variable 'v9ses' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Reviewed-by: Dominique Martinet Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode_dotl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 2b313fe7003e..55dde186041a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -297,7 +297,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, umode_t omode) { int err; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; kgid_t gid; const unsigned char *name; @@ -307,7 +306,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap, struct posix_acl *dacl = NULL, *pacl = NULL; p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); - v9ses = v9fs_inode2v9ses(dir); omode |= S_IFDIR; if (dir->i_mode & S_ISGID) @@ -739,7 +737,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, kgid_t gid; const unsigned char *name; umode_t mode; - struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL, *dfid = NULL; struct inode *inode; struct p9_qid qid; @@ -749,7 +746,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); - v9ses = v9fs_inode2v9ses(dir); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { err = PTR_ERR(dfid); From a97b59ed796804612468a3fb0ac2a5567a100a7a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 25 Mar 2024 08:51:16 +0800 Subject: [PATCH 401/496] erofs: drop experimental warning for FSDAX As EXT4/XFS filesystems, FSDAX functionality is considered to be stable. Let's drop this warning. Reviewed-by: Jingbo Xu Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240325005116.106351-1-hsiangkao@linux.alibaba.com --- fs/erofs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 69308fd73e4a..c0eb139adb07 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -430,7 +430,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: - warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); set_opt(&ctx->opt, DAX_ALWAYS); clear_opt(&ctx->opt, DAX_NEVER); return true; From 7557d296ad439f66a87cd34917af2a4172517826 Mon Sep 17 00:00:00 2001 From: Sandeep Dhavale Date: Thu, 14 Mar 2024 16:14:06 -0700 Subject: [PATCH 402/496] MAINTAINERS: erofs: add myself as reviewer I have been contributing to erofs for sometime and I would like to help with code reviews as well. Signed-off-by: Sandeep Dhavale Acked-by: Chao Yu Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20240314231407.1000541-1-dhavale@google.com Signed-off-by: Gao Xiang --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb080..5ca50939497c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7941,6 +7941,7 @@ M: Gao Xiang M: Chao Yu R: Yue Hu R: Jeffle Xu +R: Sandeep Dhavale L: linux-erofs@lists.ozlabs.org S: Maintained W: https://erofs.docs.kernel.org From 9eb05877dbee03064d3d3483cd6702f610d5a358 Mon Sep 17 00:00:00 2001 From: Zoltan HERPAI Date: Wed, 20 Mar 2024 09:36:02 +0100 Subject: [PATCH 403/496] pwm: img: fix pwm clock lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 22e8e19 has introduced a regression in the imgchip->pwm_clk lookup, whereas the clock name has also been renamed to "imgchip". 
This causes the driver failing to load: [ 0.546905] img-pwm 18101300.pwm: failed to get imgchip clock [ 0.553418] img-pwm: probe of 18101300.pwm failed with error -2 Fix this lookup by reverting the clock name back to "pwm". Signed-off-by: Zoltan HERPAI Link: https://lore.kernel.org/r/20240320083602.81592-1-wigyori@uid0.hu Fixes: 22e8e19a46f7 ("pwm: img: Rename variable pointing to driver private data") Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-img.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index d79a96679a26..d6596583ed4e 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -284,9 +284,9 @@ static int img_pwm_probe(struct platform_device *pdev) return PTR_ERR(imgchip->sys_clk); } - imgchip->pwm_clk = devm_clk_get(&pdev->dev, "imgchip"); + imgchip->pwm_clk = devm_clk_get(&pdev->dev, "pwm"); if (IS_ERR(imgchip->pwm_clk)) { - dev_err(&pdev->dev, "failed to get imgchip clock\n"); + dev_err(&pdev->dev, "failed to get pwm clock\n"); return PTR_ERR(imgchip->pwm_clk); } From 0add699ad068d26e5b1da9ff28b15461fc4005df Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 20 Mar 2024 17:10:38 +0900 Subject: [PATCH 404/496] tracing: probes: Fix to zero initialize a local variable Fix to initialize 'val' local variable with zero. Dan reported that Smatch static code checker reports an error that a local 'val' variable needs to be initialized. Actually, the 'val' is expected to be initialized by FETCH_OP_ARG in the same loop, but it is not obvious. So initialize it with zero. Link: https://lore.kernel.org/all/171092223833.237219.17304490075697026697.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/b010488e-68aa-407c-add0-3e059254aaa0@moroto.mountain/ Fixes: 25f00e40ce79 ("tracing/probes: Support $argN in return probe (kprobe and fprobe)") Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 217169de0920..dfe3ee6035ec 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -839,7 +839,7 @@ out: void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) { struct probe_entry_arg *earg = tp->entry_arg; - unsigned long val; + unsigned long val = 0; int i; if (!earg) From d6c30c5a168f8586b8bcc0d8e42e2456eb05209b Mon Sep 17 00:00:00 2001 From: David Thompson Date: Wed, 20 Mar 2024 15:31:17 -0400 Subject: [PATCH 405/496] mlxbf_gige: stop PHY during open() error paths The mlxbf_gige_open() routine starts the PHY as part of normal initialization. The mlxbf_gige_open() routine must stop the PHY during its error paths. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Reviewed-by: Asmaa Mnebhi Reviewed-by: Andrew Lunn Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 3d09fa54598f..cef0e2d3f1a7 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -157,7 +157,7 @@ static int mlxbf_gige_open(struct net_device *netdev) err = mlxbf_gige_tx_init(priv); if (err) - goto free_irqs; + goto phy_deinit; err = mlxbf_gige_rx_init(priv); if (err) goto tx_deinit; @@ -185,6 +185,9 @@ static int mlxbf_gige_open(struct net_device *netdev) tx_deinit: mlxbf_gige_tx_deinit(priv); +phy_deinit: + phy_stop(phydev); + free_irqs: mlxbf_gige_free_irqs(priv); return err; From 6630036b7c228f57c7893ee0403e92c2db2cd21d Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Tue, 19 Mar 2024 13:50:32 +0000 Subject: [PATCH 406/496] fs/9p: fix uninitialized values during inode evict If an iget fails due to not being able to retrieve information from the server then the inode structure is only partially initialized. When the inode gets evicted, references to uninitialized structures (like fscache cookies) were being made. This patch checks for a bad_inode before doing anything other than clearing the inode from the cache. Since the inode is bad, it shouldn't have any state associated with it that needs to be written back (and there really isn't a way to complete those anyways). Reported-by: syzbot+eb83fe1cce5833cd66a0@syzkaller.appspotmail.com Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 360a5304ec03..b01b1bbf2493 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -344,17 +344,21 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode); __le32 __maybe_unused version; - truncate_inode_pages_final(&inode->i_data); + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); - version = cpu_to_le32(v9inode->qid.version); - netfs_clear_inode_writeback(inode, &version); + version = cpu_to_le32(v9inode->qid.version); + netfs_clear_inode_writeback(inode, &version); - clear_inode(inode); - filemap_fdatawrite(&inode->i_data); + clear_inode(inode); + filemap_fdatawrite(&inode->i_data); #ifdef CONFIG_9P_FSCACHE - fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); + if (v9fs_inode_cookie(v9inode)) + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); #endif + } else + clear_inode(inode); } struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid) From 27f8f108c8455b42ec5f55806c5dc73ae2c5d075 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 25 Mar 2024 09:59:48 +0100 Subject: [PATCH 407/496] wifi: mac80211: fix mlme_link_id_dbg() Make sure that the new mlme_link_id_dbg() macro honours CONFIG_MAC80211_MLME_DEBUG as intended to avoid spamming the log with messages like: wlan0: no EHT support, limiting to HE wlan0: determined local STA to be HE, BW limited to 160 MHz wlan0: determined AP xx:xx:xx:xx:xx:xx to be VHT wlan0: connecting with VHT mode, max bandwidth 160 MHz Fixes: 310c8387c638 ("wifi: mac80211: clean up connection process") Signed-off-by: Johan Hovold Link: https://msgid.link/20240325085948.26203-1-johan+linaro@kernel.org Tested-by: Kalle Valo Signed-off-by: Johannes Berg --- net/mac80211/debug.h | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index 49da401c5340..35a8ba25fa57 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -158,7 +158,7 @@ do { \ _sdata_dbg(print, sdata, "[link %d] " fmt, \ link_id, ##__VA_ARGS__); \ else \ - _sdata_dbg(1, sdata, fmt, ##__VA_ARGS__); \ + _sdata_dbg(print, sdata, fmt, ##__VA_ARGS__); \ } while (0) #define link_dbg(link, fmt, ...) \ _link_id_dbg(1, (link)->sdata, (link)->link_id, \ From 4f2bdb3c5e3189297e156b3ff84b140423d64685 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 16 Mar 2024 08:43:36 +0100 Subject: [PATCH 408/496] wifi: mac80211: check/clear fast rx for non-4addr sta VLAN changes When moving a station out of a VLAN and deleting the VLAN afterwards, the fast_rx entry still holds a pointer to the VLAN's netdev, which can cause use-after-free bugs. Fix this by immediately calling ieee80211_check_fast_rx after the VLAN change. Cc: stable@vger.kernel.org Reported-by: ranygh@riseup.net Signed-off-by: Felix Fietkau Link: https://msgid.link/20240316074336.40442-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f03452dc716d..f67c1d021812 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2199,15 +2199,14 @@ static int ieee80211_change_station(struct wiphy *wiphy, } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && - sta->sdata->u.vlan.sta) { - ieee80211_clear_fast_rx(sta); + sta->sdata->u.vlan.sta) RCU_INIT_POINTER(sta->sdata->u.vlan.sta, NULL); - } if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) ieee80211_vif_dec_num_mcast(sta->sdata); sta->sdata = vlansdata; + ieee80211_check_fast_rx(sta); ieee80211_check_fast_xmit(sta); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { From 774f8841f55d7ac4044c79812691649da203584a Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 14 Mar 2024 14:23:00 -0700 Subject: [PATCH 409/496] wifi: mac80211: fix ieee80211_bss_*_flags kernel-doc Running kernel-doc on ieee80211_i.h flagged the following: net/mac80211/ieee80211_i.h:145: warning: expecting prototype for enum ieee80211_corrupt_data_flags. Prototype was for enum ieee80211_bss_corrupt_data_flags instead net/mac80211/ieee80211_i.h:162: warning: expecting prototype for enum ieee80211_valid_data_flags. Prototype was for enum ieee80211_bss_valid_data_flags instead Fix these warnings. 
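For reference, a tiny made-up example of what kernel-doc checks here: the identifier named after "enum" in the comment must match the definition that follows, otherwise the "expecting prototype" warning fires.

    /**
     * enum bss_valid_data_flags - example flags
     * @BSS_VALID_WMM: WMM data came from a non-corrupt IE
     *
     * The name above must be "bss_valid_data_flags", not a shortened
     * variant, for kernel-doc to associate the comment with this enum.
     */
    enum bss_valid_data_flags {
            BSS_VALID_WMM = 1 << 0,
    };
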
Signed-off-by: Jeff Johnson Reviewed-by: Simon Horman Link: https://msgid.link/20240314-kdoc-ieee80211_i-v1-1-72b91b55b257@quicinc.com Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b6fead612b66..bd507d6b65e3 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -131,7 +131,7 @@ struct ieee80211_bss { }; /** - * enum ieee80211_corrupt_data_flags - BSS data corruption flags + * enum ieee80211_bss_corrupt_data_flags - BSS data corruption flags * @IEEE80211_BSS_CORRUPT_BEACON: last beacon frame received was corrupted * @IEEE80211_BSS_CORRUPT_PROBE_RESP: last probe response received was corrupted * @@ -144,7 +144,7 @@ enum ieee80211_bss_corrupt_data_flags { }; /** - * enum ieee80211_valid_data_flags - BSS valid data flags + * enum ieee80211_bss_valid_data_flags - BSS valid data flags * @IEEE80211_BSS_VALID_WMM: WMM/UAPSD data was gathered from non-corrupt IE * @IEEE80211_BSS_VALID_RATES: Supported rates were gathered from non-corrupt IE * @IEEE80211_BSS_VALID_ERP: ERP flag was gathered from non-corrupt IE From be23b2d7c3b7c8bf57b1cf0bf890bd65df9d0186 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Mar 2024 11:09:51 +0100 Subject: [PATCH 410/496] wifi: cfg80211: add a flag to disable wireless extensions Wireless extensions are already disabled if MLO is enabled, given that we cannot support MLO there with all the hard- coded assumptions about BSSID etc. However, the WiFi7 ecosystem is still stabilizing, and some devices may need MLO disabled while that happens. In that case, we might end up with a device that supports wext (but not MLO) in one kernel, and then breaks wext in the future (by enabling MLO), which is not desirable. Add a flag to let such drivers/devices disable wext even if MLO isn't yet enabled. Cc: stable@vger.kernel.org Link: https://msgid.link/20240314110951.b50f1dc4ec21.I656ddd8178eedb49dc5c6c0e70f8ce5807afb54f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ net/wireless/wext-core.c | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2e2be4fd2bb6..1e09329acc42 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4991,6 +4991,7 @@ struct cfg80211_ops { * set this flag to update channels on beacon hints. * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link * of an NSTR mobile AP MLD. + * @WIPHY_FLAG_DISABLE_WEXT: disable wireless extensions for this device */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), @@ -5002,6 +5003,7 @@ enum wiphy_flags { WIPHY_FLAG_4ADDR_STATION = BIT(6), WIPHY_FLAG_CONTROL_PORT_PROTOCOL = BIT(7), WIPHY_FLAG_IBSS_RSN = BIT(8), + WIPHY_FLAG_DISABLE_WEXT = BIT(9), WIPHY_FLAG_MESH_AUTH = BIT(10), WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index a161c64d1765..838ad6541a17 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -4,6 +4,7 @@ * Authors : Jean Tourrilhes - HPL - * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. 
* Copyright 2009 Johannes Berg + * Copyright (C) 2024 Intel Corporation * * (As all part of the Linux kernel, this file is GPL) */ @@ -662,7 +663,8 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) dev->ieee80211_ptr->wiphy->wext && dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) { wireless_warn_cfg80211_wext(); - if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) + if (dev->ieee80211_ptr->wiphy->flags & (WIPHY_FLAG_SUPPORTS_MLO | + WIPHY_FLAG_DISABLE_WEXT)) return NULL; return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); } @@ -704,7 +706,8 @@ static iw_handler get_handler(struct net_device *dev, unsigned int cmd) #ifdef CONFIG_CFG80211_WEXT if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) { wireless_warn_cfg80211_wext(); - if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) + if (dev->ieee80211_ptr->wiphy->flags & (WIPHY_FLAG_SUPPORTS_MLO | + WIPHY_FLAG_DISABLE_WEXT)) return NULL; handlers = dev->ieee80211_ptr->wiphy->wext; } From 5f404005055304830bbbee0d66af2964fc48f29e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Mar 2024 11:09:52 +0100 Subject: [PATCH 411/496] wifi: iwlwifi: mvm: disable MLO for the time being MLO ended up not really fully stable yet, we want to make sure it works well with the ecosystem before enabling it. Thus, remove the flag, but set WIPHY_FLAG_DISABLE_WEXT so we don't get wireless extensions back until we enable MLO for this hardware. Cc: stable@vger.kernel.org Reviewed-by: Miri Korenblit Link: https://msgid.link/20240314110951.d6ad146df98d.I47127e4fdbdef89e4ccf7483641570ee7871d4e6@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 1935630d3def..69227954e281 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -360,7 +360,7 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) if (mvm->mld_api_is_used && mvm->nvm_data->sku_cap_11be_enable && !iwlwifi_mod_params.disable_11ax && !iwlwifi_mod_params.disable_11be) - hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_MLO; + hw->wiphy->flags |= WIPHY_FLAG_DISABLE_WEXT; /* With MLD FW API, it tracks timing by itself, * no need for any timing from the host From ec50f3114e55406a1aad24b7dfaa1c3f4336d8eb Mon Sep 17 00:00:00 2001 From: Igor Artemiev Date: Mon, 11 Mar 2024 19:45:19 +0300 Subject: [PATCH 412/496] wifi: cfg80211: fix rdev_dump_mpp() arguments order Fix the order of arguments in the TP_ARGS macro for the rdev_dump_mpp tracepoint event. Found by Linux Verification Center (linuxtesting.org). 
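A userspace sketch of the bug class, not the tracing code itself: when two parameters share a type, swapping them at the call site compiles cleanly and only shows up in the recorded data.

    #include <stdio.h>

    static void trace_dump_mpp(const unsigned char *dst, const unsigned char *mpp)
    {
            printf("dst=%02x:... mpp=%02x:...\n", dst[0], mpp[0]);
    }

    int main(void)
    {
            unsigned char dst[6] = { 0xaa }, mpp[6] = { 0xbb };

            trace_dump_mpp(mpp, dst);       /* swapped: records mpp as dst and vice versa */
            trace_dump_mpp(dst, mpp);       /* correct order */
            return 0;
    }
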
Signed-off-by: Igor Artemiev Link: https://msgid.link/20240311164519.118398-1-Igor.A.Artemiev@mcst.ru Signed-off-by: Johannes Berg --- net/wireless/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index e039e66ab377..cbbf347c6b2e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1024,7 +1024,7 @@ TRACE_EVENT(rdev_get_mpp, TRACE_EVENT(rdev_dump_mpp, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int _idx, u8 *dst, u8 *mpp), - TP_ARGS(wiphy, netdev, _idx, mpp, dst), + TP_ARGS(wiphy, netdev, _idx, dst, mpp), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY From 2e6bd24339a6ff04413b2e49c0f2672d6f0edfa5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 18 Mar 2024 18:53:30 +0200 Subject: [PATCH 413/496] wifi: mac80211: fix prep_connection error path If prep_channel fails in prep_connection, the code releases the deflink's chanctx, which is wrong since we may be using a different link. It's already wrong to even do that always though, since we might still have the station. Remove it only if prep_channel succeeded and later updates fail. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240318184907.2780c1f08c3d.I033c9b15483933088f32a2c0789612a33dd33d82@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 47a2cba8313f..202b2ddb4cc1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7652,7 +7652,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata_info(sdata, "failed to insert STA entry for the AP (error %d)\n", err); - goto out_err; + goto out_release_chan; } } else WARN_ON_ONCE(!ether_addr_equal(link->u.mgd.bssid, cbss->bssid)); @@ -7663,8 +7663,9 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, return 0; +out_release_chan: + ieee80211_link_release_channel(link); out_err: - ieee80211_link_release_channel(&sdata->deflink); ieee80211_vif_set_links(sdata, 0, 0); return err; } From bbe806c294c9c4cd1221140d96e5f367673e393a Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 11 Mar 2024 08:28:01 +0200 Subject: [PATCH 414/496] wifi: iwlwifi: mvm: pick the version of SESSION_PROTECTION_NOTIF When we want to know whether we should look for the mac_id or the link_id in struct iwl_mvm_session_prot_notif, we should look at the version of SESSION_PROTECTION_NOTIF. This causes WARNINGs: WARNING: CPU: 0 PID: 11403 at drivers/net/wireless/intel/iwlwifi/mvm/time-event.c:959 iwl_mvm_rx_session_protect_notif+0x333/0x340 [iwlmvm] RIP: 0010:iwl_mvm_rx_session_protect_notif+0x333/0x340 [iwlmvm] Code: 00 49 c7 84 24 48 07 00 00 00 00 00 00 41 c6 84 24 78 07 00 00 ff 4c 89 f7 e8 e9 71 54 d9 e9 7d fd ff ff 0f 0b e9 23 fe ff ff <0f> 0b e9 1c fe ff ff 66 0f 1f 44 00 00 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffb4bb00003d40 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff9ae63a361000 RCX: ffff9ae4a98b60d4 RDX: ffff9ae4588499c0 RSI: 0000000000000305 RDI: ffff9ae4a98b6358 RBP: ffffb4bb00003d68 R08: 0000000000000003 R09: 0000000000000010 R10: ffffb4bb00003d00 R11: 000000000000000f R12: ffff9ae441399050 R13: ffff9ae4761329e8 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff9ae7af400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fb75680018 CR3: 00000003dae32006 CR4: 0000000000f70ef0 PKRU: 55555554 Call Trace: ? 
show_regs+0x69/0x80 ? __warn+0x8d/0x150 ? iwl_mvm_rx_session_protect_notif+0x333/0x340 [iwlmvm] ? report_bug+0x196/0x1c0 ? handle_bug+0x45/0x80 ? exc_invalid_op+0x1c/0xb0 ? asm_exc_invalid_op+0x1f/0x30 ? iwl_mvm_rx_session_protect_notif+0x333/0x340 [iwlmvm] iwl_mvm_rx_common+0x115/0x340 [iwlmvm] iwl_mvm_rx_mq+0xa6/0x100 [iwlmvm] iwl_pcie_rx_handle+0x263/0xa10 [iwlwifi] iwl_pcie_napi_poll_msix+0x32/0xd0 [iwlwifi] Fixes: 085d33c53012 ("wifi: iwlwifi: support link id in SESSION_PROTECTION_NOTIF") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://msgid.link/20240311081938.39d5618f7b9d.I564d863e53c6cbcb49141467932ecb6a9840b320@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/time-event.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c index a59d264a11c5..ad960faceb0d 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-event.c @@ -879,9 +879,8 @@ void iwl_mvm_rx_session_protect_notif(struct iwl_mvm *mvm, struct iwl_rx_packet *pkt = rxb_addr(rxb); struct iwl_mvm_session_prot_notif *notif = (void *)pkt->data; unsigned int ver = - iwl_fw_lookup_cmd_ver(mvm->fw, - WIDE_ID(MAC_CONF_GROUP, - SESSION_PROTECTION_CMD), 2); + iwl_fw_lookup_notif_ver(mvm->fw, MAC_CONF_GROUP, + SESSION_PROTECTION_NOTIF, 2); int id = le32_to_cpu(notif->mac_link_id); struct ieee80211_vif *vif; struct iwl_mvm_vif *mvmvif; From 847d7353e5a95d4df339dd86f5a4fb69f41eff75 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Mon, 11 Mar 2024 08:28:02 +0200 Subject: [PATCH 415/496] wifi: iwlwifi: mvm: consider having one active link Do not call iwl_mvm_mld_get_primary_link if only one link is active. In that case, the sole active link should be used. iwl_mvm_mld_get_primary_link returns -1 if only one link is active causing a warning. Fixes: 8c9bef26e98b ("wifi: iwlwifi: mvm: d3: implement suspend with MLO") Signed-off-by: Shaul Triebitz Signed-off-by: Miri Korenblit Link: https://msgid.link/20240311081938.6c50061bf69b.I05b0ac7fa7149eabaa5570a6f65b0d9bfb09a6f1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 553c6fffc7c6..52518a47554e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -1260,15 +1260,15 @@ static int __iwl_mvm_suspend(struct ieee80211_hw *hw, if (IS_ERR_OR_NULL(vif)) return 1; - if (ieee80211_vif_is_mld(vif) && vif->cfg.assoc) { + if (hweight16(vif->active_links) > 1) { /* - * Select the 'best' link. May need to revisit, it seems - * better to not optimize for throughput but rather range, - * reliability and power here - and select 2.4 GHz ... + * Select the 'best' link. + * May need to revisit, it seems better to not optimize + * for throughput but rather range, reliability and + * power here - and select 2.4 GHz ... 
*/ - primary_link = - iwl_mvm_mld_get_primary_link(mvm, vif, - vif->active_links); + primary_link = iwl_mvm_mld_get_primary_link(mvm, vif, + vif->active_links); if (WARN_ONCE(primary_link < 0, "no primary link in 0x%x\n", vif->active_links)) @@ -1277,6 +1277,8 @@ static int __iwl_mvm_suspend(struct ieee80211_hw *hw, ret = ieee80211_set_active_links(vif, BIT(primary_link)); if (ret) return ret; + } else if (vif->active_links) { + primary_link = __ffs(vif->active_links); } else { primary_link = 0; } From a8b5d4809b503da668966a8187b9872e6c85291c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 11 Mar 2024 08:28:05 +0200 Subject: [PATCH 416/496] wifi: iwlwifi: mvm: Configure the link mapping for non-MLD FW In the non MLD firmware flows, although the deflink is used, the mapping of link ID to BSS configuration was missing, which causes flows that need this mapping to crash. Fix this by adding the link ID to BSS configuration mapping to non MLD flows as well. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://msgid.link/20240311081938.0b5c361e8f0c.Ib11f41815d2efa5d1ec57f855de4c8563142987b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/link.c | 59 ++++++++++++++----- .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 7 +++ drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 4 ++ 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/link.c b/drivers/net/wireless/intel/iwlwifi/mvm/link.c index f13f13e6b71a..9f69e04594e4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/link.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/link.c @@ -46,6 +46,27 @@ static int iwl_mvm_link_cmd_send(struct iwl_mvm *mvm, return ret; } +int iwl_mvm_set_link_mapping(struct iwl_mvm *mvm, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf) +{ + struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); + struct iwl_mvm_vif_link_info *link_info = + mvmvif->link[link_conf->link_id]; + + if (link_info->fw_link_id == IWL_MVM_FW_LINK_ID_INVALID) { + link_info->fw_link_id = iwl_mvm_get_free_fw_link_id(mvm, + mvmvif); + if (link_info->fw_link_id >= + ARRAY_SIZE(mvm->link_id_to_link_conf)) + return -EINVAL; + + rcu_assign_pointer(mvm->link_id_to_link_conf[link_info->fw_link_id], + link_conf); + } + + return 0; +} + int iwl_mvm_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf) { @@ -55,19 +76,14 @@ int iwl_mvm_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct iwl_link_config_cmd cmd = {}; unsigned int cmd_id = WIDE_ID(MAC_CONF_GROUP, LINK_CONFIG_CMD); u8 cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, cmd_id, 1); + int ret; if (WARN_ON_ONCE(!link_info)) return -EINVAL; - if (link_info->fw_link_id == IWL_MVM_FW_LINK_ID_INVALID) { - link_info->fw_link_id = iwl_mvm_get_free_fw_link_id(mvm, - mvmvif); - if (link_info->fw_link_id >= ARRAY_SIZE(mvm->link_id_to_link_conf)) - return -EINVAL; - - rcu_assign_pointer(mvm->link_id_to_link_conf[link_info->fw_link_id], - link_conf); - } + ret = iwl_mvm_set_link_mapping(mvm, vif, link_conf); + if (ret) + return ret; /* Update SF - Disable if needed. if this fails, SF might still be on * while many macs are bound, which is forbidden - so fail the binding. 
@@ -248,6 +264,24 @@ send_cmd: return ret; } +int iwl_mvm_unset_link_mapping(struct iwl_mvm *mvm, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf) +{ + struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); + struct iwl_mvm_vif_link_info *link_info = + mvmvif->link[link_conf->link_id]; + + /* mac80211 thought we have the link, but it was never configured */ + if (WARN_ON(!link_info || + link_info->fw_link_id >= + ARRAY_SIZE(mvm->link_id_to_link_conf))) + return -EINVAL; + + RCU_INIT_POINTER(mvm->link_id_to_link_conf[link_info->fw_link_id], + NULL); + return 0; +} + int iwl_mvm_remove_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf) { @@ -257,13 +291,10 @@ int iwl_mvm_remove_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct iwl_link_config_cmd cmd = {}; int ret; - /* mac80211 thought we have the link, but it was never configured */ - if (WARN_ON(!link_info || - link_info->fw_link_id >= ARRAY_SIZE(mvm->link_id_to_link_conf))) + ret = iwl_mvm_unset_link_mapping(mvm, vif, link_conf); + if (ret) return 0; - RCU_INIT_POINTER(mvm->link_id_to_link_conf[link_info->fw_link_id], - NULL); cmd.link_id = cpu_to_le32(link_info->fw_link_id); iwl_mvm_release_fw_link_id(mvm, link_info->fw_link_id); link_info->fw_link_id = IWL_MVM_FW_LINK_ID_INVALID; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 69227954e281..8f4b063d6243 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1577,8 +1577,14 @@ static int iwl_mvm_mac_add_interface(struct ieee80211_hw *hw, mvmvif->mvm = mvm; /* the first link always points to the default one */ + mvmvif->deflink.fw_link_id = IWL_MVM_FW_LINK_ID_INVALID; + mvmvif->deflink.active = 0; mvmvif->link[0] = &mvmvif->deflink; + ret = iwl_mvm_set_link_mapping(mvm, vif, &vif->bss_conf); + if (ret) + goto out; + /* * Not much to do here. 
The stack will not allow interface * types or combinations that we didn't advertise, so we @@ -1783,6 +1789,7 @@ static void iwl_mvm_mac_remove_interface(struct ieee80211_hw *hw, mvm->p2p_device_vif = NULL; } + iwl_mvm_unset_link_mapping(mvm, vif, &vif->bss_conf); iwl_mvm_mac_ctxt_remove(mvm, vif); RCU_INIT_POINTER(mvm->vif_id_to_mac[mvmvif->id], NULL); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index a10b48947bca..f6d334eb93b7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1918,11 +1918,15 @@ int iwl_mvm_binding_remove_vif(struct iwl_mvm *mvm, struct ieee80211_vif *vif); u32 iwl_mvm_get_lmac_id(struct iwl_mvm *mvm, enum nl80211_band band); /* Links */ +int iwl_mvm_set_link_mapping(struct iwl_mvm *mvm, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); int iwl_mvm_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); int iwl_mvm_link_changed(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf, u32 changes, bool active); +int iwl_mvm_unset_link_mapping(struct iwl_mvm *mvm, struct ieee80211_vif *vif, + struct ieee80211_bss_conf *link_conf); int iwl_mvm_remove_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, struct ieee80211_bss_conf *link_conf); int iwl_mvm_disable_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif, From 134d715e9ee2611edfb51774608ad465266bb3ef Mon Sep 17 00:00:00 2001 From: Ayala Beker Date: Mon, 18 Mar 2024 18:53:22 +0200 Subject: [PATCH 417/496] wifi: mac80211: correctly set active links upon TTLM Fix ieee80211_ttlm_set_links() to not set all active links, but instead let the driver know that valid links status changed and select the active links properly. Fixes: 8f500fbc6c65 ("wifi: mac80211: process and save negotiated TID to Link mapping request") Signed-off-by: Ayala Beker Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://msgid.link/20240318184907.acddbbf39584.Ide858f95248fcb3e483c97fcaa14b0cd4e964b10@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 202b2ddb4cc1..96b70006b7fc 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5874,6 +5874,15 @@ static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, } if (sdata->vif.active_links != active_links) { + /* usable links are affected when active_links are changed, + * so notify the driver about the status change + */ + changed |= BSS_CHANGED_MLD_VALID_LINKS; + active_links &= sdata->vif.active_links; + if (!active_links) + active_links = + BIT(__ffs(sdata->vif.valid_links & + ~dormant_links)); ret = ieee80211_set_active_links(&sdata->vif, active_links); if (ret) { sdata_info(sdata, "Failed to set TTLM active links\n"); @@ -5888,7 +5897,6 @@ static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, goto out; } - changed |= BSS_CHANGED_MLD_VALID_LINKS; sdata->vif.suspended_links = suspended_links; if (sdata->vif.suspended_links) changed |= BSS_CHANGED_MLD_TTLM; From 06a093807eb7b5c5b29b6cff49f8174a4e702341 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Mar 2024 10:10:17 +0200 Subject: [PATCH 418/496] wifi: iwlwifi: mvm: rfi: fix potential response leaks If the rx payload length check fails, or if kmemdup() fails, we still need to free the command response. Fix that. 
Fixes: 21254908cbe9 ("iwlwifi: mvm: add RFI-M support") Co-authored-by: Anjaneyulu Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240319100755.db2fa0196aa7.I116293b132502ac68a65527330fa37799694b79c@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rfi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c b/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c index 2ecd32bed752..045c862a8fc4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c @@ -132,14 +132,18 @@ struct iwl_rfi_freq_table_resp_cmd *iwl_rfi_get_freq_table(struct iwl_mvm *mvm) if (ret) return ERR_PTR(ret); - if (WARN_ON_ONCE(iwl_rx_packet_payload_len(cmd.resp_pkt) != resp_size)) + if (WARN_ON_ONCE(iwl_rx_packet_payload_len(cmd.resp_pkt) != + resp_size)) { + iwl_free_resp(&cmd); return ERR_PTR(-EIO); + } resp = kmemdup(cmd.resp_pkt->data, resp_size, GFP_KERNEL); + iwl_free_resp(&cmd); + if (!resp) return ERR_PTR(-ENOMEM); - iwl_free_resp(&cmd); return resp; } From 045a5b645dd59929b0e05375f493cde3a0318271 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Mar 2024 10:10:20 +0200 Subject: [PATCH 419/496] wifi: iwlwifi: fw: don't always use FW dump trig Since the dump_data (struct iwl_fwrt_dump_data) is a union, it's not safe to unconditionally access and use the 'trig' member, it might be 'desc' instead. Access it only if it's known to be 'trig' rather than 'desc', i.e. if ini-debug is present. Cc: stable@vger.kernel.org Fixes: 0eb50c674a1e ("iwlwifi: yoyo: send hcmd to fw after dump collection completes.") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240319100755.e2976bc58b29.I72fbd6135b3623227de53d8a2bb82776066cb72b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c index db6d7013df66..c3bdf433d8f7 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c @@ -3081,8 +3081,6 @@ static void iwl_fw_dbg_collect_sync(struct iwl_fw_runtime *fwrt, u8 wk_idx) struct iwl_fw_dbg_params params = {0}; struct iwl_fwrt_dump_data *dump_data = &fwrt->dump.wks[wk_idx].dump_data; - u32 policy; - u32 time_point; if (!test_bit(wk_idx, &fwrt->dump.active_wks)) return; @@ -3113,13 +3111,16 @@ static void iwl_fw_dbg_collect_sync(struct iwl_fw_runtime *fwrt, u8 wk_idx) iwl_fw_dbg_stop_restart_recording(fwrt, ¶ms, false); - policy = le32_to_cpu(dump_data->trig->apply_policy); - time_point = le32_to_cpu(dump_data->trig->time_point); + if (iwl_trans_dbg_ini_valid(fwrt->trans)) { + u32 policy = le32_to_cpu(dump_data->trig->apply_policy); + u32 time_point = le32_to_cpu(dump_data->trig->time_point); - if (policy & IWL_FW_INI_APPLY_POLICY_DUMP_COMPLETE_CMD) { - IWL_DEBUG_FW_INFO(fwrt, "WRT: sending dump complete\n"); - iwl_send_dbg_dump_complete_cmd(fwrt, time_point, 0); + if (policy & IWL_FW_INI_APPLY_POLICY_DUMP_COMPLETE_CMD) { + IWL_DEBUG_FW_INFO(fwrt, "WRT: sending dump complete\n"); + iwl_send_dbg_dump_complete_cmd(fwrt, time_point, 0); + } } + if (fwrt->trans->dbg.last_tp_resetfw == IWL_FW_INI_RESET_FW_MODE_STOP_FW_ONLY) iwl_force_nmi(fwrt->trans); From c2ace6300600c634553657785dfe5ea0ed688ac2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Mar 2024 10:10:22 
+0200 Subject: [PATCH 420/496] wifi: iwlwifi: read txq->read_ptr under lock If we read txq->read_ptr without lock, we can read the same value twice, then obtain the lock, and reclaim from there to two different places, but crucially reclaim the same entry twice, resulting in the WARN_ONCE() a little later. Fix that by reading txq->read_ptr under lock. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240319100755.bf4c62196504.I978a7ca56c6bd6f1bf42c15aa923ba03366a840b@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/queue/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/queue/tx.c b/drivers/net/wireless/intel/iwlwifi/queue/tx.c index 33973a60d0bf..6229c785c845 100644 --- a/drivers/net/wireless/intel/iwlwifi/queue/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/queue/tx.c @@ -1589,9 +1589,9 @@ void iwl_txq_reclaim(struct iwl_trans *trans, int txq_id, int ssn, return; tfd_num = iwl_txq_get_cmd_index(txq, ssn); - read_ptr = iwl_txq_get_cmd_index(txq, txq->read_ptr); spin_lock_bh(&txq->lock); + read_ptr = iwl_txq_get_cmd_index(txq, txq->read_ptr); if (!test_bit(txq_id, trans->txqs.queue_used)) { IWL_DEBUG_TX_QUEUES(trans, "Q %d inactive - ignoring idx %d\n", From 17f64517bf5c26af56b6c3566273aad6646c3c4f Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 20 Mar 2024 23:26:23 +0200 Subject: [PATCH 421/496] wifi: iwlwifi: mvm: guard against invalid STA ID on removal Guard against invalid station IDs in iwl_mvm_mld_rm_sta_id as that would result in out-of-bounds array accesses. This prevents issues should the driver get into a bad state during error handling. Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240320232419.d523167bda9c.I1cffd86363805bf86a95d8bdfd4b438bb54baddc@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c index 1628bf55458f..23e64a757cfe 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-sta.c @@ -855,10 +855,15 @@ int iwl_mvm_mld_rm_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif, int iwl_mvm_mld_rm_sta_id(struct iwl_mvm *mvm, u8 sta_id) { - int ret = iwl_mvm_mld_rm_sta_from_fw(mvm, sta_id); + int ret; lockdep_assert_held(&mvm->mutex); + if (WARN_ON(sta_id == IWL_MVM_INVALID_STA)) + return 0; + + ret = iwl_mvm_mld_rm_sta_from_fw(mvm, sta_id); + RCU_INIT_POINTER(mvm->fw_id_to_mac_id[sta_id], NULL); RCU_INIT_POINTER(mvm->fw_id_to_link_sta[sta_id], NULL); return ret; From 19d82bdedaf2db0bfb3762dda714ea803065eed5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Mar 2024 23:26:32 +0200 Subject: [PATCH 422/496] wifi: iwlwifi: mvm: handle debugfs names more carefully With debugfs=off, we can get here with the dbgfs_dir being an ERR_PTR(). Instead of checking for all this, which is often flagged as a mistake, simply handle the names here more carefully by printing them, then we don't need extra checks. Also, while checking, I noticed theoretically 'buf' is too small, so fix that size as well. 
Cc: stable@vger.kernel.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218422 Fixes: c36235acb34f ("wifi: iwlwifi: mvm: rework debugfs handling") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240320232419.4dc1eb3dd015.I32f308b0356ef5bcf8d188dd98ce9b210e3ab9fd@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c index 51b01f7528be..7fe57ecd0682 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs-vif.c @@ -748,7 +748,9 @@ void iwl_mvm_vif_dbgfs_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif) { struct dentry *dbgfs_dir = vif->debugfs_dir; struct iwl_mvm_vif *mvmvif = iwl_mvm_vif_from_mac80211(vif); - char buf[100]; + char buf[3 * 3 + 11 + (NL80211_WIPHY_NAME_MAXLEN + 1) + + (7 + IFNAMSIZ + 1) + 6 + 1]; + char name[7 + IFNAMSIZ + 1]; /* this will happen in monitor mode */ if (!dbgfs_dir) @@ -761,10 +763,11 @@ void iwl_mvm_vif_dbgfs_add_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif) * find * netdev:wlan0 -> ../../../ieee80211/phy0/netdev:wlan0/iwlmvm/ */ - snprintf(buf, 100, "../../../%pd3/iwlmvm", dbgfs_dir); + snprintf(name, sizeof(name), "%pd", dbgfs_dir); + snprintf(buf, sizeof(buf), "../../../%pd3/iwlmvm", dbgfs_dir); - mvmvif->dbgfs_slink = debugfs_create_symlink(dbgfs_dir->d_name.name, - mvm->debugfs_dir, buf); + mvmvif->dbgfs_slink = + debugfs_create_symlink(name, mvm->debugfs_dir, buf); } void iwl_mvm_vif_dbgfs_rm_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif) From e78d7877308989ef91b64a3c746ae31324c07caa Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 20 Mar 2024 23:26:22 +0200 Subject: [PATCH 423/496] wifi: iwlwifi: mvm: include link ID when releasing frames When releasing frames from the reorder buffer, the link ID was not included in the RX status information. This subsequently led mac80211 to drop the frame. Change it so that the link information is set immediately when possible so that it doesn't not need to be filled in anymore when submitting the frame to mac80211. 
Fixes: b8a85a1d42d7 ("wifi: iwlwifi: mvm: rxmq: report link ID to mac80211") Signed-off-by: Benjamin Berg Tested-by: Emmanuel Grumbach Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240320232419.bbbd5e9bfe80.Iec1bf5c884e371f7bc5ea2534ed9ea8d3f2c0bf6@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 1484eaedf452..ce8d83c771a7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -236,21 +236,13 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm, static void iwl_mvm_pass_packet_to_mac80211(struct iwl_mvm *mvm, struct napi_struct *napi, struct sk_buff *skb, int queue, - struct ieee80211_sta *sta, - struct ieee80211_link_sta *link_sta) + struct ieee80211_sta *sta) { if (unlikely(iwl_mvm_check_pn(mvm, skb, queue, sta))) { kfree_skb(skb); return; } - if (sta && sta->valid_links && link_sta) { - struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); - - rx_status->link_valid = 1; - rx_status->link_id = link_sta->link_id; - } - ieee80211_rx_napi(mvm->hw, sta, skb, napi); } @@ -588,7 +580,7 @@ static void iwl_mvm_release_frames(struct iwl_mvm *mvm, while ((skb = __skb_dequeue(skb_list))) { iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, reorder_buf->queue, - sta, NULL /* FIXME */); + sta); reorder_buf->num_stored--; } } @@ -2213,6 +2205,11 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, if (IS_ERR(sta)) sta = NULL; link_sta = rcu_dereference(mvm->fw_id_to_link_sta[id]); + + if (sta && sta->valid_links && link_sta) { + rx_status->link_valid = 1; + rx_status->link_id = link_sta->link_id; + } } } else if (!is_multicast_ether_addr(hdr->addr2)) { /* @@ -2356,8 +2353,7 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, !(desc->amsdu_info & IWL_RX_MPDU_AMSDU_LAST_SUBFRAME)) rx_status->flag |= RX_FLAG_AMSDU_MORE; - iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta, - link_sta); + iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta); } out: rcu_read_unlock(); From c2deb2e971f5d9aca941ef13ee05566979e337a4 Mon Sep 17 00:00:00 2001 From: linke li Date: Thu, 21 Mar 2024 16:44:10 +0800 Subject: [PATCH 424/496] net: mark racy access on sk->sk_rcvbuf sk->sk_rcvbuf in __sock_queue_rcv_skb() and __sk_receive_skb() can be changed by other threads. Mark this as benign using READ_ONCE(). This patch is aimed at reducing the number of benign races reported by KCSAN in order to focus future debugging effort on harmful races. Signed-off-by: linke li Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 43bf3818c19e..0963689a5950 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -482,7 +482,7 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; - if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; @@ -552,7 +552,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; - if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { + if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); goto discard_and_relse; } From dbde9fd49aafc9a09480db2a827159b109042e1a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 25 Mar 2024 17:43:32 +0100 Subject: [PATCH 425/496] kunit: fix wireless test dependencies For the wireless tests, CONFIG_WLAN and CONFIG_NETDEVICES are needed, though seem to be available by default on ARCH=um, so we didn't notice this before. Add them to fix kunit running on other architectures. Fixes: 28b3df1fe6ba ("kunit: add wireless unit tests") Reported-by: Mark Brown Closes: https://lore.kernel.org/r/b743a5ec-3d07-4747-85e0-2fb2ef69db7c@sirena.org.uk/ Signed-off-by: Johannes Berg --- tools/testing/kunit/configs/all_tests.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index a6cf69a665e8..76d049cdfca1 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -28,6 +28,8 @@ CONFIG_MCTP_FLOWS=y CONFIG_INET=y CONFIG_MPTCP=y +CONFIG_NETDEVICES=y +CONFIG_WLAN=y CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_WLAN_VENDOR_INTEL=y From 817b18965b58a6e5fb6ce97abf01b03a205a6aea Mon Sep 17 00:00:00 2001 From: Steven Zou Date: Wed, 7 Feb 2024 09:49:59 +0800 Subject: [PATCH 426/496] ice: Refactor FW data type and fix bitmap casting issue According to the datasheet, the recipe association data is an 8-byte little-endian value. It is described as 'Bitmap of the recipe indexes associated with this profile', it is from 24 to 31 byte area in FW. Therefore, it is defined to '__le64 recipe_assoc' in struct ice_aqc_recipe_to_profile. And then fix the bitmap casting issue, as we must never ever use castings for bitmap type. 
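As background for the bitmap handling, here is a short sketch, using an invented wrapper function, of the cast-free conversion pattern the patch adopts: a Linux bitmap is an array of unsigned long, so copying it through a u8 * cast is only correct on 64-bit little-endian machines, while bitmap_to_arr64()/bitmap_from_arr64() combined with cpu_to_le64()/le64_to_cpu() are well defined regardless of word size and endianness.

/* Illustrative sketch, not the driver code: round-trip a recipe bitmap
 * through a 64-bit little-endian firmware field without casts.
 */
static void example_recipe_assoc(__le64 *fw_field, unsigned long *recipes)
{
	u64 tmp;

	bitmap_to_arr64(&tmp, recipes, 64);	/* bitmap -> u64, any endianness */
	*fw_field = cpu_to_le64(tmp);		/* u64 -> little-endian wire format */

	tmp = le64_to_cpu(*fw_field);		/* wire format -> u64 */
	bitmap_from_arr64(recipes, &tmp, 64);	/* u64 -> bitmap */
}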
Fixes: 1e0f9881ef79 ("ice: Flesh out implementation of support for SRIOV on bonded interface") Reviewed-by: Przemek Kitszel Reviewed-by: Andrii Staikov Reviewed-by: Jan Sokolowski Reviewed-by: Simon Horman Signed-off-by: Steven Zou Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- .../net/ethernet/intel/ice/ice_adminq_cmd.h | 3 ++- drivers/net/ethernet/intel/ice/ice_lag.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_switch.c | 24 +++++++++++-------- drivers/net/ethernet/intel/ice/ice_switch.h | 4 ++-- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 8040317c9561..1f3e7a6903e5 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -593,8 +593,9 @@ struct ice_aqc_recipe_data_elem { struct ice_aqc_recipe_to_profile { __le16 profile_id; u8 rsvd[6]; - DECLARE_BITMAP(recipe_assoc, ICE_MAX_NUM_RECIPES); + __le64 recipe_assoc; }; +static_assert(sizeof(struct ice_aqc_recipe_to_profile) == 16); /* Add/Update/Remove/Get switch rules (indirect 0x02A0, 0x02A1, 0x02A2, 0x02A3) */ diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 467372d541d2..a7a342809935 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -2041,7 +2041,7 @@ int ice_init_lag(struct ice_pf *pf) /* associate recipes to profiles */ for (n = 0; n < ICE_PROFID_IPV6_GTPU_IPV6_TCP_INNER; n++) { err = ice_aq_get_recipe_to_profile(&pf->hw, n, - (u8 *)&recipe_bits, NULL); + &recipe_bits, NULL); if (err) continue; @@ -2049,7 +2049,7 @@ int ice_init_lag(struct ice_pf *pf) recipe_bits |= BIT(lag->pf_recipe) | BIT(lag->lport_recipe); ice_aq_map_recipe_to_profile(&pf->hw, n, - (u8 *)&recipe_bits, NULL); + recipe_bits, NULL); } } diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index f84bab80ca42..ba0ef91e4c19 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -2025,12 +2025,12 @@ error_out: * ice_aq_map_recipe_to_profile - Map recipe to packet profile * @hw: pointer to the HW struct * @profile_id: package profile ID to associate the recipe with - * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @r_assoc: Recipe bitmap filled in and need to be returned as response * @cd: pointer to command details structure or NULL * Recipe to profile association (0x0291) */ int -ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 r_assoc, struct ice_sq_cd *cd) { struct ice_aqc_recipe_to_profile *cmd; @@ -2042,7 +2042,7 @@ ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, /* Set the recipe ID bit in the bitmask to let the device know which * profile we are associating the recipe to */ - memcpy(cmd->recipe_assoc, r_bitmap, sizeof(cmd->recipe_assoc)); + cmd->recipe_assoc = cpu_to_le64(r_assoc); return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } @@ -2051,12 +2051,12 @@ ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, * ice_aq_get_recipe_to_profile - Map recipe to packet profile * @hw: pointer to the HW struct * @profile_id: package profile ID to associate the recipe with - * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @r_assoc: Recipe bitmap filled in and need to 
be returned as response * @cd: pointer to command details structure or NULL * Associate profile ID with given recipe (0x0293) */ int -ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 *r_assoc, struct ice_sq_cd *cd) { struct ice_aqc_recipe_to_profile *cmd; @@ -2069,7 +2069,7 @@ ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); if (!status) - memcpy(r_bitmap, cmd->recipe_assoc, sizeof(cmd->recipe_assoc)); + *r_assoc = le64_to_cpu(cmd->recipe_assoc); return status; } @@ -2108,6 +2108,7 @@ int ice_alloc_recipe(struct ice_hw *hw, u16 *rid) static void ice_get_recp_to_prof_map(struct ice_hw *hw) { DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u64 recp_assoc; u16 i; for (i = 0; i < hw->switch_info->max_used_prof_index + 1; i++) { @@ -2115,8 +2116,9 @@ static void ice_get_recp_to_prof_map(struct ice_hw *hw) bitmap_zero(profile_to_recipe[i], ICE_MAX_NUM_RECIPES); bitmap_zero(r_bitmap, ICE_MAX_NUM_RECIPES); - if (ice_aq_get_recipe_to_profile(hw, i, (u8 *)r_bitmap, NULL)) + if (ice_aq_get_recipe_to_profile(hw, i, &recp_assoc, NULL)) continue; + bitmap_from_arr64(r_bitmap, &recp_assoc, ICE_MAX_NUM_RECIPES); bitmap_copy(profile_to_recipe[i], r_bitmap, ICE_MAX_NUM_RECIPES); for_each_set_bit(j, r_bitmap, ICE_MAX_NUM_RECIPES) @@ -5390,22 +5392,24 @@ ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, */ list_for_each_entry(fvit, &rm->fv_list, list_entry) { DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u64 recp_assoc; u16 j; status = ice_aq_get_recipe_to_profile(hw, fvit->profile_id, - (u8 *)r_bitmap, NULL); + &recp_assoc, NULL); if (status) goto err_unroll; + bitmap_from_arr64(r_bitmap, &recp_assoc, ICE_MAX_NUM_RECIPES); bitmap_or(r_bitmap, r_bitmap, rm->r_bitmap, ICE_MAX_NUM_RECIPES); status = ice_acquire_change_lock(hw, ICE_RES_WRITE); if (status) goto err_unroll; + bitmap_to_arr64(&recp_assoc, r_bitmap, ICE_MAX_NUM_RECIPES); status = ice_aq_map_recipe_to_profile(hw, fvit->profile_id, - (u8 *)r_bitmap, - NULL); + recp_assoc, NULL); ice_release_change_lock(hw); if (status) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h index db7e501b7e0a..89ffa1b51b5a 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.h +++ b/drivers/net/ethernet/intel/ice/ice_switch.h @@ -424,10 +424,10 @@ int ice_aq_add_recipe(struct ice_hw *hw, struct ice_aqc_recipe_data_elem *s_recipe_list, u16 num_recipes, struct ice_sq_cd *cd); int -ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 *r_assoc, struct ice_sq_cd *cd); int -ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 r_assoc, struct ice_sq_cd *cd); #endif /* _ICE_SWITCH_H_ */ From 1cb7fdb1dfde1aab66780b4ba44dba6402172111 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 5 Mar 2024 15:02:03 -0800 Subject: [PATCH 427/496] ice: fix memory corruption bug with suspend and rebuild The ice driver would previously panic after suspend. This is caused from the driver *only* calling the ice_vsi_free_q_vectors() function by itself, when it is suspending. Since commit b3e7b3a6ee92 ("ice: prevent NULL pointer deref during reload") the driver has zeroed out num_q_vectors, and only restored it in ice_vsi_cfg_def(). 
This further causes the ice_rebuild() function to allocate a zero length buffer, after which num_q_vectors is updated, and then the new value of num_q_vectors is used to index into the zero length buffer, which corrupts memory. The fix entails making sure all the code referencing num_q_vectors only does so after it has been reset via ice_vsi_cfg_def(). I didn't perform a full bisect, but I was able to test against 6.1.77 kernel and that ice driver works fine for suspend/resume with no panic, so sometime since then, this problem was introduced. Also clean up an un-needed init of a local variable in the function being modified. PANIC from 6.8.0-rc1: [1026674.915596] PM: suspend exit [1026675.664697] ice 0000:17:00.1: PTP reset successful [1026675.664707] ice 0000:17:00.1: 2755 msecs passed between update to cached PHC time [1026675.667660] ice 0000:b1:00.0: PTP reset successful [1026675.675944] ice 0000:b1:00.0: 2832 msecs passed between update to cached PHC time [1026677.137733] ixgbe 0000:31:00.0 ens787: NIC Link is Up 1 Gbps, Flow Control: None [1026677.190201] BUG: kernel NULL pointer dereference, address: 0000000000000010 [1026677.192753] ice 0000:17:00.0: PTP reset successful [1026677.192764] ice 0000:17:00.0: 4548 msecs passed between update to cached PHC time [1026677.197928] #PF: supervisor read access in kernel mode [1026677.197933] #PF: error_code(0x0000) - not-present page [1026677.197937] PGD 1557a7067 P4D 0 [1026677.212133] ice 0000:b1:00.1: PTP reset successful [1026677.212143] ice 0000:b1:00.1: 4344 msecs passed between update to cached PHC time [1026677.212575] [1026677.243142] Oops: 0000 [#1] PREEMPT SMP NOPTI [1026677.247918] CPU: 23 PID: 42790 Comm: kworker/23:0 Kdump: loaded Tainted: G W 6.8.0-rc1+ #1 [1026677.257989] Hardware name: Intel Corporation M50CYP2SBSTD/M50CYP2SBSTD, BIOS SE5C620.86B.01.01.0005.2202160810 02/16/2022 [1026677.269367] Workqueue: ice ice_service_task [ice] [1026677.274592] RIP: 0010:ice_vsi_rebuild_set_coalesce+0x130/0x1e0 [ice] [1026677.281421] Code: 0f 84 3a ff ff ff 41 0f b7 74 ec 02 66 89 b0 22 02 00 00 81 e6 ff 1f 00 00 e8 ec fd ff ff e9 35 ff ff ff 48 8b 43 30 49 63 ed <41> 0f b7 34 24 41 83 c5 01 48 8b 3c e8 66 89 b7 aa 02 00 00 81 e6 [1026677.300877] RSP: 0018:ff3be62a6399bcc0 EFLAGS: 00010202 [1026677.306556] RAX: ff28691e28980828 RBX: ff28691e41099828 RCX: 0000000000188000 [1026677.314148] RDX: 0000000000000000 RSI: 0000000000000010 RDI: ff28691e41099828 [1026677.321730] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [1026677.329311] R10: 0000000000000007 R11: ffffffffffffffc0 R12: 0000000000000010 [1026677.336896] R13: 0000000000000000 R14: 0000000000000000 R15: ff28691e0eaa81a0 [1026677.344472] FS: 0000000000000000(0000) GS:ff28693cbffc0000(0000) knlGS:0000000000000000 [1026677.353000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1026677.359195] CR2: 0000000000000010 CR3: 0000000128df4001 CR4: 0000000000771ef0 [1026677.366779] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1026677.374369] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1026677.381952] PKRU: 55555554 [1026677.385116] Call Trace: [1026677.388023] [1026677.390589] ? __die+0x20/0x70 [1026677.394105] ? page_fault_oops+0x82/0x160 [1026677.398576] ? do_user_addr_fault+0x65/0x6a0 [1026677.403307] ? exc_page_fault+0x6a/0x150 [1026677.407694] ? asm_exc_page_fault+0x22/0x30 [1026677.412349] ? 
ice_vsi_rebuild_set_coalesce+0x130/0x1e0 [ice] [1026677.418614] ice_vsi_rebuild+0x34b/0x3c0 [ice] [1026677.423583] ice_vsi_rebuild_by_type+0x76/0x180 [ice] [1026677.429147] ice_rebuild+0x18b/0x520 [ice] [1026677.433746] ? delay_tsc+0x8f/0xc0 [1026677.437630] ice_do_reset+0xa3/0x190 [ice] [1026677.442231] ice_service_task+0x26/0x440 [ice] [1026677.447180] process_one_work+0x174/0x340 [1026677.451669] worker_thread+0x27e/0x390 [1026677.455890] ? __pfx_worker_thread+0x10/0x10 [1026677.460627] kthread+0xee/0x120 [1026677.464235] ? __pfx_kthread+0x10/0x10 [1026677.468445] ret_from_fork+0x2d/0x50 [1026677.472476] ? __pfx_kthread+0x10/0x10 [1026677.476671] ret_from_fork_asm+0x1b/0x30 [1026677.481050] Fixes: b3e7b3a6ee92 ("ice: prevent NULL pointer deref during reload") Reported-by: Robert Elliott Signed-off-by: Jesse Brandeburg Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lib.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index ee3f0d3e3f6d..558422120312 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3091,7 +3091,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) { struct ice_vsi_cfg_params params = {}; struct ice_coalesce_stored *coalesce; - int prev_num_q_vectors = 0; + int prev_num_q_vectors; struct ice_pf *pf; int ret; @@ -3105,13 +3105,6 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (WARN_ON(vsi->type == ICE_VSI_VF && !vsi->vf)) return -EINVAL; - coalesce = kcalloc(vsi->num_q_vectors, - sizeof(struct ice_coalesce_stored), GFP_KERNEL); - if (!coalesce) - return -ENOMEM; - - prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); - ret = ice_vsi_realloc_stat_arrays(vsi); if (ret) goto err_vsi_cfg; @@ -3121,6 +3114,13 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (ret) goto err_vsi_cfg; + coalesce = kcalloc(vsi->num_q_vectors, + sizeof(struct ice_coalesce_stored), GFP_KERNEL); + if (!coalesce) + return -ENOMEM; + + prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); + ret = ice_vsi_cfg_tc_lan(pf, vsi); if (ret) { if (vsi_flags & ICE_VSI_FLAG_INIT) { @@ -3139,8 +3139,8 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) err_vsi_cfg_tc_lan: ice_vsi_decfg(vsi); -err_vsi_cfg: kfree(coalesce); +err_vsi_cfg: return ret; } From aec806fb4afba5fe80b09e29351379a4292baa43 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Tue, 5 Mar 2024 17:02:02 +0100 Subject: [PATCH 428/496] ixgbe: avoid sleeping allocation in ixgbe_ipsec_vf_add_sa() Change kzalloc() flags used in ixgbe_ipsec_vf_add_sa() to GFP_ATOMIC, to avoid sleeping in IRQ context. 
Dan Carpenter, with the help of Smatch, has found following issue: The patch eda0333ac293: "ixgbe: add VF IPsec management" from Aug 13, 2018 (linux-next), leads to the following Smatch static checker warning: drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c:917 ixgbe_ipsec_vf_add_sa() warn: sleeping in IRQ context The call tree that Smatch is worried about is: ixgbe_msix_other() <- IRQ handler -> ixgbe_msg_task() -> ixgbe_rcv_msg_from_vf() -> ixgbe_ipsec_vf_add_sa() Fixes: eda0333ac293 ("ixgbe: add VF IPsec management") Reported-by: Dan Carpenter Link: https://lore.kernel.org/intel-wired-lan/db31a0b0-4d9f-4e6b-aed8-88266eb5665c@moroto.mountain Reviewed-by: Michal Kubiak Signed-off-by: Przemek Kitszel Reviewed-by: Shannon Nelson Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 13a6fca31004..866024f2b9ee 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -914,7 +914,13 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) goto err_out; } - xs = kzalloc(sizeof(*xs), GFP_KERNEL); + algo = xfrm_aead_get_byname(aes_gcm_name, IXGBE_IPSEC_AUTH_BITS, 1); + if (unlikely(!algo)) { + err = -ENOENT; + goto err_out; + } + + xs = kzalloc(sizeof(*xs), GFP_ATOMIC); if (unlikely(!xs)) { err = -ENOMEM; goto err_out; @@ -930,14 +936,8 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) memcpy(&xs->id.daddr.a4, sam->addr, sizeof(xs->id.daddr.a4)); xs->xso.dev = adapter->netdev; - algo = xfrm_aead_get_byname(aes_gcm_name, IXGBE_IPSEC_AUTH_BITS, 1); - if (unlikely(!algo)) { - err = -ENOENT; - goto err_xs; - } - aead_len = sizeof(*xs->aead) + IXGBE_IPSEC_KEY_BITS / 8; - xs->aead = kzalloc(aead_len, GFP_KERNEL); + xs->aead = kzalloc(aead_len, GFP_ATOMIC); if (unlikely(!xs->aead)) { err = -ENOMEM; goto err_xs; From 47ce2956c7a61ff354723e28235205fa2012265b Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 13 Mar 2024 14:03:10 +0100 Subject: [PATCH 429/496] igc: Remove stale comment about Tx timestamping The initial igc Tx timestamping implementation used only one register for retrieving Tx timestamps. Commit 3ed247e78911 ("igc: Add support for multiple in-flight TX timestamps") added support for utilizing all four of them e.g., for multiple domain support. Remove the stale comment/FIXME. Fixes: 3ed247e78911 ("igc: Add support for multiple in-flight TX timestamps") Signed-off-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Reviewed-by: Przemek Kitszel Tested-by: Naama Meir Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 2e1cfbd82f4f..35ad40a803cb 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1642,10 +1642,6 @@ done: if (unlikely(test_bit(IGC_RING_FLAG_TX_HWTSTAMP, &tx_ring->flags) && skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { - /* FIXME: add support for retrieving timestamps from - * the other timer registers before skipping the - * timestamping request. 
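The rule the fix relies on, stated as a sketch with an invented helper: GFP_KERNEL allocations may sleep and are therefore not allowed anywhere in an IRQ handler's call chain, whereas GFP_ATOMIC never sleeps but fails more readily, so callers must tolerate a NULL return.

/* Illustrative helper, not driver code: pick the allocation flag based
 * on whether the caller may sleep. GFP_KERNEL is only safe in process
 * context; IRQ-driven paths such as the one above need GFP_ATOMIC.
 */
static void *example_alloc(size_t len, bool may_sleep)
{
	return kzalloc(len, may_sleep ? GFP_KERNEL : GFP_ATOMIC);
}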
- */ unsigned long flags; u32 tstamp_flags; From 443574b033876c85a35de4c65c14f7fe092222b2 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sun, 24 Mar 2024 10:33:06 +0000 Subject: [PATCH 430/496] riscv, bpf: Fix kfunc parameters incompatibility between bpf and riscv abi We encountered a failing case when running selftest in no_alu32 mode: The failure case is `kfunc_call/kfunc_call_test4` and its source code is like bellow: ``` long bpf_kfunc_call_test4(signed char a, short b, int c, long d) __ksym; int kfunc_call_test4(struct __sk_buff *skb) { ... tmp = bpf_kfunc_call_test4(-3, -30, -200, -1000); ... } ``` And its corresponding asm code is: ``` 0: r1 = -3 1: r2 = -30 2: r3 = 0xffffff38 # opcode: 18 03 00 00 38 ff ff ff 00 00 00 00 00 00 00 00 4: r4 = -1000 5: call bpf_kfunc_call_test4 ``` insn 2 is parsed to ld_imm64 insn to emit 0x00000000ffffff38 imm, and converted to int type and then send to bpf_kfunc_call_test4. But since it is zero-extended in the bpf calling convention, riscv jit will directly treat it as an unsigned 32-bit int value, and then fails with the message "actual 4294966063 != expected -1234". The reason is the incompatibility between bpf and riscv abi, that is, bpf will do zero-extension on uint, but riscv64 requires sign-extension on int or uint. We can solve this problem by sign extending the 32-bit parameters in kfunc. The issue is related to [0], and thanks to Yonghong and Alexei. Link: https://github.com/llvm/llvm-project/pull/84874 [0] Fixes: d40c3847b485 ("riscv, bpf: Add kfunc support for RV64") Signed-off-by: Pu Lehui Tested-by: Puranjay Mohan Reviewed-by: Puranjay Mohan Link: https://lore.kernel.org/r/20240324103306.2202954-1-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- arch/riscv/net/bpf_jit_comp64.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index aac190085472..1adf2f39ce59 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1463,6 +1463,22 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, if (ret < 0) return ret; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + const struct btf_func_model *fm; + int idx; + + fm = bpf_jit_find_kfunc_model(ctx->prog, insn); + if (!fm) + return -EINVAL; + + for (idx = 0; idx < fm->nr_args; idx++) { + u8 reg = bpf_to_rv_reg(BPF_REG_1 + idx, ctx); + + if (fm->arg_size[idx] == sizeof(int)) + emit_sextw(reg, reg, ctx); + } + } + ret = emit_call(addr, fixed_addr, ctx); if (ret) return ret; From cc2699268152d8e0386a36fe7c9271d7e23668f2 Mon Sep 17 00:00:00 2001 From: Prasad Pandit Date: Fri, 22 Mar 2024 17:18:19 +0530 Subject: [PATCH 431/496] dpll: indent DPLL option type by a tab Indent config option type by a tab. It helps Kconfig parsers to read file without error. 
Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions") Signed-off-by: Prasad Pandit Reviewed-by: Vadim Fedorenko Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240322114819.1801795-1-ppandit@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dpll/Kconfig b/drivers/dpll/Kconfig index a4cae73f20d3..20607ed54243 100644 --- a/drivers/dpll/Kconfig +++ b/drivers/dpll/Kconfig @@ -4,4 +4,4 @@ # config DPLL - bool + bool From afb373ff3f54c9d909efc7f810dc80a9742807b2 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 21 Mar 2024 12:53:37 +0100 Subject: [PATCH 432/496] s390/qeth: handle deferred cc1 The IO subsystem expects a driver to retry a ccw_device_start, when the subsequent interrupt response block (irb) contains a deferred condition code 1. Symptoms before this commit: On the read channel we always trigger the next read anyhow, so no different behaviour here. On the write channel we may experience timeout errors, because the expected reply will never be received without the retry. Other callers of qeth_send_control_data() may wrongly assume that the ccw was successful, which may cause problems later. Note that since commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") and commit 5ef1dc40ffa6 ("s390/cio: fix invalid -EBUSY on ccw_device_start") deferred CC1s are much more likely to occur. See the commit message of the latter for more background information. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Signed-off-by: Alexandra Winter Co-developed-by: Thorsten Winkler Signed-off-by: Thorsten Winkler Reviewed-by: Peter Oberparleiter Link: https://lore.kernel.org/r/20240321115337.3564694-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski --- drivers/s390/net/qeth_core_main.c | 38 +++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index a0cce6872075..f0b8b709649f 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1179,6 +1179,20 @@ static int qeth_check_irb_error(struct qeth_card *card, struct ccw_device *cdev, } } +/** + * qeth_irq() - qeth interrupt handler + * @cdev: ccw device + * @intparm: expect pointer to iob + * @irb: Interruption Response Block + * + * In the good path: + * corresponding qeth channel is locked with last used iob as active_cmd. + * But this function is also called for error interrupts. + * + * Caller ensures that: + * Interrupts are disabled; ccw device lock is held; + * + */ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb) { @@ -1220,11 +1234,10 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, iob = (struct qeth_cmd_buffer *) (addr_t)intparm; } - qeth_unlock_channel(card, channel); - rc = qeth_check_irb_error(card, cdev, irb); if (rc) { /* IO was terminated, free its resources. 
*/ + qeth_unlock_channel(card, channel); if (iob) qeth_cancel_cmd(iob, rc); return; @@ -1268,6 +1281,7 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, rc = qeth_get_problem(card, cdev, irb); if (rc) { card->read_or_write_problem = 1; + qeth_unlock_channel(card, channel); if (iob) qeth_cancel_cmd(iob, rc); qeth_clear_ipacmd_list(card); @@ -1276,6 +1290,26 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, } } + if (scsw_cmd_is_valid_cc(&irb->scsw) && irb->scsw.cmd.cc == 1 && iob) { + /* channel command hasn't started: retry. + * active_cmd is still set to last iob + */ + QETH_CARD_TEXT(card, 2, "irqcc1"); + rc = ccw_device_start_timeout(cdev, __ccw_from_cmd(iob), + (addr_t)iob, 0, 0, iob->timeout); + if (rc) { + QETH_DBF_MESSAGE(2, + "ccw retry on %x failed, rc = %i\n", + CARD_DEVID(card), rc); + QETH_CARD_TEXT_(card, 2, " err%d", rc); + qeth_unlock_channel(card, channel); + qeth_cancel_cmd(iob, rc); + } + return; + } + + qeth_unlock_channel(card, channel); + if (iob) { /* sanity check: */ if (irb->scsw.cmd.count > iob->length) { From 3a38a829c8bc27d78552c28e582eb1d885d07d11 Mon Sep 17 00:00:00 2001 From: Claus Hansen Ries Date: Thu, 21 Mar 2024 13:08:59 +0000 Subject: [PATCH 433/496] net: ll_temac: platform_get_resource replaced by wrong function The function platform_get_resource was replaced with devm_platform_ioremap_resource_byname and is called using 0 as name. This eventually ends up in platform_get_resource_byname in the call stack, where it causes a null pointer in strcmp. if (type == resource_type(r) && !strcmp(r->name, name)) It should have been replaced with devm_platform_ioremap_resource. Fixes: bd69058f50d5 ("net: ll_temac: Use devm_platform_ioremap_resource_byname()") Signed-off-by: Claus Hansen Ries Cc: stable@vger.kernel.org Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/cca18f9c630a41c18487729770b492bb@terma.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/ll_temac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 9df39cf8b097..1072e2210aed 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -1443,7 +1443,7 @@ static int temac_probe(struct platform_device *pdev) } /* map device registers */ - lp->regs = devm_platform_ioremap_resource_byname(pdev, 0); + lp->regs = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(lp->regs)) { dev_err(&pdev->dev, "could not map TEMAC registers\n"); return -ENOMEM; From b11c81731c810efe592e510bb0110e0db6877419 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Fri, 22 Mar 2024 15:34:47 +0530 Subject: [PATCH 434/496] net: hsr: hsr_slave: Fix the promiscuous mode in offload mode commit e748d0fd66ab ("net: hsr: Disable promiscuous mode in offload mode") disables promiscuous mode of slave devices while creating an HSR interface. But while deleting the HSR interface, it does not take care of it. It decreases the promiscuous mode count, which eventually enables promiscuous mode on the slave devices when creating HSR interface again. Fix this by not decrementing the promiscuous mode count while deleting the HSR interface when offload is enabled. 
Fixes: e748d0fd66ab ("net: hsr: Disable promiscuous mode in offload mode") Signed-off-by: Ravi Gunasekaran Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240322100447.27615-1-r-gunasekaran@ti.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_slave.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index e5742f2a2d52..1b6457f357bd 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -220,7 +220,8 @@ void hsr_del_port(struct hsr_port *port) netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); - dev_set_promiscuity(port->dev, -1); + if (!port->hsr->fwd_offloaded) + dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } From 151c9c724d05d5b0dd8acd3e11cb69ef1f2dbada Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2024 13:57:32 +0000 Subject: [PATCH 435/496] tcp: properly terminate timers for kernel sockets We had various syzbot reports about tcp timers firing after the corresponding netns has been dismantled. Fortunately Josef Bacik could trigger the issue more often, and could test a patch I wrote two years ago. When TCP sockets are closed, we call inet_csk_clear_xmit_timers() to 'stop' the timers. inet_csk_clear_xmit_timers() can be called from any context, including when socket lock is held. This is the reason it uses sk_stop_timer(), aka del_timer(). This means that ongoing timers might finish much later. For user sockets, this is fine because each running timer holds a reference on the socket, and the user socket holds a reference on the netns. For kernel sockets, we risk that the netns is freed before timer can complete, because kernel sockets do not hold reference on the netns. This patch adds inet_csk_clear_xmit_timers_sync() function that using sk_stop_timer_sync() to make sure all timers are terminated before the kernel socket is released. Modules using kernel sockets close them in their netns exit() handler. Also add sock_not_owned_by_me() helper to get LOCKDEP support : inet_csk_clear_xmit_timers_sync() must not be called while socket lock is held. It is very possible we can revert in the future commit 3a58f13a881e ("net: rds: acquire refcount on TCP sockets") which attempted to solve the issue in rds only. (net/smc/af_smc.c and net/mptcp/subflow.c have similar code) We probably can remove the check_net() tests from tcp_out_of_resources() and __tcp_close() in the future. 
Reported-by: Josef Bacik Closes: https://lore.kernel.org/netdev/20240314210740.GA2823176@perftesting/ Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Fixes: 8a68173691f0 ("net: sk_clone_lock() should only do get_net() if the parent is not a kernel socket") Link: https://lore.kernel.org/bpf/CANn89i+484ffqb93aQm1N-tjxxvb3WDKX0EbD7318RwRgsatjw@mail.gmail.com/ Signed-off-by: Eric Dumazet Tested-by: Josef Bacik Cc: Tetsuo Handa Link: https://lore.kernel.org/r/20240322135732.1535772-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 1 + include/net/sock.h | 7 +++++++ net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ net/ipv4/tcp.c | 2 ++ 4 files changed, 24 insertions(+) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 9ab4bf704e86..ccf171f7eb60 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -175,6 +175,7 @@ void inet_csk_init_xmit_timers(struct sock *sk, void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); +void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { diff --git a/include/net/sock.h b/include/net/sock.h index b5e00702acc1..f57bfd8a2ad2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1759,6 +1759,13 @@ static inline void sock_owned_by_me(const struct sock *sk) #endif } +static inline void sock_not_owned_by_me(const struct sock *sk) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); +#endif +} + static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7d8090f109ef..c038e28e2f1e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -771,6 +771,20 @@ void inet_csk_clear_xmit_timers(struct sock *sk) } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +void inet_csk_clear_xmit_timers_sync(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* ongoing timer handlers need to acquire socket lock. */ + sock_not_owned_by_me(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; + + sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); + sk_stop_timer_sync(sk, &sk->sk_timer); +} + void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d20b62d52171..e767721b3a58 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2931,6 +2931,8 @@ void tcp_close(struct sock *sk, long timeout) lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); From 7d5a7dd5a35876f0ecc286f3602a88887a788217 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 22 Mar 2024 15:40:00 +0100 Subject: [PATCH 436/496] net: wwan: t7xx: Split 64bit accesses to fix alignment issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some of the registers are aligned on a 32bit boundary, causing alignment faults on 64bit platforms. 
Unable to handle kernel paging request at virtual address ffffffc084a1d004 Mem abort info: ESR = 0x0000000096000061 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x21: alignment fault Data abort info: ISV = 0, ISS = 0x00000061, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000046ad6000 [ffffffc084a1d004] pgd=100000013ffff003, p4d=100000013ffff003, pud=100000013ffff003, pmd=0068000020a00711 Internal error: Oops: 0000000096000061 [#1] SMP Modules linked in: mtk_t7xx(+) qcserial pppoe ppp_async option nft_fib_inet nf_flow_table_inet mt7921u(O) mt7921s(O) mt7921e(O) mt7921_common(O) iwlmvm(O) iwldvm(O) usb_wwan rndis_host qmi_wwan pppox ppp_generic nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack mt7996e(O) mt792x_usb(O) mt792x_lib(O) mt7915e(O) mt76_usb(O) mt76_sdio(O) mt76_connac_lib(O) mt76(O) mac80211(O) iwlwifi(O) huawei_cdc_ncm cfg80211(O) cdc_ncm cdc_ether wwan usbserial usbnet slhc sfp rtc_pcf8563 nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 mt6577_auxadc mdio_i2c libcrc32c compat(O) cdc_wdm cdc_acm at24 crypto_safexcel pwm_fan i2c_gpio i2c_smbus industrialio i2c_algo_bit i2c_mux_reg i2c_mux_pca954x i2c_mux_pca9541 i2c_mux_gpio i2c_mux dummy oid_registry tun sha512_arm64 sha1_ce sha1_generic seqiv md5 geniv des_generic libdes cbc authencesn authenc leds_gpio xhci_plat_hcd xhci_pci xhci_mtk_hcd xhci_hcd nvme nvme_core gpio_button_hotplug(O) dm_mirror dm_region_hash dm_log dm_crypt dm_mod dax usbcore usb_common ptp aquantia pps_core mii tpm encrypted_keys trusted CPU: 3 PID: 5266 Comm: kworker/u9:1 Tainted: G O 6.6.22 #0 Hardware name: Bananapi BPI-R4 (DT) Workqueue: md_hk_wq t7xx_fsm_uninit [mtk_t7xx] pstate: 804000c5 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : t7xx_cldma_hw_set_start_addr+0x1c/0x3c [mtk_t7xx] lr : t7xx_cldma_start+0xac/0x13c [mtk_t7xx] sp : ffffffc085d63d30 x29: ffffffc085d63d30 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: ffffff80c804f2c0 x24: ffffff80ca196c05 x23: 0000000000000000 x22: ffffff80c814b9b8 x21: ffffff80c814b128 x20: 0000000000000001 x19: ffffff80c814b080 x18: 0000000000000014 x17: 0000000055c9806b x16: 000000007c5296d0 x15: 000000000f6bca68 x14: 00000000dbdbdce4 x13: 000000001aeaf72a x12: 0000000000000001 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffffff80ca1ef6b4 x7 : ffffff80c814b818 x6 : 0000000000000018 x5 : 0000000000000870 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 000000010a947000 x1 : ffffffc084a1d004 x0 : ffffffc084a1d004 Call trace: t7xx_cldma_hw_set_start_addr+0x1c/0x3c [mtk_t7xx] t7xx_fsm_uninit+0x578/0x5ec [mtk_t7xx] process_one_work+0x154/0x2a0 worker_thread+0x2ac/0x488 kthread+0xe0/0xec ret_from_fork+0x10/0x20 Code: f9400800 91001000 8b214001 d50332bf (f9000022) ---[ end trace 0000000000000000 ]--- The inclusion of io-64-nonatomic-lo-hi.h indicates that all 64bit accesses can be replaced by pairs of nonatomic 32bit access. Fix alignment by forcing all accesses to be 32bit on 64bit platforms. 
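
For reference, the _lo_hi accessors behave roughly like the open-coded helpers below. This is a simplified sketch of the io-64-nonatomic-lo-hi.h semantics (low word first); the helper names are made up and this is not the header's literal code:

    static inline void t7xx_iowrite64_lo_hi(u64 val, void __iomem *addr)
    {
            iowrite32(lower_32_bits(val), addr);
            iowrite32(upper_32_bits(val), addr + sizeof(u32));
    }

    static inline u64 t7xx_ioread64_lo_hi(void __iomem *addr)
    {
            u64 lo = ioread32(addr);
            u64 hi = ioread32(addr + sizeof(u32));

            return lo | (hi << 32);
    }

Either half is a plain 32-bit MMIO access, so a register that is only 32-bit aligned never sees the single 64-bit load or store that faults on arm64.
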
Link: https://forum.openwrt.org/t/fibocom-fm350-gl-support/142682/72 Fixes: 39d439047f1d ("net: wwan: t7xx: Add control DMA interface") Signed-off-by: Bjørn Mork Reviewed-by: Sergey Ryazanov Tested-by: Liviu Dudau Link: https://lore.kernel.org/r/20240322144000.1683822-1-bjorn@mork.no Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_cldma.c | 4 ++-- drivers/net/wwan/t7xx/t7xx_hif_cldma.c | 9 +++++---- drivers/net/wwan/t7xx/t7xx_pcie_mac.c | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_cldma.c b/drivers/net/wwan/t7xx/t7xx_cldma.c index 9f43f256db1d..f0a4783baf1f 100644 --- a/drivers/net/wwan/t7xx/t7xx_cldma.c +++ b/drivers/net/wwan/t7xx/t7xx_cldma.c @@ -106,7 +106,7 @@ bool t7xx_cldma_tx_addr_is_set(struct t7xx_cldma_hw *hw_info, unsigned int qno) { u32 offset = REG_CLDMA_UL_START_ADDRL_0 + qno * ADDR_SIZE; - return ioread64(hw_info->ap_pdn_base + offset); + return ioread64_lo_hi(hw_info->ap_pdn_base + offset); } void t7xx_cldma_hw_set_start_addr(struct t7xx_cldma_hw *hw_info, unsigned int qno, u64 address, @@ -117,7 +117,7 @@ void t7xx_cldma_hw_set_start_addr(struct t7xx_cldma_hw *hw_info, unsigned int qn reg = tx_rx == MTK_RX ? hw_info->ap_ao_base + REG_CLDMA_DL_START_ADDRL_0 : hw_info->ap_pdn_base + REG_CLDMA_UL_START_ADDRL_0; - iowrite64(address, reg + offset); + iowrite64_lo_hi(address, reg + offset); } void t7xx_cldma_hw_resume_queue(struct t7xx_cldma_hw *hw_info, unsigned int qno, diff --git a/drivers/net/wwan/t7xx/t7xx_hif_cldma.c b/drivers/net/wwan/t7xx/t7xx_hif_cldma.c index abc41a7089fa..97163e1e5783 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_cldma.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_cldma.c @@ -137,8 +137,9 @@ static int t7xx_cldma_gpd_rx_from_q(struct cldma_queue *queue, int budget, bool return -ENODEV; } - gpd_addr = ioread64(hw_info->ap_pdn_base + REG_CLDMA_DL_CURRENT_ADDRL_0 + - queue->index * sizeof(u64)); + gpd_addr = ioread64_lo_hi(hw_info->ap_pdn_base + + REG_CLDMA_DL_CURRENT_ADDRL_0 + + queue->index * sizeof(u64)); if (req->gpd_addr == gpd_addr || hwo_polling_count++ >= 100) return 0; @@ -316,8 +317,8 @@ static void t7xx_cldma_txq_empty_hndl(struct cldma_queue *queue) struct t7xx_cldma_hw *hw_info = &md_ctrl->hw_info; /* Check current processing TGPD, 64-bit address is in a table by Q index */ - ul_curr_addr = ioread64(hw_info->ap_pdn_base + REG_CLDMA_UL_CURRENT_ADDRL_0 + - queue->index * sizeof(u64)); + ul_curr_addr = ioread64_lo_hi(hw_info->ap_pdn_base + REG_CLDMA_UL_CURRENT_ADDRL_0 + + queue->index * sizeof(u64)); if (req->gpd_addr != ul_curr_addr) { spin_unlock_irqrestore(&md_ctrl->cldma_lock, flags); dev_err(md_ctrl->dev, "CLDMA%d queue %d is not empty\n", diff --git a/drivers/net/wwan/t7xx/t7xx_pcie_mac.c b/drivers/net/wwan/t7xx/t7xx_pcie_mac.c index 76da4c15e3de..f071ec7ff23d 100644 --- a/drivers/net/wwan/t7xx/t7xx_pcie_mac.c +++ b/drivers/net/wwan/t7xx/t7xx_pcie_mac.c @@ -75,7 +75,7 @@ static void t7xx_pcie_mac_atr_tables_dis(void __iomem *pbase, enum t7xx_atr_src_ for (i = 0; i < ATR_TABLE_NUM_PER_ATR; i++) { offset = ATR_PORT_OFFSET * port + ATR_TABLE_OFFSET * i; reg = pbase + ATR_PCIE_WIN0_T0_ATR_PARAM_SRC_ADDR + offset; - iowrite64(0, reg); + iowrite64_lo_hi(0, reg); } } @@ -112,17 +112,17 @@ static int t7xx_pcie_mac_atr_cfg(struct t7xx_pci_dev *t7xx_dev, struct t7xx_atr_ reg = pbase + ATR_PCIE_WIN0_T0_TRSL_ADDR + offset; value = cfg->trsl_addr & ATR_PCIE_WIN0_ADDR_ALGMT; - iowrite64(value, reg); + iowrite64_lo_hi(value, reg); reg = pbase + ATR_PCIE_WIN0_T0_TRSL_PARAM + offset; 
iowrite32(cfg->trsl_id, reg); reg = pbase + ATR_PCIE_WIN0_T0_ATR_PARAM_SRC_ADDR + offset; value = (cfg->src_addr & ATR_PCIE_WIN0_ADDR_ALGMT) | (atr_size << 1) | BIT(0); - iowrite64(value, reg); + iowrite64_lo_hi(value, reg); /* Ensure ATR is set */ - ioread64(reg); + ioread64_lo_hi(reg); return 0; } From 5f563c31ff0c40ce395d0bae7daa94c7950dac97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ar=C4=B1n=C3=A7=20=C3=9CNAL?= Date: Wed, 20 Mar 2024 23:45:30 +0300 Subject: [PATCH 437/496] net: dsa: mt7530: fix improper frames on all 25MHz and 40MHz XTAL MT7530 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MT7530 switch after reset initialises with a core clock frequency that works with a 25MHz XTAL connected to it. For 40MHz XTAL, the core clock frequency must be set to 500MHz. The mt7530_pll_setup() function is responsible of setting the core clock frequency. Currently, it runs on MT7530 with 25MHz and 40MHz XTAL. This causes MT7530 switch with 25MHz XTAL to egress and ingress frames improperly. Introduce a check to run it only on MT7530 with 40MHz XTAL. The core clock frequency is set by writing to a switch PHY's register. Access to the PHY's register is done via the MDIO bus the switch is also on. Therefore, it works only when the switch makes switch PHYs listen on the MDIO bus the switch is on. This is controlled either by the state of the ESW_P1_LED_1 pin after reset deassertion or modifying bit 5 of the modifiable trap register. When ESW_P1_LED_1 is pulled high, PHY indirect access is used. That means accessing PHY registers via the PHY indirect access control register of the switch. When ESW_P1_LED_1 is pulled low, PHY direct access is used. That means accessing PHY registers via the MDIO bus the switch is on. For MT7530 switch with 40MHz XTAL on a board with ESW_P1_LED_1 pulled high, the core clock frequency won't be set to 500MHz, causing the switch to egress and ingress frames improperly. Run mt7530_pll_setup() after PHY direct access is set on the modifiable trap register. With these two changes, all MT7530 switches with 25MHz and 40MHz, and P1_LED_1 pulled high or low, will egress and ingress frames properly. 
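
Put together, the resulting order in mt7530_setup() is roughly the following (a condensed sketch; only the conditional call is taken from the change below, the trap-register step is paraphrased):

    /* 1. Soft-reset the switch; the core clock comes up configured for
     *    a 25 MHz XTAL.
     */

    /* 2. Program the modifiable trap register so the switch PHYs answer
     *    on the MDIO bus the switch itself sits on (direct PHY access).
     */
    val |= MHWTRAP_MANUAL;
    mt7530_write(priv, MT7530_MHWTRAP, val);

    /* 3. Only now, and only for a 40 MHz XTAL, raise the core clock to
     *    500 MHz; the setting is written through a switch PHY register,
     *    which is reachable thanks to step 2.
     */
    if ((val & HWTRAP_XTAL_MASK) == HWTRAP_XTAL_40MHZ)
            mt7530_pll_setup(priv);
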
Link: https://github.com/BPI-SINOVOIP/BPI-R2-bsp/blob/4a5dd143f2172ec97a2872fa29c7c4cd520f45b5/linux-mt/drivers/net/ethernet/mediatek/gsw_mt7623.c#L1039 Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: Arınç ÜNAL Link: https://lore.kernel.org/r/20240320-for-net-mt7530-fix-25mhz-xtal-with-direct-phy-access-v1-1-d92f605f1160@arinc9.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 767f66c37f6b..1035820c2377 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2268,8 +2268,6 @@ mt7530_setup(struct dsa_switch *ds) SYS_CTRL_PHY_RST | SYS_CTRL_SW_RST | SYS_CTRL_REG_RST); - mt7530_pll_setup(priv); - /* Lower Tx driving for TRGMII path */ for (i = 0; i < NUM_TRGMII_CTRL; i++) mt7530_write(priv, MT7530_TRGMII_TD_ODT(i), @@ -2285,6 +2283,9 @@ mt7530_setup(struct dsa_switch *ds) val |= MHWTRAP_MANUAL; mt7530_write(priv, MT7530_MHWTRAP, val); + if ((val & HWTRAP_XTAL_MASK) == HWTRAP_XTAL_40MHZ) + mt7530_pll_setup(priv); + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ From 8c05813df270324ce0b3a8647facc70c9bdd6fb5 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sun, 24 Mar 2024 23:40:09 +0300 Subject: [PATCH 438/496] MAINTAINERS: split Renesas Ethernet drivers entry Since the Renesas Ethernet Switch driver was added by Yoshihiro Shimoda, I started receiving the patches to review for it -- which I was unable to do, as I don't know this hardware and don't even have the manuals for it. Fortunately, Shimoda-san has volunteered to be a reviewer for this new driver, thus let's now split the single entry into 3 per-driver entries, each with its own reviewer... 
Signed-off-by: Sergey Shtylyov Reviewed-by: Simon Horman Acked-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/de0ccc1d-6fc0-583f-4f80-f70e6461d62d@omp.ru Signed-off-by: Paolo Abeni --- MAINTAINERS | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f736af98d7b5..6fc7ee1a6150 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18724,13 +18724,24 @@ S: Supported F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.yaml F: drivers/i2c/busses/i2c-emev2.c -RENESAS ETHERNET DRIVERS +RENESAS ETHERNET AVB DRIVER R: Sergey Shtylyov L: netdev@vger.kernel.org L: linux-renesas-soc@vger.kernel.org -F: Documentation/devicetree/bindings/net/renesas,*.yaml -F: drivers/net/ethernet/renesas/ -F: include/linux/sh_eth.h +F: Documentation/devicetree/bindings/net/renesas,etheravb.yaml +F: drivers/net/ethernet/renesas/Kconfig +F: drivers/net/ethernet/renesas/Makefile +F: drivers/net/ethernet/renesas/ravb* + +RENESAS ETHERNET SWITCH DRIVER +R: Yoshihiro Shimoda +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +F: Documentation/devicetree/bindings/net/renesas,*ether-switch.yaml +F: drivers/net/ethernet/renesas/Kconfig +F: drivers/net/ethernet/renesas/Makefile +F: drivers/net/ethernet/renesas/rcar_gen4* +F: drivers/net/ethernet/renesas/rswitch* RENESAS IDT821034 ASoC CODEC M: Herve Codina @@ -18840,6 +18851,16 @@ S: Supported F: Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml F: drivers/i2c/busses/i2c-rzv2m.c +RENESAS SUPERH ETHERNET DRIVER +R: Sergey Shtylyov +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +F: Documentation/devicetree/bindings/net/renesas,ether.yaml +F: drivers/net/ethernet/renesas/Kconfig +F: drivers/net/ethernet/renesas/Makefile +F: drivers/net/ethernet/renesas/sh_eth* +F: include/linux/sh_eth.h + RENESAS USB PHY DRIVER M: Yoshihiro Shimoda L: linux-renesas-soc@vger.kernel.org From f1425529c33def8b46faae4400dd9e2bbaf16a05 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 25 Mar 2024 09:50:30 +0200 Subject: [PATCH 439/496] selftests: vxlan_mdb: Fix failures with old libnet Locally generated IP multicast packets (such as the ones used in the test) do not perform routing and simply egress the bound device. However, as explained in commit 8bcfb4ae4d97 ("selftests: forwarding: Fix failing tests with old libnet"), old versions of libnet (used by mausezahn) do not use the "SO_BINDTODEVICE" socket option. Specifically, the library started using the option for IPv6 sockets in version 1.1.6 and for IPv4 sockets in version 1.2. This explains why on Ubuntu - which uses version 1.1.6 - the IPv4 overlay tests are failing whereas the IPv6 ones are passing. Fix by specifying the source and destination MAC of the packets which will cause mausezahn to use a packet socket instead of an IP socket. 
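
For context, the socket option that old libnet omits on the IPv4 path is SO_BINDTODEVICE. A minimal userspace sketch (not libnet's actual code) of binding an IP socket to the egress device:

    #include <string.h>
    #include <sys/socket.h>

    /* Bind an IP socket to a device so locally generated multicast
     * egresses that device instead of going through a route lookup.
     * libnet only does this for IPv6 sockets since 1.1.6 and for IPv4
     * sockets since 1.2.
     */
    static int bind_to_dev(int fd, const char *ifname)
    {
            return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
                              ifname, strlen(ifname) + 1);
    }

Passing source and destination MACs to mausezahn sidesteps the problem entirely, since the tool then opens a packet socket and the IP layer is never involved.
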
Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/5bb50349-196d-4892-8ed2-f37543aa863f@alu.unizg.hr/ Tested-by: Mirsad Todorovac Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20240325075030.2379513-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/test_vxlan_mdb.sh | 205 +++++++++++------- 1 file changed, 128 insertions(+), 77 deletions(-) diff --git a/tools/testing/selftests/net/test_vxlan_mdb.sh b/tools/testing/selftests/net/test_vxlan_mdb.sh index 74ff9fb2a6f0..58da5de99ac4 100755 --- a/tools/testing/selftests/net/test_vxlan_mdb.sh +++ b/tools/testing/selftests/net/test_vxlan_mdb.sh @@ -1177,6 +1177,7 @@ encap_params_common() local plen=$1; shift local enc_ethtype=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -1195,11 +1196,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep2_ip src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_dst_ip $vtep1_ip action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Destination IP - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Destination IP - no match" @@ -1212,20 +1213,20 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip dst_port 1111 src_vni 10020" run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 4789 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Default destination port - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Default destination port - no match" run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 1111 action pass" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Non-default destination port - match" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 
0 "Non-default destination port - no match" @@ -1238,11 +1239,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10010 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Default destination VNI - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Default destination VNI - no match" @@ -1250,11 +1251,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip vni 10010 src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10020 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Non-default destination VNI - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 
0 "Non-default destination VNI - no match" @@ -1272,6 +1273,7 @@ encap_params_ipv4_ipv4() local plen=32 local enc_ethtype="ip" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1279,7 +1281,7 @@ encap_params_ipv4_ipv4() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn" + $grp $grp_dmac $src "mausezahn" } encap_params_ipv6_ipv4() @@ -1291,6 +1293,7 @@ encap_params_ipv6_ipv4() local plen=32 local enc_ethtype="ip" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1298,7 +1301,7 @@ encap_params_ipv6_ipv4() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn -6" + $grp $grp_dmac $src "mausezahn -6" } encap_params_ipv4_ipv6() @@ -1310,6 +1313,7 @@ encap_params_ipv4_ipv6() local plen=128 local enc_ethtype="ipv6" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1317,7 +1321,7 @@ encap_params_ipv4_ipv6() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn" + $grp $grp_dmac $src "mausezahn" } encap_params_ipv6_ipv6() @@ -1329,6 +1333,7 @@ encap_params_ipv6_ipv6() local plen=128 local enc_ethtype="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1336,7 +1341,7 @@ encap_params_ipv6_ipv6() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn -6" + $grp $grp_dmac $src "mausezahn -6" } starg_exclude_ir_common() @@ -1347,6 +1352,7 @@ starg_exclude_ir_common() local vtep2_ip=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1368,14 +1374,14 @@ starg_exclude_ir_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 log_test $? 0 "Block excluded source - second VTEP" # Check that valid source is forwarded to both VTEPs. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1385,14 +1391,14 @@ starg_exclude_ir_common() run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. 
- run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Block excluded source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 log_test $? 0 "Block excluded source after removal - second VTEP" # Check that valid source is forwarded to the remaining VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Forward valid source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1407,6 +1413,7 @@ starg_exclude_ir_ipv4_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1415,7 +1422,7 @@ starg_exclude_ir_ipv4_ipv4() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_exclude_ir_ipv6_ipv4() @@ -1426,6 +1433,7 @@ starg_exclude_ir_ipv6_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1434,7 +1442,7 @@ starg_exclude_ir_ipv6_ipv4() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_exclude_ir_ipv4_ipv6() @@ -1445,6 +1453,7 @@ starg_exclude_ir_ipv4_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1453,7 +1462,7 @@ starg_exclude_ir_ipv4_ipv6() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_exclude_ir_ipv6_ipv6() @@ -1464,6 +1473,7 @@ starg_exclude_ir_ipv6_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1472,7 +1482,7 @@ starg_exclude_ir_ipv6_ipv6() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_include_ir_common() @@ -1483,6 +1493,7 @@ starg_include_ir_common() local vtep2_ip=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1504,14 +1515,14 @@ starg_include_ir_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. 
- run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 log_test $? 0 "Block excluded source - second VTEP" # Check that valid source is forwarded to both VTEPs. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1521,14 +1532,14 @@ starg_include_ir_common() run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Block excluded source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 log_test $? 0 "Block excluded source after removal - second VTEP" # Check that valid source is forwarded to the remaining VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 
0 "Forward valid source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1543,6 +1554,7 @@ starg_include_ir_ipv4_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1551,7 +1563,7 @@ starg_include_ir_ipv4_ipv4() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_include_ir_ipv6_ipv4() @@ -1562,6 +1574,7 @@ starg_include_ir_ipv6_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1570,7 +1583,7 @@ starg_include_ir_ipv6_ipv4() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_include_ir_ipv4_ipv6() @@ -1581,6 +1594,7 @@ starg_include_ir_ipv4_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1589,7 +1603,7 @@ starg_include_ir_ipv4_ipv6() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_include_ir_ipv6_ipv6() @@ -1600,6 +1614,7 @@ starg_include_ir_ipv6_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1608,7 +1623,7 @@ starg_include_ir_ipv6_ipv6() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_exclude_p2mp_common() @@ -1618,6 +1633,7 @@ starg_exclude_p2mp_common() local mcast_grp=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1635,12 +1651,12 @@ starg_exclude_p2mp_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $mcast_grp src_vni 10010 via veth0" # Check that invalid source is not forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source" # Check that valid source is forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source" @@ -1648,7 +1664,7 @@ starg_exclude_p2mp_common() run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0" # Check that valid source is not received anymore. 
- run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Receive of valid source after removal from group" } @@ -1660,6 +1676,7 @@ starg_exclude_p2mp_ipv4_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1667,7 +1684,7 @@ starg_exclude_p2mp_ipv4_ipv4() echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1678,6 +1695,7 @@ starg_exclude_p2mp_ipv6_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1685,7 +1703,7 @@ starg_exclude_p2mp_ipv6_ipv4() echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1696,6 +1714,7 @@ starg_exclude_p2mp_ipv4_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1703,7 +1722,7 @@ starg_exclude_p2mp_ipv4_ipv6() echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1714,6 +1733,7 @@ starg_exclude_p2mp_ipv6_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1721,7 +1741,7 @@ starg_exclude_p2mp_ipv6_ipv6() echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1732,6 +1752,7 @@ starg_include_p2mp_common() local mcast_grp=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1749,12 +1770,12 @@ starg_include_p2mp_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $mcast_grp src_vni 10010 via veth0" # Check that invalid source is not forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source" # Check that valid source is forwarded. 
- run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source" @@ -1762,7 +1783,7 @@ starg_include_p2mp_common() run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0" # Check that valid source is not received anymore. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Receive of valid source after removal from group" } @@ -1774,6 +1795,7 @@ starg_include_p2mp_ipv4_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1781,7 +1803,7 @@ starg_include_p2mp_ipv4_ipv4() echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1792,6 +1814,7 @@ starg_include_p2mp_ipv6_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1799,7 +1822,7 @@ starg_include_p2mp_ipv6_ipv4() echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1810,6 +1833,7 @@ starg_include_p2mp_ipv4_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1817,7 +1841,7 @@ starg_include_p2mp_ipv4_ipv6() echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1828,6 +1852,7 @@ starg_include_p2mp_ipv6_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1835,7 +1860,7 @@ starg_include_p2mp_ipv6_ipv6() echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1847,6 +1872,7 @@ egress_vni_translation_common() local plen=$1; shift local proto=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -1882,20 +1908,20 @@ egress_vni_translation_common() # Make sure that packets sent from the first VTEP over VLAN 10 are # received by the SVI corresponding to the L3VNI (14000 / VLAN 4000) on # the second VTEP, since it is configured as PVID. 
- run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1 log_test $? 0 "Egress VNI translation - PVID configured" # Remove PVID flag from VLAN 4000 on the second VTEP and make sure # packets are no longer received by the SVI interface. run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1 log_test $? 0 "Egress VNI translation - no PVID configured" # Reconfigure the PVID and make sure packets are received again. run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0 pvid" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 2 log_test $? 0 "Egress VNI translation - PVID reconfigured" } @@ -1908,6 +1934,7 @@ egress_vni_translation_ipv4_ipv4() local plen=32 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1915,7 +1942,7 @@ egress_vni_translation_ipv4_ipv4() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn" + $grp_dmac $src "mausezahn" } egress_vni_translation_ipv6_ipv4() @@ -1926,6 +1953,7 @@ egress_vni_translation_ipv6_ipv4() local plen=32 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1933,7 +1961,7 @@ egress_vni_translation_ipv6_ipv4() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn -6" + $grp_dmac $src "mausezahn -6" } egress_vni_translation_ipv4_ipv6() @@ -1944,6 +1972,7 @@ egress_vni_translation_ipv4_ipv6() local plen=128 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1951,7 +1980,7 @@ egress_vni_translation_ipv4_ipv6() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn" + $grp_dmac $src "mausezahn" } egress_vni_translation_ipv6_ipv6() @@ -1962,6 +1991,7 @@ egress_vni_translation_ipv6_ipv6() local plen=128 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1969,7 +1999,7 @@ egress_vni_translation_ipv6_ipv6() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn -6" + $grp_dmac $src "mausezahn -6" } all_zeros_mdb_common() @@ -1982,12 +2012,18 @@ all_zeros_mdb_common() local vtep4_ip=$1; shift local plen=$1; shift local ipv4_grp=239.1.1.1 + local ipv4_grp_dmac=01:00:5e:01:01:01 local ipv4_unreg_grp=239.2.2.2 + local ipv4_unreg_grp_dmac=01:00:5e:02:02:02 local ipv4_ll_grp=224.0.0.100 + local ipv4_ll_grp_dmac=01:00:5e:00:00:64 local ipv4_src=192.0.2.129 local ipv6_grp=ff0e::1 + local ipv6_grp_dmac=33:33:00:00:00:01 local ipv6_unreg_grp=ff0e::2 + local 
ipv6_unreg_grp_dmac=33:33:00:00:00:02 local ipv6_ll_grp=ff02::1 + local ipv6_ll_grp_dmac=33:33:00:00:00:01 local ipv6_src=2001:db8:100::1 # Install all-zeros (catchall) MDB entries for IPv4 and IPv6 traffic @@ -2023,7 +2059,7 @@ all_zeros_mdb_common() # Send registered IPv4 multicast and make sure it only arrives to the # first VTEP. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_grp_dmac -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Registered IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 @@ -2031,7 +2067,7 @@ all_zeros_mdb_common() # Send unregistered IPv4 multicast that is not link-local and make sure # it arrives to the first and second VTEPs. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_unreg_grp_dmac -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Unregistered IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -2039,7 +2075,7 @@ all_zeros_mdb_common() # Send IPv4 link-local multicast traffic and make sure it does not # arrive to any VTEP. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_ll_grp_dmac -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Link-local IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -2074,7 +2110,7 @@ all_zeros_mdb_common() # Send registered IPv6 multicast and make sure it only arrives to the # third VTEP. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_grp_dmac -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 1 log_test $? 0 "Registered IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 0 @@ -2082,7 +2118,7 @@ all_zeros_mdb_common() # Send unregistered IPv6 multicast that is not link-local and make sure # it arrives to the third and fourth VTEPs. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_unreg_grp_dmac -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 2 log_test $? 0 "Unregistered IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 1 @@ -2090,7 +2126,7 @@ all_zeros_mdb_common() # Send IPv6 link-local multicast traffic and make sure it does not # arrive to any VTEP. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_ll_grp_dmac -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 2 log_test $? 
0 "Link-local IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 1 @@ -2165,6 +2201,7 @@ mdb_fdb_common() local plen=$1; shift local proto=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -2188,7 +2225,7 @@ mdb_fdb_common() # Send IP multicast traffic and make sure it is forwarded by the MDB # and only arrives to the first VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "IP multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 @@ -2205,7 +2242,7 @@ mdb_fdb_common() # Remove the MDB entry and make sure that IP multicast is now forwarded # by the FDB to the second VTEP. run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep1_ip src_vni 10010" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "IP multicast after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 2 @@ -2221,14 +2258,15 @@ mdb_fdb_ipv4_ipv4() local plen=32 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo echo "Data path: MDB with FDB - IPv4 overlay / IPv4 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn" } mdb_fdb_ipv6_ipv4() @@ -2240,14 +2278,15 @@ mdb_fdb_ipv6_ipv4() local plen=32 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo echo "Data path: MDB with FDB - IPv6 overlay / IPv4 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn -6" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn -6" } mdb_fdb_ipv4_ipv6() @@ -2259,14 +2298,15 @@ mdb_fdb_ipv4_ipv6() local plen=128 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo echo "Data path: MDB with FDB - IPv4 overlay / IPv6 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn" } mdb_fdb_ipv6_ipv6() @@ -2278,14 +2318,15 @@ mdb_fdb_ipv6_ipv6() local plen=128 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo echo "Data path: MDB with FDB - IPv6 overlay / IPv6 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn -6" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn -6" } mdb_grp1_loop() @@ -2320,7 +2361,9 @@ mdb_torture_common() local vtep1_ip=$1; shift local vtep2_ip=$1; shift local grp1=$1; shift + local grp1_dmac=$1; shift local grp2=$1; shift + local grp2_dmac=$1; shift local src=$1; shift local mz=$1; shift 
local pid1 @@ -2345,9 +2388,9 @@ mdb_torture_common() pid1=$! mdb_grp2_loop $ns1 $vtep1_ip $vtep2_ip $grp2 & pid2=$! - ip netns exec $ns1 $mz br0.10 -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & + ip netns exec $ns1 $mz br0.10 -a own -b $grp1_dmac -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & pid3=$! - ip netns exec $ns1 $mz br0.10 -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & + ip netns exec $ns1 $mz br0.10 -a own -b $grp2_dmac -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & pid4=$! sleep 30 @@ -2363,15 +2406,17 @@ mdb_torture_ipv4_ipv4() local vtep1_ip=198.51.100.100 local vtep2_ip=198.51.100.200 local grp1=239.1.1.1 + local grp1_dmac=01:00:5e:01:01:01 local grp2=239.2.2.2 + local grp2_dmac=01:00:5e:02:02:02 local src=192.0.2.129 echo echo "Data path: MDB torture test - IPv4 overlay / IPv4 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn" } mdb_torture_ipv6_ipv4() @@ -2380,15 +2425,17 @@ mdb_torture_ipv6_ipv4() local vtep1_ip=198.51.100.100 local vtep2_ip=198.51.100.200 local grp1=ff0e::1 + local grp1_dmac=33:33:00:00:00:01 local grp2=ff0e::2 + local grp2_dmac=33:33:00:00:00:02 local src=2001:db8:100::1 echo echo "Data path: MDB torture test - IPv6 overlay / IPv4 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn -6" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn -6" } mdb_torture_ipv4_ipv6() @@ -2397,15 +2444,17 @@ mdb_torture_ipv4_ipv6() local vtep1_ip=2001:db8:1000::1 local vtep2_ip=2001:db8:2000::1 local grp1=239.1.1.1 + local grp1_dmac=01:00:5e:01:01:01 local grp2=239.2.2.2 + local grp2_dmac=01:00:5e:02:02:02 local src=192.0.2.129 echo echo "Data path: MDB torture test - IPv4 overlay / IPv6 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn" } mdb_torture_ipv6_ipv6() @@ -2414,15 +2463,17 @@ mdb_torture_ipv6_ipv6() local vtep1_ip=2001:db8:1000::1 local vtep2_ip=2001:db8:2000::1 local grp1=ff0e::1 + local grp1_dmac=33:33:00:00:00:01 local grp2=ff0e::2 + local grp2_dmac=33:33:00:00:00:02 local src=2001:db8:100::1 echo echo "Data path: MDB torture test - IPv6 overlay / IPv6 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn -6" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn -6" } ################################################################################ From 8ea3f4f1a1b4242d5fc273f41aa7c86f6b40178c Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Thu, 21 Mar 2024 17:34:20 +0100 Subject: [PATCH 440/496] MAINTAINERS: wifi: mwifiex: add Francesco as reviewer As discussed on the mailing list, add myself as mwifiex driver reviewer. 
Link: https://lore.kernel.org/all/20240318112830.GA9565@francesco-nb/ Signed-off-by: Francesco Dolcini Acked-by: Brian Norris Signed-off-by: Kalle Valo Link: https://msgid.link/20240321163420.11158-1-francesco@dolcini.it --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 452288995991..c12849bec142 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13077,6 +13077,7 @@ F: drivers/net/ethernet/marvell/mvpp2/ MARVELL MWIFIEX WIRELESS DRIVER M: Brian Norris +R: Francesco Dolcini L: linux-wireless@vger.kernel.org S: Odd Fixes F: drivers/net/wireless/marvell/mwifiex/ From 47e39d213e09c6cae0d6b4d95e454ea404013312 Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 25 Mar 2024 20:43:09 +0800 Subject: [PATCH 441/496] net: hns3: fix index limit to support all queue stats Currently, hns hardware supports more than 512 queues and the index limit in hclge_comm_tqps_update_stats is wrong. So this patch removes it. Fixes: 287db5c40d15 ("net: hns3: create new set of common tqp stats APIs for PF and VF reuse") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Michal Kubiak Reviewed-by: Kalesh AP Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- .../ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c index f3c9395d8351..618f66d9586b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c @@ -85,7 +85,7 @@ int hclge_comm_tqps_update_stats(struct hnae3_handle *handle, hclge_comm_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_TX_STATS, true); - desc.data[0] = cpu_to_le32(tqp->index & 0x1ff); + desc.data[0] = cpu_to_le32(tqp->index); ret = hclge_comm_cmd_send(hw, &desc, 1); if (ret) { dev_err(&hw->cmq.csq.pdev->dev, From 93305b77ffcb042f1538ecc383505e87d95aa05a Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Mon, 25 Mar 2024 20:43:10 +0800 Subject: [PATCH 442/496] net: hns3: fix kernel crash when devlink reload during pf initialization The devlink reload process will access the hardware resources, but the register operation is done before the hardware is initialized. So, processing the devlink reload during initialization may lead to kernel crash. This patch fixes this by taking devl_lock during initialization. 
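
The shape of the fix is a devlink instance-lock bracket around the whole probe-time bring-up (condensed from the hclge_init_ae_dev() hunk below; error-path unlocking omitted):

    devl_lock(hdev->devlink);

    /* firmware command queue init, MAC/MDIO setup, service task ... */

    devl_unlock(hdev->devlink);
    return 0;

Devlink reload runs under the same instance lock, so it can no longer race with the initialization and touch half-initialised hardware state.
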
Fixes: b741269b2759 ("net: hns3: add support for registering devlink for PF") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index b4afb66efe5c..ff6a2ed23ddb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11626,6 +11626,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) if (ret) goto err_pci_uninit; + devl_lock(hdev->devlink); + /* Firmware command queue initialize */ ret = hclge_comm_cmd_queue_init(hdev->pdev, &hdev->hw.hw); if (ret) @@ -11805,6 +11807,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_task_schedule(hdev, round_jiffies_relative(HZ)); + devl_unlock(hdev->devlink); return 0; err_mdiobus_unreg: @@ -11817,6 +11820,7 @@ err_msi_uninit: err_cmd_uninit: hclge_comm_cmd_uninit(hdev->ae_dev, &hdev->hw.hw); err_devlink_uninit: + devl_unlock(hdev->devlink); hclge_devlink_uninit(hdev); err_pci_uninit: pcim_iounmap(pdev, hdev->hw.hw.io_base); From 5bd088d6c21a45ee70e6116879310e54174d75eb Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 25 Mar 2024 20:43:11 +0800 Subject: [PATCH 443/496] net: hns3: mark unexcuted loopback test result as UNEXECUTED Currently, loopback test may be skipped when resetting, but the test result will still show as 'PASS', because the driver doesn't set ETH_TEST_FL_FAILED flag. Fix it by setting the flag and initializating the value to UNEXECUTED. Fixes: 4c8dab1c709c ("net: hns3: reconstruct function hns3_self_test") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Michal Kubiak Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 999a0ee162a6..941cb529d671 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -78,6 +78,9 @@ static const struct hns3_stats hns3_rxq_stats[] = { #define HNS3_NIC_LB_TEST_NO_MEM_ERR 1 #define HNS3_NIC_LB_TEST_TX_CNT_ERR 2 #define HNS3_NIC_LB_TEST_RX_CNT_ERR 3 +#define HNS3_NIC_LB_TEST_UNEXECUTED 4 + +static int hns3_get_sset_count(struct net_device *netdev, int stringset); static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en) { @@ -418,18 +421,26 @@ static void hns3_do_external_lb(struct net_device *ndev, static void hns3_self_test(struct net_device *ndev, struct ethtool_test *eth_test, u64 *data) { + int cnt = hns3_get_sset_count(ndev, ETH_SS_TEST); struct hns3_nic_priv *priv = netdev_priv(ndev); struct hnae3_handle *h = priv->ae_handle; int st_param[HNAE3_LOOP_NONE][2]; bool if_running = netif_running(ndev); + int i; + + /* initialize the loopback test result, avoid marking an unexcuted + * loopback test as PASS. 
+ */ + for (i = 0; i < cnt; i++) + data[i] = HNS3_NIC_LB_TEST_UNEXECUTED; if (hns3_nic_resetting(ndev)) { netdev_err(ndev, "dev resetting!"); - return; + goto failure; } if (!(eth_test->flags & ETH_TEST_FL_OFFLINE)) - return; + goto failure; if (netif_msg_ifdown(h)) netdev_info(ndev, "self test start\n"); @@ -451,6 +462,10 @@ static void hns3_self_test(struct net_device *ndev, if (netif_msg_ifdown(h)) netdev_info(ndev, "self test end\n"); + return; + +failure: + eth_test->flags |= ETH_TEST_FL_FAILED; } static void hns3_update_limit_promisc_mode(struct net_device *netdev, From 1ec17ef59168a1a6f1105f5dc517f783839a5302 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 28 Feb 2024 12:13:27 +0100 Subject: [PATCH 444/496] btrfs: zoned: fix use-after-free in do_zone_finish() Shinichiro reported the following use-after-free triggered by the device replace operation in fstests btrfs/070. BTRFS info (device nullb1): scrub: finished on devid 1 with status: 0 ================================================================== BUG: KASAN: slab-use-after-free in do_zone_finish+0x91a/0xb90 [btrfs] Read of size 8 at addr ffff8881543c8060 by task btrfs-cleaner/3494007 CPU: 0 PID: 3494007 Comm: btrfs-cleaner Tainted: G W 6.8.0-rc5-kts #1 Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020 Call Trace: dump_stack_lvl+0x5b/0x90 print_report+0xcf/0x670 ? __virt_addr_valid+0x200/0x3e0 kasan_report+0xd8/0x110 ? do_zone_finish+0x91a/0xb90 [btrfs] ? do_zone_finish+0x91a/0xb90 [btrfs] do_zone_finish+0x91a/0xb90 [btrfs] btrfs_delete_unused_bgs+0x5e1/0x1750 [btrfs] ? __pfx_btrfs_delete_unused_bgs+0x10/0x10 [btrfs] ? btrfs_put_root+0x2d/0x220 [btrfs] ? btrfs_clean_one_deleted_snapshot+0x299/0x430 [btrfs] cleaner_kthread+0x21e/0x380 [btrfs] ? __pfx_cleaner_kthread+0x10/0x10 [btrfs] kthread+0x2e3/0x3c0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x31/0x70 ? 
__pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Allocated by task 3493983: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 __kasan_kmalloc+0xaa/0xb0 btrfs_alloc_device+0xb3/0x4e0 [btrfs] device_list_add.constprop.0+0x993/0x1630 [btrfs] btrfs_scan_one_device+0x219/0x3d0 [btrfs] btrfs_control_ioctl+0x26e/0x310 [btrfs] __x64_sys_ioctl+0x134/0x1b0 do_syscall_64+0x99/0x190 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Freed by task 3494056: kasan_save_stack+0x33/0x60 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3f/0x60 poison_slab_object+0x102/0x170 __kasan_slab_free+0x32/0x70 kfree+0x11b/0x320 btrfs_rm_dev_replace_free_srcdev+0xca/0x280 [btrfs] btrfs_dev_replace_finishing+0xd7e/0x14f0 [btrfs] btrfs_dev_replace_by_ioctl+0x1286/0x25a0 [btrfs] btrfs_ioctl+0xb27/0x57d0 [btrfs] __x64_sys_ioctl+0x134/0x1b0 do_syscall_64+0x99/0x190 entry_SYSCALL_64_after_hwframe+0x6e/0x76 The buggy address belongs to the object at ffff8881543c8000 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 96 bytes inside of freed 1024-byte region [ffff8881543c8000, ffff8881543c8400) The buggy address belongs to the physical page: page:00000000fe2c1285 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1543c8 head:00000000fe2c1285 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x17ffffc0000840(slab|head|node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 0017ffffc0000840 ffff888100042dc0 ffffea0019e8f200 dead000000000002 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881543c7f00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881543c7f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff8881543c8000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8881543c8080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8881543c8100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb This UAF happens because we're accessing stale zone information of a already removed btrfs_device in do_zone_finish(). The sequence of events is as follows: btrfs_dev_replace_start btrfs_scrub_dev btrfs_dev_replace_finishing btrfs_dev_replace_update_device_in_mapping_tree <-- devices replaced btrfs_rm_dev_replace_free_srcdev btrfs_free_device <-- device freed cleaner_kthread btrfs_delete_unused_bgs btrfs_zone_finish do_zone_finish <-- refers the freed device The reason for this is that we're using a cached pointer to the chunk_map from the block group, but on device replace this cached pointer can contain stale device entries. The staleness comes from the fact, that btrfs_block_group::physical_map is not a pointer to a btrfs_chunk_map but a memory copy of it. Also take the fs_info::dev_replace::rwsem to prevent btrfs_dev_replace_update_device_in_mapping_tree() from changing the device underneath us again. Note: btrfs_dev_replace_update_device_in_mapping_tree() is holding fs_info::mapping_tree_lock, but as this is a spinning read/write lock we cannot take it as the call to blkdev_zone_mgmt() requires a memory allocation which may not sleep. But btrfs_dev_replace_update_device_in_mapping_tree() is always called with the fs_info::dev_replace::rwsem held in write mode. Many thanks to Shinichiro for analyzing the bug. 
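In outline, the fixed do_zone_finish() keeps the device-replace semaphore held while it walks the stripes, and uses the chunk map directly instead of a private copy (condensed sketch of the pattern in the hunks below):

        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

        down_read(&dev_replace->rwsem);         /* no device swap while we look at the map */
        map = block_group->physical_map;        /* a pointer into the chunk map, not a stale copy */
        for (i = 0; i < map->num_stripes; i++) {
                /* ... finish the zone on map->stripes[i].dev ... */
        }
        up_read(&dev_replace->rwsem);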
Reported-by: Shinichiro Kawasaki CC: stable@vger.kernel.org # 6.8 Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3317bebfca95..459d1af02c3c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1561,11 +1561,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) if (!map) return -EINVAL; - cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); - if (!cache->physical_map) { - ret = -ENOMEM; - goto out; - } + cache->physical_map = map; zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); if (!zone_info) { @@ -1677,7 +1673,6 @@ out: } bitmap_free(active); kfree(zone_info); - btrfs_free_chunk_map(map); return ret; } @@ -2162,6 +2157,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret = 0; int i; @@ -2237,6 +2233,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + down_read(&dev_replace->rwsem); map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2251,13 +2248,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ zinfo->zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) + if (ret) { + up_read(&dev_replace->rwsem); return ret; + } if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } + up_read(&dev_replace->rwsem); if (!fully_written) btrfs_dec_block_group_ro(block_group); From 9f7eb8405dcbc79c5434821e9e3e92abe187ee8e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 1 Mar 2024 08:42:13 +0800 Subject: [PATCH 445/496] btrfs: validate device maj:min during open Boris managed to create a device capable of changing its maj:min without altering its device path. Only multi-devices can be scanned. A device that gets scanned and remains in the btrfs kernel cache might end up with an incorrect maj:min. Despite the temp-fsid feature patch did not introduce this bug, it could lead to issues if the above multi-device is converted to a single device with a stale maj:min. Subsequently, attempting to mount the same device with the correct maj:min might mistake it for another device with the same fsid, potentially resulting in wrongly auto-enabling the temp-fsid feature. To address this, this patch validates the device's maj:min at the time of device open and updates it if it has changed since the last scan. 
CC: stable@vger.kernel.org # 6.7+ Fixes: a5b8a5f9f835 ("btrfs: support cloned-device mount capability") Reported-by: Boris Burkov Co-developed-by: Boris Burkov Reviewed-by: Boris Burkov # Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e49935a54da0..c318640b4472 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -692,6 +692,16 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->bdev = bdev_handle->bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); + if (device->devt != device->bdev->bd_dev) { + btrfs_warn(NULL, + "device %s maj:min changed from %d:%d to %d:%d", + device->name->str, MAJOR(device->devt), + MINOR(device->devt), MAJOR(device->bdev->bd_dev), + MINOR(device->bdev->bd_dev)); + + device->devt = device->bdev->bd_dev; + } + fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { From 8a565ec04d6c43f330e7401e5af3458431b29bc6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Mar 2024 11:37:31 +0000 Subject: [PATCH 446/496] btrfs: fix extent map leak in unexpected scenario at unpin_extent_cache() At unpin_extent_cache() if we happen to find an extent map with an unexpected start offset, we jump to the 'out' label and never release the reference we added to the extent map through the call to lookup_extent_mapping(), therefore resulting in a leak. So fix this by moving the free_extent_map() under the 'out' label. Fixes: c03c89f821e5 ("btrfs: handle errors returned from unpin_extent_cache()") Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 347ca13d15a9..e03953dbcd5e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -340,9 +340,9 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) em->mod_len = em->len; } - free_extent_map(em); out: write_unlock(&tree->lock); + free_extent_map(em); return ret; } From 4dc1d69c2b101eee0bf071187794ffed2f9c2596 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Mar 2024 12:49:31 +0000 Subject: [PATCH 447/496] btrfs: fix warning messages not printing interval at unpin_extent_range() At unpin_extent_range() we print warning messages that are supposed to print an interval in the form "[X, Y)", with the first element being an inclusive start offset and the second element being the exclusive end offset of a range. However we end up printing the range's length instead of the range's exclusive end offset, so fix that to avoid having confusing and non-sense messages in case we hit one of these unexpected scenarios. 
Fixes: 00deaf04df35 ("btrfs: log messages at unpin_extent_range() during unexpected cases") Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e03953dbcd5e..2cfc6e8cf76f 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -309,7 +309,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - start, len, gen); + start, start + len, gen); ret = -ENOENT; goto out; } @@ -318,7 +318,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) btrfs_warn(fs_info, "found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", btrfs_ino(inode), btrfs_root_id(inode->root), - em->start, start, len, gen); + em->start, start, start + len, gen); ret = -EUCLEAN; goto out; } From 379c87239320a204138995e1da35ce9eca239e7a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Mar 2024 13:02:02 +0000 Subject: [PATCH 448/496] btrfs: fix message not properly printing interval when adding extent map At btrfs_add_extent_mapping(), if we are unable to merge the existing extent map, we print a warning message that suggests interval ranges in the form "[X, Y)", where the first element is the inclusive start offset of a range and the second element is the exclusive end offset. However we end up printing the length of the ranges instead of the exclusive end offsets. So fix this by printing the range end offsets. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 2cfc6e8cf76f..16685cb8a91d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -634,8 +634,8 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, *em_in = NULL; WARN_ONCE(ret, "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", - existing->start, existing->len, - orig_start, orig_len, start); + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } From 2133460061e1bbecb47da73ad5ec7cf8e951006c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 13 Mar 2024 17:14:02 +0000 Subject: [PATCH 449/496] btrfs: use btrfs_warn() to log message at btrfs_add_extent_mapping() At btrfs_add_extent_mapping(), if we failed to merge the extent map, which is unexpected and theoretically should never happen, we use WARN_ONCE() to log a message which is not great because we don't get information about which filesystem it relates to in case we have multiple btrfs filesystems mounted. So change this to use btrfs_warn() and surround the error check with WARN_ON() so we always get a useful stack trace and the condition is flagged as "unlikely" since it's not expected to ever happen. 
Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 16685cb8a91d..445f7716f1e2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -629,13 +629,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, */ ret = merge_extent_mapping(em_tree, existing, em, start); - if (ret) { + if (WARN_ON(ret)) { free_extent_map(em); *em_in = NULL; - WARN_ONCE(ret, -"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu\n", - existing->start, extent_map_end(existing), - orig_start, orig_start + orig_len, start); + btrfs_warn(fs_info, +"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", + existing->start, extent_map_end(existing), + orig_start, orig_start + orig_len, start); } free_extent_map(existing); } From a8b70c7f8600bc77d03c0b032c0662259b9e615e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 21 Feb 2024 07:35:52 -0800 Subject: [PATCH 450/496] btrfs: zoned: don't skip block groups with 100% zone unusable Commit f4a9f219411f ("btrfs: do not delete unused block group if it may be used soon") changed the behaviour of deleting unused block-groups on zoned filesystems. Starting with this commit, we're using btrfs_space_info_used() to calculate the number of used bytes in a space_info. But btrfs_space_info_used() also accounts btrfs_space_info::bytes_zone_unusable as used bytes. So if a block group is 100% zone_unusable it is skipped from the deletion step. In order not to skip fully zone_unusable block-groups, also check if the block-group has bytes left that can be used on a zoned filesystem. Fixes: f4a9f219411f ("btrfs: do not delete unused block group if it may be used soon") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Filipe Manana Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5f7587ca1ca7..1e09aeea69c2 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1559,7 +1559,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * needing to allocate extents from the block group. */ used = btrfs_space_info_used(space_info, true); - if (space_info->total_bytes - block_group->length < used) { + if (space_info->total_bytes - block_group->length < used && + block_group->zone_unusable < block_group->length) { /* * Add a reference for the list, compensate for the ref * drop under the "next" label for the From 2f1aeab9fca1a5f583be1add175d1ee95c213cfa Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 19 Mar 2024 08:28:18 +0530 Subject: [PATCH 451/496] btrfs: return accurate error code on open failure in open_fs_devices() When attempting to exclusive open a device which has no exclusive open permission, such as a physical device associated with the flakey dm device, the open operation will fail, resulting in a mount failure. In this particular scenario, we erroneously return -EINVAL instead of the correct error code provided by the bdev_open_by_path() function, which is -EBUSY. Fix this, by returning error code from the bdev_open_by_path() function. With this correction, the mount error message will align with that of ext4 and xfs. 
Reviewed-by: Boris Burkov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c318640b4472..dedec3d9b111 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1184,23 +1184,30 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; + int ret = 0; list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { - int ret; + int ret2; - ret = btrfs_open_one_device(fs_devices, device, flags, holder); - if (ret == 0 && + ret2 = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret2 == 0 && (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; - } else if (ret == -ENODATA) { + } else if (ret2 == -ENODATA) { fs_devices->num_devices--; list_del(&device->dev_list); btrfs_free_device(device); } + if (ret == 0 && ret2 != 0) + ret = ret2; } - if (fs_devices->open_devices == 0) + + if (fs_devices->open_devices == 0) { + if (ret) + return ret; return -EINVAL; + } fs_devices->opened = 1; fs_devices->latest_dev = latest_dev; From ef1e68236b9153c27cb7cf29ead0c532870d4215 Mon Sep 17 00:00:00 2001 From: Tavian Barnes Date: Fri, 15 Mar 2024 21:14:29 -0400 Subject: [PATCH 452/496] btrfs: fix race in read_extent_buffer_pages() There are reports from tree-checker that detects corrupted nodes, without any obvious pattern so possibly an overwrite in memory. After some debugging it turns out there's a race when reading an extent buffer the uptodate status can be missed. To prevent concurrent reads for the same extent buffer, read_extent_buffer_pages() performs these checks: /* (1) */ if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; /* (2) */ if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) goto done; At this point, it seems safe to start the actual read operation. Once that completes, end_bbio_meta_read() does /* (3) */ set_extent_buffer_uptodate(eb); /* (4) */ clear_bit(EXTENT_BUFFER_READING, &eb->bflags); Normally, this is enough to ensure only one read happens, and all other callers wait for it to finish before returning. Unfortunately, there is a racey interleaving: Thread A | Thread B | Thread C ---------+----------+--------- (1) | | | (1) | (2) | | (3) | | (4) | | | (2) | | | (1) When this happens, thread B kicks of an unnecessary read. Worse, thread C will see UPTODATE set and return immediately, while the read from thread B is still in progress. This race could result in tree-checker errors like this as the extent buffer is concurrently modified: BTRFS critical (device dm-0): corrupted node, root=256 block=8550954455682405139 owner mismatch, have 11858205567642294356 expect [256, 18446744073709551360] Fix it by testing UPTODATE again after setting the READING bit, and if it's been set, skip the unnecessary read. 
Fixes: d7172f52e993 ("btrfs: use per-buffer locking for extent_buffer reading") Link: https://lore.kernel.org/linux-btrfs/CAHk-=whNdMaN9ntZ47XRKP6DBes2E5w7fi-0U3H2+PS18p+Pzw@mail.gmail.com/ Link: https://lore.kernel.org/linux-btrfs/f51a6d5d7432455a6a858d51b49ecac183e0bbc9.1706312914.git.wqu@suse.com/ Link: https://lore.kernel.org/linux-btrfs/c7241ea4-fcc6-48d2-98c8-b5ea790d6c89@gmx.com/ CC: stable@vger.kernel.org # 6.5+ Reviewed-by: Qu Wenruo Reviewed-by: Christoph Hellwig Signed-off-by: Tavian Barnes Reviewed-by: David Sterba [ minor update of changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7441245b1ceb..61594eaf1f89 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4333,6 +4333,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) goto done; + /* + * Between the initial test_bit(EXTENT_BUFFER_UPTODATE) and the above + * test_and_set_bit(EXTENT_BUFFER_READING), someone else could have + * started and finished reading the same eb. In this case, UPTODATE + * will now be set, and we shouldn't read it in again. + */ + if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); + return 0; + } + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); From f8572367eaff6739e3bc238ba93b86cd7881c0ff Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 13 Mar 2024 17:31:07 -0400 Subject: [PATCH 453/496] mm/memory: fix missing pte marker for !page on pte zaps Commit 0cf18e839f64 of large folio zap work broke uffd-wp. Now mm's uffd unit test "wp-unpopulated" will trigger this WARN_ON_ONCE(). The WARN_ON_ONCE() asserts that an VMA cannot be registered with userfaultfd-wp if it contains a !normal page, but it's actually possible. One example is an anonymous vma, register with uffd-wp, read anything will install a zero page. Then when zap on it, this should trigger. What's more, removing that WARN_ON_ONCE may not be enough either, because we should also not rely on "whether it's a normal page" to decide whether pte marker is needed. For example, one can register wr-protect over some DAX regions to track writes when UFFD_FEATURE_WP_ASYNC enabled, in which case it can have page==NULL for a devmap but we may want to keep the marker around. 
Link: https://lkml.kernel.org/r/20240313213107.235067-1-peterx@redhat.com Fixes: 0cf18e839f64 ("mm/memory: handle !page case in zap_present_pte() separately") Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Muhammad Usama Anjum Signed-off-by: Andrew Morton --- mm/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index f2bc6dd15eb8..904f70b99498 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1536,7 +1536,9 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); arch_check_zapped_pte(vma, ptent); tlb_remove_tlb_entry(tlb, pte, addr); - VM_WARN_ON_ONCE(userfaultfd_wp(vma)); + if (userfaultfd_pte_wp(vma, ptent)) + zap_install_uffd_wp_if_needed(vma, addr, pte, 1, + details, ptent); ksm_might_unmap_zero_page(mm, ptent); return 1; } From 8b65ef5ad4862904e476a8f3d4e4418c950ddb90 Mon Sep 17 00:00:00 2001 From: Vitaly Chikunov Date: Mon, 18 Mar 2024 05:34:44 +0300 Subject: [PATCH 454/496] selftests/mm: Fix build with _FORTIFY_SOURCE Add missing flags argument to open(2) call with O_CREAT. Some tests fail to compile if _FORTIFY_SOURCE is defined (to any valid value) (together with -O), resulting in similar error messages such as: In file included from /usr/include/fcntl.h:342, from gup_test.c:1: In function 'open', inlined from 'main' at gup_test.c:206:10: /usr/include/bits/fcntl2.h:50:11: error: call to '__open_missing_mode' declared with attribute error: open with O_CREAT or O_TMPFILE in second argument needs 3 arguments 50 | __open_missing_mode (); | ^~~~~~~~~~~~~~~~~~~~~~ _FORTIFY_SOURCE is enabled by default in some distributions, so the tests are not built by default and are skipped. open(2) man-page warns about missing flags argument: "if it is not supplied, some arbitrary bytes from the stack will be applied as the file mode." 
Link: https://lkml.kernel.org/r/20240318023445.3192922-1-vt@altlinux.org Fixes: aeb85ed4f41a ("tools/testing/selftests/vm/gup_benchmark.c: allow user specified file") Fixes: fbe37501b252 ("mm: huge_memory: debugfs for file-backed THP split") Fixes: c942f5bd17b3 ("selftests: soft-dirty: add test for mprotect") Signed-off-by: Vitaly Chikunov Reviewed-by: Zi Yan Reviewed-by: David Hildenbrand Cc: Keith Busch Cc: Peter Xu Cc: Yang Shi Cc: Andrea Arcangeli Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 2 +- tools/testing/selftests/mm/soft-dirty.c | 2 +- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index cbe99594d319..18a49c70d4c6 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -203,7 +203,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(nthreads); - filed = open(file, O_RDWR|O_CREAT); + filed = open(file, O_RDWR|O_CREAT, 0664); if (filed < 0) ksft_exit_fail_msg("Unable to open %s: %s\n", file, strerror(errno)); diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index cc5f144430d4..7dbfa53d93a0 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -137,7 +137,7 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) if (!map) ksft_exit_fail_msg("anon mmap failed\n"); } else { - test_fd = open(fname, O_RDWR | O_CREAT); + test_fd = open(fname, O_RDWR | O_CREAT, 0664); if (test_fd < 0) { ksft_test_result_skip("Test %s open() file failed\n", __func__); return; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 856662d2f87a..6c988bd2f335 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -223,7 +223,7 @@ void split_file_backed_thp(void) ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); } - fd = open(testfile, O_CREAT|O_WRONLY); + fd = open(testfile, O_CREAT|O_WRONLY, 0664); if (fd == -1) { ksft_perror("Cannot open testing file"); goto cleanup; From 4624b346cf67400ef46a31771011fb798dd2f999 Mon Sep 17 00:00:00 2001 From: John Sperbeck Date: Sun, 17 Mar 2024 15:15:22 -0700 Subject: [PATCH 455/496] init: open /initrd.image with O_LARGEFILE If initrd data is larger than 2Gb, we'll eventually fail to write to the /initrd.image file when we hit that limit, unless O_LARGEFILE is set. 
Link: https://lkml.kernel.org/r/20240317221522.896040-1-jsperbeck@google.com Signed-off-by: John Sperbeck Cc: Jens Axboe Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/initramfs.c b/init/initramfs.c index da79760b8be3..3127e0bf7bbd 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -682,7 +682,7 @@ static void __init populate_initrd_image(char *err) printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); - file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700); if (IS_ERR(file)) return; From 329003246617dc52064a2dd9be7496c7a186bdac Mon Sep 17 00:00:00 2001 From: Leonard Crestez Date: Sat, 16 Mar 2024 20:28:37 +0200 Subject: [PATCH 456/496] mailmap: update entry for Leonard Crestez Put my personal email first because NXP employment ended some time ago. Also add my old intel email address. Link: https://lkml.kernel.org/r/f568faa0-2380-4e93-a312-b80c1e367645@gmail.com Signed-off-by: Leonard Crestez Cc: Florian Fainelli Signed-off-by: Andrew Morton --- .mailmap | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 2216b5d5c84e..dcc7af7ad4c7 100644 --- a/.mailmap +++ b/.mailmap @@ -340,7 +340,8 @@ Lee Jones Lee Jones Lee Jones Lee Jones -Leonard Crestez Leonard Crestez +Leonard Crestez +Leonard Crestez Leonardo Bras Leonard Göhrs Leonid I Ananiev From 7844c01472119f55bd9a107a4578a6d26be04c46 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Fri, 15 Mar 2024 23:26:10 +0100 Subject: [PATCH 457/496] mm,page_owner: fix recursion Prior to 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") the only place where page_owner could potentially go into recursion due to its need of allocating more memory was in save_stack(), which ends up calling into stackdepot code with the possibility of allocating memory. We made sure to guard against that by signaling that the current task was already in page_owner code, so in case a recursion attempt was made, we could catch that and return dummy_handle. After above commit, a new place in page_owner code was introduced where we could allocate memory, meaning we could go into recursion would we take that path. Make sure to signal that we are in page_owner in that codepath as well. Move the guard code into two helpers {un}set_current_in_page_owner() and use them prior to calling in the two functions that might allocate memory. Link: https://lkml.kernel.org/r/20240315222610.6870-1-osalvador@suse.de Signed-off-by: Oscar Salvador Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") Reviewed-by: Vlastimil Babka Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Marco Elver Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_owner.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index e7139952ffd9..d17d1351ec84 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -54,6 +54,22 @@ static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); +static inline void set_current_in_page_owner(void) +{ + /* + * Avoid recursion. + * + * We might need to allocate more memory from page_owner code, so make + * sure to signal it in order to avoid recursion. 
+ */ + current->in_page_owner = 1; +} + +static inline void unset_current_in_page_owner(void) +{ + current->in_page_owner = 0; +} + static int __init early_page_owner_param(char *buf) { int ret = kstrtobool(buf, &page_owner_enabled); @@ -133,23 +149,16 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) depot_stack_handle_t handle; unsigned int nr_entries; - /* - * Avoid recursion. - * - * Sometimes page metadata allocation tracking requires more - * memory to be allocated: - * - when new stack trace is saved to stack depot - */ if (current->in_page_owner) return dummy_handle; - current->in_page_owner = 1; + set_current_in_page_owner(); nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + unset_current_in_page_owner(); - current->in_page_owner = 0; return handle; } @@ -164,9 +173,13 @@ static void add_stack_record_to_list(struct stack_record *stack_record, gfp_mask &= (GFP_ATOMIC | GFP_KERNEL); gfp_mask |= __GFP_NOWARN; + set_current_in_page_owner(); stack = kmalloc(sizeof(*stack), gfp_mask); - if (!stack) + if (!stack) { + unset_current_in_page_owner(); return; + } + unset_current_in_page_owner(); stack->stack_record = stack_record; stack->next = NULL; From 9cecde80aae0fb0aa44425575d5aca71bc646d89 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Mar 2024 14:08:21 +0000 Subject: [PATCH 458/496] mm: increase folio batch size On a 104 thread, 2 socket Skylake system, Intel report a 4.7% performance reduction with will-it-scale page_fault2. This was due to reducing the size of the batch from 32 to 15. Increasing the folio batch size from 15 to 31 gives a performance increase of 12.5% relative to the original, or 17.2% relative to the reduced performance commit. The penalty of this commit is an additional 128 bytes of stack usage. Six folio_batches are also allocated from percpu memory in cpu_fbatches so that will be an additional 768 bytes of percpu memory (per CPU). Tim Chen originally submitted a patch like this in 2020: https://lore.kernel.org/linux-mm/d1cc9f12a8ad6c2a52cb600d93b06b064f2bbc57.1593205965.git.tim.c.chen@linux.intel.com/ Link: https://lkml.kernel.org/r/20240315140823.2478146-1-willy@infradead.org Fixes: 99fbb6bfc16f ("mm: make folios_put() the basis of release_pages()") Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yujie Liu Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202403151058.7048f6a8-oliver.sang@intel.com Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index fcc06c300a72..5d3a0cccc6bf 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -11,8 +11,8 @@ #include -/* 15 pointers + header align the folio_batch structure to a power of two */ -#define PAGEVEC_SIZE 15 +/* 31 pointers + header align the folio_batch structure to a power of two */ +#define PAGEVEC_SIZE 31 struct folio; From d5d39c707a4cf0bcc84680178677b97aa2cb2627 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 15 Mar 2024 05:55:56 -0400 Subject: [PATCH 459/496] mm: cachestat: fix two shmem bugs When cachestat on shmem races with swapping and invalidation, there are two possible bugs: 1) A swapin error can have resulted in a poisoned swap entry in the shmem inode's xarray. Calling get_shadow_from_swap_cache() on it will result in an out-of-bounds access to swapper_spaces[]. 
Validate the entry with non_swap_entry() before going further. 2) When we find a valid swap entry in the shmem's inode, the shadow entry in the swapcache might not exist yet: swap IO is still in progress and we're before __remove_mapping; swapin, invalidation, or swapoff have removed the shadow from swapcache after we saw the shmem swap entry. This will send a NULL to workingset_test_recent(). The latter purely operates on pointer bits, so it won't crash - node 0, memcg ID 0, eviction timestamp 0, etc. are all valid inputs - but it's a bogus test. In theory that could result in a false "recently evicted" count. Such a false positive wouldn't be the end of the world. But for code clarity and (future) robustness, be explicit about this case. Bail on get_shadow_from_swap_cache() returning NULL. Link: https://lkml.kernel.org/r/20240315095556.GC581298@cmpxchg.org Fixes: cf264e1329fb ("cachestat: implement cachestat syscall") Signed-off-by: Johannes Weiner Reported-by: Chengming Zhou [Bug #1] Reported-by: Jann Horn [Bug #2] Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Cc: [v6.5+] Signed-off-by: Andrew Morton --- mm/filemap.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 7437b2bd75c1..30de18c4fd28 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4197,7 +4197,23 @@ static void filemap_cachestat(struct address_space *mapping, /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); + /* swapin error results in poisoned entry */ + if (non_swap_entry(swp)) + goto resched; + + /* + * Getting a swap entry from the shmem + * inode means we beat + * shmem_unuse(). rcu_read_lock() + * ensures swapoff waits for us before + * freeing the swapper space. However, + * we can race with swapping and + * invalidation, so there might not be + * a shadow in the swapcache (yet). + */ shadow = get_shadow_from_swap_cache(swp); + if (!shadow) + goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset)) From 950bf45d3bbfdb373772ed4d32b5f90e8532fcce Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Fri, 15 Mar 2024 09:22:48 +0800 Subject: [PATCH 460/496] tools/Makefile: remove cgroup target The tools/cgroup directory no longer contains a Makefile. This patch updates the top-level tools/Makefile to remove references to building and installing cgroup components. This change reflects the current structure of the tools directory and fixes the build failure when building tools in the top-level directory. linux/tools$ make cgroup DESCEND cgroup make[1]: *** No targets specified and no makefile found. Stop. 
make: *** [Makefile:73: cgroup] Error 2 Link: https://lkml.kernel.org/r/20240315012249.439639-1-liucong2@kylinos.cn Signed-off-by: Cong Liu Acked-by: Stanislav Fomichev Reviewed-by: Dmitry Rokosov Cc: Cong Liu Signed-off-by: Andrew Morton --- tools/Makefile | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 37e9f6804832..276f5d0d53a4 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -11,7 +11,6 @@ help: @echo '' @echo ' acpi - ACPI tools' @echo ' bpf - misc BPF tools' - @echo ' cgroup - cgroup tools' @echo ' counter - counter tools' @echo ' cpupower - a tool for all things x86 CPU power' @echo ' debugging - tools for debugging' @@ -69,7 +68,7 @@ acpi: FORCE cpupower: FORCE $(call descend,power/$@) -cgroup counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE +counter firewire hv guest bootconfig spi usb virtio mm bpf iio gpio objtool leds wmi pci firmware debugging tracing: FORCE $(call descend,$@) bpf/%: FORCE @@ -116,7 +115,7 @@ freefall: FORCE kvm_stat: FORCE $(call descend,kvm/$@) -all: acpi cgroup counter cpupower gpio hv firewire \ +all: acpi counter cpupower gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ @@ -128,7 +127,7 @@ acpi_install: cpupower_install: $(call descend,power/$(@:_install=),install) -cgroup_install counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: +counter_install firewire_install gpio_install hv_install iio_install perf_install bootconfig_install spi_install usb_install virtio_install mm_install bpf_install objtool_install wmi_install pci_install debugging_install tracing_install: $(call descend,$(@:_install=),install) selftests_install: @@ -155,7 +154,7 @@ freefall_install: kvm_stat_install: $(call descend,kvm/$(@:_install=),install) -install: acpi_install cgroup_install counter_install cpupower_install gpio_install \ +install: acpi_install counter_install cpupower_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install mm_install bpf_install x86_energy_perf_policy_install \ @@ -169,7 +168,7 @@ acpi_clean: cpupower_clean: $(call descend,power/cpupower,clean) -cgroup_clean counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: +counter_clean hv_clean firewire_clean bootconfig_clean spi_clean usb_clean virtio_clean mm_clean wmi_clean bpf_clean iio_clean gpio_clean objtool_clean leds_clean pci_clean firmware_clean debugging_clean tracing_clean: $(call descend,$(@:_clean=),clean) libapi_clean: @@ -209,7 +208,7 @@ freefall_clean: build_clean: $(call descend,build,clean) -clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_clean \ +clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ From c52eb6db7b7dd8b4b338b16c5c37df22a6b08fdf Mon Sep 17 
00:00:00 2001 From: Muhammad Usama Anjum Date: Thu, 14 Mar 2024 14:40:45 +0500 Subject: [PATCH 461/496] selftests: mm: restore settings from only parent process The atexit() is called from parent process as well as forked processes. Hence the child restores the settings at exit while the parent is still executing. Fix this by checking pid of atexit() calling process and only restore THP number from parent process. Link: https://lkml.kernel.org/r/20240314094045.157149-1-usama.anjum@collabora.com Fixes: c23ea61726d5 ("selftests/mm: protection_keys: save/restore nr_hugepages settings") Signed-off-by: Muhammad Usama Anjum Tested-by: Joey Gouly Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/protection_keys.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index f822ae31af22..374a308174d2 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -1745,9 +1745,12 @@ void pkey_setup_shadow(void) shadow_pkey_reg = __read_pkey_reg(); } +pid_t parent_pid; + void restore_settings_atexit(void) { - cat_into_file(buf, "/proc/sys/vm/nr_hugepages"); + if (parent_pid == getpid()) + cat_into_file(buf, "/proc/sys/vm/nr_hugepages"); } void save_settings(void) @@ -1773,6 +1776,7 @@ void save_settings(void) exit(__LINE__); } + parent_pid = getpid(); atexit(restore_settings_atexit); close(fd); } From 9c500835f279e636722bbcafdfe62cc0153ec292 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 19 Mar 2024 12:47:06 +1300 Subject: [PATCH 462/496] mm: zswap: fix kernel BUG in sg_init_one sg_init_one() relies on linearly mapped low memory for the safe utilization of virt_to_page(). Otherwise, we trigger a kernel BUG, kernel BUG at include/linux/scatterlist.h:187! 
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM Modules linked in: CPU: 0 PID: 2997 Comm: syz-executor198 Not tainted 6.8.0-syzkaller #0 Hardware name: ARM-Versatile Express PC is at sg_set_buf include/linux/scatterlist.h:187 [inline] PC is at sg_init_one+0x9c/0xa8 lib/scatterlist.c:143 LR is at sg_init_table+0x2c/0x40 lib/scatterlist.c:128 Backtrace: [<807e16ac>] (sg_init_one) from [<804c1824>] (zswap_decompress+0xbc/0x208 mm/zswap.c:1089) r7:83471c80 r6:def6d08c r5:844847d0 r4:ff7e7ef4 [<804c1768>] (zswap_decompress) from [<804c4468>] (zswap_load+0x15c/0x198 mm/zswap.c:1637) r9:8446eb80 r8:8446eb80 r7:8446eb84 r6:def6d08c r5:00000001 r4:844847d0 [<804c430c>] (zswap_load) from [<804b9644>] (swap_read_folio+0xa8/0x498 mm/page_io.c:518) r9:844ac800 r8:835e6c00 r7:00000000 r6:df955d4c r5:00000001 r4:def6d08c [<804b959c>] (swap_read_folio) from [<804bb064>] (swap_cluster_readahead+0x1c4/0x34c mm/swap_state.c:684) r10:00000000 r9:00000007 r8:df955d4b r7:00000000 r6:00000000 r5:00100cca r4:00000001 [<804baea0>] (swap_cluster_readahead) from [<804bb3b8>] (swapin_readahead+0x68/0x4a8 mm/swap_state.c:904) r10:df955eb8 r9:00000000 r8:00100cca r7:84476480 r6:00000001 r5:00000000 r4:00000001 [<804bb350>] (swapin_readahead) from [<8047cde0>] (do_swap_page+0x200/0xcc4 mm/memory.c:4046) r10:00000040 r9:00000000 r8:844ac800 r7:84476480 r6:00000001 r5:00000000 r4:df955eb8 [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (handle_pte_fault mm/memory.c:5301 [inline]) [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (__handle_mm_fault mm/memory.c:5439 [inline]) [<8047cbe0>] (do_swap_page) from [<8047e6c4>] (handle_mm_fault+0x3d8/0x12b8 mm/memory.c:5604) r10:00000040 r9:842b3900 r8:7eb0d000 r7:84476480 r6:7eb0d000 r5:835e6c00 r4:00000254 [<8047e2ec>] (handle_mm_fault) from [<80215d28>] (do_page_fault+0x148/0x3a8 arch/arm/mm/fault.c:326) r10:00000007 r9:842b3900 r8:7eb0d000 r7:00000207 r6:00000254 r5:7eb0d9b4 r4:df955fb0 [<80215be0>] (do_page_fault) from [<80216170>] (do_DataAbort+0x38/0xa8 arch/arm/mm/fault.c:558) r10:7eb0da7c r9:00000000 r8:80215be0 r7:df955fb0 r6:7eb0d9b4 r5:00000207 r4:8261d0e0 [<80216138>] (do_DataAbort) from [<80200e3c>] (__dabt_usr+0x5c/0x60 arch/arm/kernel/entry-armv.S:427) Exception stack(0xdf955fb0 to 0xdf955ff8) 5fa0: 00000000 00000000 22d5f800 0008d158 5fc0: 00000000 7eb0d9a4 00000000 00000109 00000000 00000000 7eb0da7c 7eb0da3c 5fe0: 00000000 7eb0d9a0 00000001 00066bd4 00000010 ffffffff r8:824a9044 r7:835e6c00 r6:ffffffff r5:00000010 r4:00066bd4 Code: 1a000004 e1822003 e8860094 e89da8f0 (e7f001f2) ---[ end trace 0000000000000000 ]--- ---------------- Code disassembly (best guess): 0: 1a000004 bne 0x18 4: e1822003 orr r2, r2, r3 8: e8860094 stm r6, {r2, r4, r7} c: e89da8f0 ldm sp, {r4, r5, r6, r7, fp, sp, pc} * 10: e7f001f2 udf #18 <-- trapping instruction Consequently, we have two choices: either employ kmap_to_page() alongside sg_set_page(), or resort to copying high memory contents to a temporary buffer residing in low memory. However, considering the introduction of the WARN_ON_ONCE in commit ef6e06b2ef870 ("highmem: fix kmap_to_page() for kmap_local_page() addresses"), which specifically addresses high memory concerns, it appears that memcpy remains the sole viable option. 
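Condensed, the decompress path now bounces through the preallocated low-memory buffer whenever the mapped handle cannot be handed to sg_init_one() directly (simplified; the hunk below also keeps the existing non-sleepable-zpool case):

        src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO);
        if (!virt_addr_valid(src)) {
                /* highmem or vmap address: copy to a linearly mapped buffer */
                memcpy(acomp_ctx->buffer, src, entry->length);
                src = acomp_ctx->buffer;
                zpool_unmap_handle(zpool, entry->handle);
        }
        sg_init_one(&input, src, entry->length);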
Link: https://lkml.kernel.org/r/20240318234706.95347-1-21cnbao@gmail.com Fixes: 270700dd06ca ("mm/zswap: remove the memcpy if acomp is not sleepable") Signed-off-by: Barry Song Reported-by: syzbot+adbc983a1588b7805de3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000bbb3d80613f243a6@google.com/ Tested-by: syzbot+adbc983a1588b7805de3@syzkaller.appspotmail.com Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: Chris Li Cc: Ira Weiny Signed-off-by: Andrew Morton --- mm/zswap.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 9dec853647c8..fc2755b05a73 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1080,7 +1080,17 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) mutex_lock(&acomp_ctx->mutex); src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) { + /* + * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer + * to do crypto_acomp_decompress() which might sleep. In such cases, we must + * resort to copying the buffer to a temporary one. + * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, + * such as a kmap address of high memory or even ever a vmap address. + * However, sg_init_one is only equipped to handle linearly mapped low memory. + * In such cases, we also must copy the buffer to a temporary and lowmem one. + */ + if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || + !virt_addr_valid(src)) { memcpy(acomp_ctx->buffer, src, entry->length); src = acomp_ctx->buffer; zpool_unmap_handle(zpool, entry->handle); @@ -1094,7 +1104,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page) BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); mutex_unlock(&acomp_ctx->mutex); - if (!acomp_ctx->is_sleepable || zpool_can_sleep_mapped(zpool)) + if (src != acomp_ctx->buffer) zpool_unmap_handle(zpool, entry->handle); } From db09f2df916eade885aae63963449666d3a23f8d Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Wed, 20 Mar 2024 02:18:42 +0800 Subject: [PATCH 463/496] MAINTAINERS: remove incorrect M: tag for dm-devel@lists.linux.dev The dm-devel@lists.linux.dev mailing list should only be listed under the L: (List) tag in the MAINTAINERS file. However, it was incorrectly listed under both L: and M: (Maintainers) tags, which is not accurate. Remove the M: tag for dm-devel@lists.linux.dev in the MAINTAINERS file to reflect the correct categorization. Link: https://lkml.kernel.org/r/20240319181842.249547-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Cc: Matthew Sakai Cc: Michael Sclafani Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index aa3b947fb080..079a86e680bc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6173,7 +6173,6 @@ F: include/uapi/linux/dm-*.h DEVICE-MAPPER VDO TARGET M: Matthew Sakai -M: dm-devel@lists.linux.dev L: dm-devel@lists.linux.dev S: Maintained F: Documentation/admin-guide/device-mapper/vdo*.rst From d5aad4c2ca057e760a92a9a7d65bd38d72963f27 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 26 Feb 2024 17:35:41 -0800 Subject: [PATCH 464/496] prctl: generalize PR_SET_MDWE support check to be per-arch Patch series "ARM: prctl: Reject PR_SET_MDWE where not supported". I noticed after a recent kernel update that my ARM926 system started segfaulting on any execve() after calling prctl(PR_SET_MDWE). 
After some investigation it appears that ARMv5 is incapable of providing the appropriate protections for MDWE, since any readable memory is also implicitly executable. The prctl_set_mdwe() function already had some special-case logic added disabling it on PARISC (commit 793838138c15, "prctl: Disable prctl(PR_SET_MDWE) on parisc"); this patch series (1) generalizes that check to use an arch_*() function, and (2) adds a corresponding override for ARM to disable MDWE on pre-ARMv6 CPUs. With the series applied, prctl(PR_SET_MDWE) is rejected on ARMv5 and subsequent execve() calls (as well as mmap(PROT_READ|PROT_WRITE)) can succeed instead of unconditionally failing; on ARMv6 the prctl works as it did previously. [0] https://lore.kernel.org/all/2023112456-linked-nape-bf19@gregkh/ This patch (of 2): There exist systems other than PARISC where MDWE may not be feasible to support; rather than cluttering up the generic code with additional arch-specific logic let's add a generic function for checking MDWE support and allow each arch to override it as needed. Link: https://lkml.kernel.org/r/20240227013546.15769-4-zev@bewilderbeest.net Link: https://lkml.kernel.org/r/20240227013546.15769-5-zev@bewilderbeest.net Signed-off-by: Zev Weiss Acked-by: Helge Deller [parisc] Cc: Borislav Petkov Cc: David Hildenbrand Cc: Florent Revest Cc: "James E.J. Bottomley" Cc: Josh Triplett Cc: Kees Cook Cc: Miguel Ojeda Cc: Mike Rapoport (IBM) Cc: Oleg Nesterov Cc: Ondrej Mosnacek Cc: Rick Edgecombe Cc: Russell King (Oracle) Cc: Sam James Cc: Stefan Roesch Cc: Yang Shi Cc: Yin Fengwei Cc: [6.3+] Signed-off-by: Andrew Morton --- arch/parisc/include/asm/mman.h | 14 ++++++++++++++ include/linux/mman.h | 8 ++++++++ kernel/sys.c | 7 +++++-- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 arch/parisc/include/asm/mman.h diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h new file mode 100644 index 000000000000..47c5a1991d10 --- /dev/null +++ b/arch/parisc/include/asm/mman.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include + +/* PARISC cannot allow mdwe as it needs writable stacks */ +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return false; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported + +#endif /* __ASM_MMAN_H__ */ diff --git a/include/linux/mman.h b/include/linux/mman.h index dc7048824be8..bcb201ab7a41 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -162,6 +162,14 @@ calc_vm_flag_bits(unsigned long flags) unsigned long vm_commit_limit(void); +#ifndef arch_memory_deny_write_exec_supported +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return true; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported +#endif + /* * Denies creating a writable executable mapping or gaining executable permissions. * diff --git a/kernel/sys.c b/kernel/sys.c index f8e543f1e38a..8bb106a56b3a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2408,8 +2408,11 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; - /* PARISC cannot allow mdwe as it needs writable stacks */ - if (IS_ENABLED(CONFIG_PARISC)) + /* + * EOPNOTSUPP might be more appropriate here in principle, but + * existing userspace depends on EINVAL specifically. 
+ */ + if (!arch_memory_deny_write_exec_supported()) return -EINVAL; current_bits = get_current_mdwe(); From 166ce846dc5974a266f6c2a2896dbef5425a6f21 Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Mon, 26 Feb 2024 17:35:42 -0800 Subject: [PATCH 465/496] ARM: prctl: reject PR_SET_MDWE on pre-ARMv6 On v5 and lower CPUs we can't provide MDWE protection, so ensure we fail any attempt to enable it via prctl(PR_SET_MDWE). Previously such an attempt would misleadingly succeed, leading to any subsequent mmap(PROT_READ|PROT_WRITE) or execve() failing unconditionally (the latter somewhat violently via force_fatal_sig(SIGSEGV) due to READ_IMPLIES_EXEC). Link: https://lkml.kernel.org/r/20240227013546.15769-6-zev@bewilderbeest.net Signed-off-by: Zev Weiss Cc: [6.3+] Cc: Borislav Petkov Cc: David Hildenbrand Cc: Florent Revest Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Josh Triplett Cc: Kees Cook Cc: Miguel Ojeda Cc: Mike Rapoport (IBM) Cc: Oleg Nesterov Cc: Ondrej Mosnacek Cc: Rick Edgecombe Cc: Russell King (Oracle) Cc: Sam James Cc: Stefan Roesch Cc: Yang Shi Cc: Yin Fengwei Signed-off-by: Andrew Morton --- arch/arm/include/asm/mman.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 arch/arm/include/asm/mman.h diff --git a/arch/arm/include/asm/mman.h b/arch/arm/include/asm/mman.h new file mode 100644 index 000000000000..2189e507c8e0 --- /dev/null +++ b/arch/arm/include/asm/mman.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_MMAN_H__ +#define __ASM_MMAN_H__ + +#include +#include + +static inline bool arch_memory_deny_write_exec_supported(void) +{ + return cpu_architecture() >= CPU_ARCH_ARMv6; +} +#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported + +#endif /* __ASM_MMAN_H__ */ From 30fb6a8d9e3378919f378f9bf561142b4a6d2637 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 21 Mar 2024 14:25:32 -0400 Subject: [PATCH 466/496] mm: zswap: fix writeback shinker GFP_NOIO/GFP_NOFS recursion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kent forwards this bug report of zswap re-entering the block layer from an IO request allocation and locking up: [10264.128242] sysrq: Show Blocked State [10264.128268] task:kworker/20:0H state:D stack:0 pid:143 tgid:143 ppid:2 flags:0x00004000 [10264.128271] Workqueue: bcachefs_io btree_write_submit [bcachefs] [10264.128295] Call Trace: [10264.128295] [10264.128297] __schedule+0x3e6/0x1520 [10264.128303] schedule+0x32/0xd0 [10264.128304] schedule_timeout+0x98/0x160 [10264.128308] io_schedule_timeout+0x50/0x80 [10264.128309] wait_for_completion_io_timeout+0x7f/0x180 [10264.128310] submit_bio_wait+0x78/0xb0 [10264.128313] swap_writepage_bdev_sync+0xf6/0x150 [10264.128317] zswap_writeback_entry+0xf2/0x180 [10264.128319] shrink_memcg_cb+0xe7/0x2f0 [10264.128322] __list_lru_walk_one+0xb9/0x1d0 [10264.128325] list_lru_walk_one+0x5d/0x90 [10264.128326] zswap_shrinker_scan+0xc4/0x130 [10264.128327] do_shrink_slab+0x13f/0x360 [10264.128328] shrink_slab+0x28e/0x3c0 [10264.128329] shrink_one+0x123/0x1b0 [10264.128331] shrink_node+0x97e/0xbc0 [10264.128332] do_try_to_free_pages+0xe7/0x5b0 [10264.128333] try_to_free_pages+0xe1/0x200 [10264.128334] __alloc_pages_slowpath.constprop.0+0x343/0xde0 [10264.128337] __alloc_pages+0x32d/0x350 [10264.128338] allocate_slab+0x400/0x460 [10264.128339] ___slab_alloc+0x40d/0xa40 [10264.128345] kmem_cache_alloc+0x2e7/0x330 [10264.128348] mempool_alloc+0x86/0x1b0 [10264.128349] bio_alloc_bioset+0x200/0x4f0 
[10264.128352] bio_alloc_clone+0x23/0x60 [10264.128354] alloc_io+0x26/0xf0 [dm_mod 7e9e6b44df4927f93fb3e4b5c782767396f58382] [10264.128361] dm_submit_bio+0xb8/0x580 [dm_mod 7e9e6b44df4927f93fb3e4b5c782767396f58382] [10264.128366] __submit_bio+0xb0/0x170 [10264.128367] submit_bio_noacct_nocheck+0x159/0x370 [10264.128368] bch2_submit_wbio_replicas+0x21c/0x3a0 [bcachefs 85f1b9a7a824f272eff794653a06dde1a94439f2] [10264.128391] btree_write_submit+0x1cf/0x220 [bcachefs 85f1b9a7a824f272eff794653a06dde1a94439f2] [10264.128406] process_one_work+0x178/0x350 [10264.128408] worker_thread+0x30f/0x450 [10264.128409] kthread+0xe5/0x120 The zswap shrinker resumes the swap_writepage()s that were intercepted by the zswap store. This will enter the block layer, and may even enter the filesystem depending on the swap backing file. Make it respect GFP_NOIO and GFP_NOFS. Link: https://lore.kernel.org/linux-mm/rc4pk2r42oyvjo4dc62z6sovquyllq56i5cdgcaqbd7wy3hfzr@n4nbxido3fme/ Link: https://lkml.kernel.org/r/20240321182532.60000-1-hannes@cmpxchg.org Fixes: b5ba474f3f51 ("zswap: shrink zswap pool based on memory pressure") Signed-off-by: Johannes Weiner Reported-by: Kent Overstreet Acked-by: Yosry Ahmed Reported-by: Jérôme Poulin Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Cc: stable@vger.kernel.org [v6.8] Signed-off-by: Andrew Morton --- mm/zswap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index fc2755b05a73..36612f34b5d7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1323,6 +1323,14 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg)) return 0; + /* + * The shrinker resumes swap writeback, which will enter block + * and may enter fs. XXX: Harmonize with vmscan.c __GFP_FS + * rules (may_enter_fs()), which apply on a per-folio basis. + */ + if (!gfp_has_io_fs(sc->gfp_mask)) + return 0; + #ifdef CONFIG_MEMCG_KMEM mem_cgroup_flush_stats(memcg); nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; From 105840ebd76d8dbc1a7d734748ae320076f3201e Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Thu, 21 Mar 2024 23:20:21 +0000 Subject: [PATCH 467/496] selftests/mm: sigbus-wp test requires UFFD_FEATURE_WP_HUGETLBFS_SHMEM The sigbus-wp test requires the UFFD_FEATURE_WP_HUGETLBFS_SHMEM flag for shmem and hugetlb targets. Otherwise it is not backwards compatible with kernels <5.19 and fails with EINVAL. 
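(Illustrative sketch, not part of this patch: declaring the feature bit lets the harness detect missing kernel support up front instead of hitting the EINVAL later on pre-5.19 kernels. The usual probe is the UFFDIO_API handshake; struct/flag names below are the uapi ones from <linux/userfaultfd.h>, the skip path is assumed:

	struct uffdio_api api = { .api = UFFD_API, .features = 0 };

	/* features == 0: ask the kernel which feature bits it supports */
	if (ioctl(uffd, UFFDIO_API, &api))
		err("UFFDIO_API");
	if (!(api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM))
		return 1;	/* hypothetical "skip" result for old kernels */

so shmem/hugetlb write-protect targets can be skipped cleanly where the feature does not exist.)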
Link: https://lkml.kernel.org/r/20240321232023.2064975-1-edliaw@google.com Fixes: 73c1ea939b65 ("selftests/mm: move uffd sig/events tests into uffd unit tests") Signed-off-by: Edward Liaw Cc: Shuah Khan Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 2b9f8cc52639..536e09b03aee 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -1427,7 +1427,8 @@ uffd_test_case_t uffd_tests[] = { .uffd_fn = uffd_sigbus_wp_test, .mem_targets = MEM_ALL, .uffd_feature_required = UFFD_FEATURE_SIGBUS | - UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP, + UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_PAGEFAULT_FLAG_WP | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM, }, { .name = "events", From 0a69b6b3a026543bc215ccc866d0aea5579e6ce2 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Wed, 20 Mar 2024 13:39:59 +0100 Subject: [PATCH 468/496] tmpfs: fix race on handling dquot rbtree A syzkaller reproducer found a race while attempting to remove dquot information from the rb tree. Fetching the rb_tree root node must also be protected by the dqopt->dqio_sem, otherwise, giving the right timing, shmem_release_dquot() will trigger a warning because it couldn't find a node in the tree, when the real reason was the root node changing before the search starts: Thread 1 Thread 2 - shmem_release_dquot() - shmem_{acquire,release}_dquot() - fetch ROOT - Fetch ROOT - acquire dqio_sem - wait dqio_sem - do something, triger a tree rebalance - release dqio_sem - acquire dqio_sem - start searching for the node, but from the wrong location, missing the node, and triggering a warning. 
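Schematically, the fix applies the same pattern in all three lookup paths (get_next_id, acquire, release), as the hunks below show — read side shown here, the other two take the rwsem for writing:

	down_read(&dqopt->dqio_sem);
	node = ((struct rb_root *)info->dqi_priv)->rb_node; /* root loaded under the lock */
	while (node) {
		entry = rb_entry(node, struct quota_id, node);
		/* ... compare ids, descend left or right ... */
	}
	up_read(&dqopt->dqio_sem);

Loading rb_node before down_read() allows a concurrent insert to rebalance the tree and rotate the old root downwards, so the walk can start from a stale interior node and miss an entry that is actually present.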
Link: https://lkml.kernel.org/r/20240320124011.398847-1-cem@kernel.org Fixes: eafc474e2029 ("shmem: prepare shmem quota infrastructure") Signed-off-by: Carlos Maiolino Reported-by: Ubisectech Sirius Reviewed-by: Jan Kara Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton --- mm/shmem_quota.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c index 062d1c1097ae..ce514e700d2f 100644 --- a/mm/shmem_quota.c +++ b/mm/shmem_quota.c @@ -116,7 +116,7 @@ static int shmem_free_file_info(struct super_block *sb, int type) static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) { struct mem_dqinfo *info = sb_dqinfo(sb, qid->type); - struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node *node; qid_t id = from_kqid(&init_user_ns, *qid); struct quota_info *dqopt = sb_dqopt(sb); struct quota_id *entry = NULL; @@ -126,6 +126,7 @@ static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) return -ESRCH; down_read(&dqopt->dqio_sem); + node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); @@ -165,7 +166,7 @@ out_unlock: static int shmem_acquire_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); - struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node **n; struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; struct rb_node *parent = NULL, *new_node = NULL; struct quota_id *new_entry, *entry; @@ -176,6 +177,8 @@ static int shmem_acquire_dquot(struct dquot *dquot) mutex_lock(&dquot->dq_lock); down_write(&dqopt->dqio_sem); + n = &((struct rb_root *)info->dqi_priv)->rb_node; + while (*n) { parent = *n; entry = rb_entry(parent, struct quota_id, node); @@ -264,7 +267,7 @@ static bool shmem_is_empty_dquot(struct dquot *dquot) static int shmem_release_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); - struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node *node; qid_t id = from_kqid(&init_user_ns, dquot->dq_id); struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); struct quota_id *entry = NULL; @@ -275,6 +278,7 @@ static int shmem_release_dquot(struct dquot *dquot) goto out_dqlock; down_write(&dqopt->dqio_sem); + node = ((struct rb_root *)info->dqi_priv)->rb_node; while (node) { entry = rb_entry(node, struct quota_id, node); From 30af24facf0aed12dec23bdf6eac6a907f88306a Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 21 Mar 2024 16:58:18 -0700 Subject: [PATCH 469/496] userfaultfd: fix deadlock warning when locking src and dst VMAs Use down_read_nested() to avoid the warning. 
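Background for the one-liner below (sketch, not from the patch text): the per-VMA lock rwsems all share one lockdep class, so taking the dst VMA's lock and then the src VMA's lock looks to lockdep like recursive acquisition of a single lock and triggers a possible-deadlock splat even though the two rwsems are distinct objects. The conventional annotation for "second instance of the same class, ordering guaranteed by the caller" is the _nested variant with SINGLE_DEPTH_NESTING, schematically:

	down_read(&dst_vma->vm_lock->lock);
	if (dst_vma != src_vma)
		down_read_nested(&src_vma->vm_lock->lock,
				 SINGLE_DEPTH_NESTING);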
Link: https://lkml.kernel.org/r/20240321235818.125118-1-lokeshgidra@google.com Fixes: 867a43a34ff8 ("userfaultfd: use per-vma locks in userfaultfd operations") Reported-by: syzbot+49056626fe41e01f2ba7@syzkaller.appspotmail.com Signed-off-by: Lokesh Gidra Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Hillf Danton Cc: Jann Horn [Bug #2] Cc: Kalesh Singh Cc: Lokesh Gidra Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 712160cd41ec..3c3539c573e7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1444,7 +1444,8 @@ static int uffd_move_lock(struct mm_struct *mm, */ down_read(&(*dst_vmap)->vm_lock->lock); if (*dst_vmap != *src_vmap) - down_read(&(*src_vmap)->vm_lock->lock); + down_read_nested(&(*src_vmap)->vm_lock->lock, + SINGLE_DEPTH_NESTING); } mmap_read_unlock(mm); return err; From 549aa9678a0b3981d4821bf244579d9937650562 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 19 Mar 2024 17:37:46 -0700 Subject: [PATCH 470/496] hexagon: vmlinux.lds.S: handle attributes section After the linked LLVM change, the build fails with CONFIG_LD_ORPHAN_WARN_LEVEL="error", which happens with allmodconfig: ld.lld: error: vmlinux.a(init/main.o):(.hexagon.attributes) is being placed in '.hexagon.attributes' Handle the attributes section in a similar manner as arm and riscv by adding it after the primary ELF_DETAILS grouping in vmlinux.lds.S, which fixes the error. Link: https://lkml.kernel.org/r/20240319-hexagon-handle-attributes-section-vmlinux-lds-s-v1-1-59855dab8872@kernel.org Fixes: 113616ec5b64 ("hexagon: select ARCH_WANT_LD_ORPHAN_WARN") Link: https://github.com/llvm/llvm-project/commit/31f4b329c8234fab9afa59494d7f8bdaeaefeaad Signed-off-by: Nathan Chancellor Reviewed-by: Brian Cain Cc: Bill Wendling Cc: Justin Stitt Cc: Nick Desaulniers Cc: Signed-off-by: Andrew Morton --- arch/hexagon/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S index 1140051a0c45..1150b77fa281 100644 --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -63,6 +63,7 @@ SECTIONS STABS_DEBUG DWARF_DEBUG ELF_DETAILS + .hexagon.attributes 0 : { *(.hexagon.attributes) } DISCARDS } From 8c864371b2a15a23ce35aa7e2bd241baaad6fbe8 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Mon, 25 Mar 2024 19:40:52 +0000 Subject: [PATCH 471/496] selftests/mm: fix ARM related issue with fork after pthread_create Following issue was observed while running the uffd-unit-tests selftest on ARM devices. On x86_64 no issues were detected: pthread_create followed by fork caused deadlock in certain cases wherein fork required some work to be completed by the created thread. Used synchronization to ensure that created thread's start function has started before invoking fork. 
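Schematic of the synchronization added (see the diff below; it assumes C11 <stdatomic.h>, per the atomic_bool refactor noted next):

	static atomic_bool ready_for_fork;

	static void *uffd_poll_thread(void *arg)
	{
		/* ... set up pollfd state ... */
		ready_for_fork = true;	/* started: now safe for the parent to fork */
		for (;;)
			/* poll loop */;
	}

	/* caller */
	ready_for_fork = false;
	pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args);
	while (!ready_for_fork)
		;	/* busy-wait until the monitor thread is running */
	pid = fork();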
[edliaw@google.com: refactored to use atomic_bool] Link: https://lkml.kernel.org/r/20240325194100.775052-1-edliaw@google.com Fixes: 760aee0b71e3 ("selftests/mm: add tests for RO pinning vs fork()") Signed-off-by: Lokesh Gidra Signed-off-by: Edward Liaw Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 3 +++ tools/testing/selftests/mm/uffd-common.h | 2 ++ tools/testing/selftests/mm/uffd-unit-tests.c | 10 ++++++++++ 3 files changed, 15 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index b0ac0ec2356d..7ad6ba660c7d 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -18,6 +18,7 @@ bool test_uffdio_wp = true; unsigned long long *count_verify; uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; +atomic_bool ready_for_fork; static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) { @@ -518,6 +519,8 @@ void *uffd_poll_thread(void *arg) pollfd[1].fd = pipefd[cpu*2]; pollfd[1].events = POLLIN; + ready_for_fork = true; + for (;;) { ret = poll(pollfd, 2, -1); if (ret <= 0) { diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index cb055282c89c..cc5629c3d2aa 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -32,6 +32,7 @@ #include #include #include +#include #include "../kselftest.h" #include "vm_util.h" @@ -103,6 +104,7 @@ extern bool map_shared; extern bool test_uffdio_wp; extern unsigned long long *count_verify; extern volatile bool test_uffdio_copy_eexist; +extern atomic_bool ready_for_fork; extern uffd_test_ops_t anon_uffd_test_ops; extern uffd_test_ops_t shmem_uffd_test_ops; diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 536e09b03aee..21ec23206ab4 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -775,6 +775,8 @@ static void uffd_sigbus_test_common(bool wp) char c; struct uffd_args args = { 0 }; + ready_for_fork = false; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); if (uffd_register(uffd, area_dst, nr_pages * page_size, @@ -790,6 +792,9 @@ static void uffd_sigbus_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ + pid = fork(); if (pid < 0) err("fork"); @@ -829,6 +834,8 @@ static void uffd_events_test_common(bool wp) char c; struct uffd_args args = { 0 }; + ready_for_fork = false; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); if (uffd_register(uffd, area_dst, nr_pages * page_size, true, wp, false)) @@ -838,6 +845,9 @@ static void uffd_events_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); + while (!ready_for_fork) + ; /* Wait for the poll_thread to start executing before forking */ + pid = fork(); if (pid < 0) err("fork"); From 25cd241408a2adc1ed0ebc90ae0793576c111880 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sun, 24 Mar 2024 17:04:47 -0400 Subject: [PATCH 472/496] mm: zswap: fix data loss on SWP_SYNCHRONOUS_IO devices Zhongkun He reports data corruption when combining zswap with zram. The issue is the exclusive loads we're doing in zswap. 
They assume that all reads are going into the swapcache, which can assume authoritative ownership of the data and so the zswap copy can go. However, zram files are marked SWP_SYNCHRONOUS_IO, and faults will try to bypass the swapcache. This results in an optimistic read of the swap data into a page that will be dismissed if the fault fails due to races. In this case, zswap mustn't drop its authoritative copy. Link: https://lore.kernel.org/all/CACSyD1N+dUvsu8=zV9P691B9bVq33erwOXNTmEaUbi9DrDeJzw@mail.gmail.com/ Fixes: b9c91c43412f ("mm: zswap: support exclusive loads") Link: https://lkml.kernel.org/r/20240324210447.956973-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Zhongkun He Tested-by: Zhongkun He Acked-by: Yosry Ahmed Acked-by: Barry Song Reviewed-by: Chengming Zhou Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: [6.5+] Signed-off-by: Andrew Morton --- mm/zswap.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 36612f34b5d7..caed028945b0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1636,6 +1636,7 @@ bool zswap_load(struct folio *folio) swp_entry_t swp = folio->swap; pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; + bool swapcache = folio_test_swapcache(folio); struct zswap_tree *tree = swap_zswap_tree(swp); struct zswap_entry *entry; u8 *dst; @@ -1648,7 +1649,20 @@ bool zswap_load(struct folio *folio) spin_unlock(&tree->lock); return false; } - zswap_rb_erase(&tree->rbroot, entry); + /* + * When reading into the swapcache, invalidate our entry. The + * swapcache can be the authoritative owner of the page and + * its mappings, and the pressure that results from having two + * in-memory copies outweighs any benefits of caching the + * compression work. + * + * (Most swapins go through the swapcache. The notable + * exception is the singleton fault on SWP_SYNCHRONOUS_IO + * files, which reads into a private page and may free it if + * the fault fails. We remain the primary owner of the entry.) + */ + if (swapcache) + zswap_rb_erase(&tree->rbroot, entry); spin_unlock(&tree->lock); if (entry->length) @@ -1663,9 +1677,10 @@ bool zswap_load(struct folio *folio) if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); - zswap_entry_free(entry); - - folio_mark_dirty(folio); + if (swapcache) { + zswap_entry_free(entry); + folio_mark_dirty(folio); + } return true; } From 32fbe5246582af4f611ccccee33fd6e559087252 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 25 Mar 2024 09:50:50 +0800 Subject: [PATCH 473/496] crash: use macro to add crashk_res into iomem early for specific arch There are regression reports[1][2] that crashkernel region on x86_64 can't be added into iomem tree sometime. This causes the later failure of kdump loading. This happened after commit 4a693ce65b18 ("kdump: defer the insertion of crashkernel resources") was merged. Even though, these reported issues are proved to be related to other component, they are just exposed after above commmit applied, I still would like to keep crashk_res and crashk_low_res being added into iomem early as before because the early adding has been always there on x86_64 and working very well. For safety of kdump, Let's change it back. Here, add a macro HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY to limit that only ARCH defining the macro can have the early adding crashk_res/_low_res into iomem. Then define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY on x86 to enable it. 
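For illustration only (hypothetical arch, not part of this patch), any other architecture that wants the early insertion back opts in from its own asm/crash_reserve.h exactly as x86 does below:

	/* arch/foo/include/asm/crash_reserve.h */
	#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY

With the macro defined, kernel/crash_reserve.c inserts crashk_res/crashk_low_res into iomem_resource at reservation time; without it, the insertion stays deferred to the early_initcall() path.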
Note: In reserve_crashkernel_low(), there's a remnant of crashk_low_res handling which was mistakenly added back in commit 85fcde402db1 ("kexec: split crashkernel reservation code out from crash_core.c"). [1] [PATCH V2] x86/kexec: do not update E820 kexec table for setup_data https://lore.kernel.org/all/Zfv8iCL6CT2JqLIC@darkstar.users.ipa.redhat.com/T/#u [2] Question about Address Range Validation in Crash Kernel Allocation https://lore.kernel.org/all/4eeac1f733584855965a2ea62fa4da58@huawei.com/T/#u Link: https://lkml.kernel.org/r/ZgDYemRQ2jxjLkq+@MiWiFi-R3L-srv Fixes: 4a693ce65b18 ("kdump: defer the insertion of crashkernel resources") Signed-off-by: Baoquan He Cc: Dave Young Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiri Bohac Cc: Li Huafei Cc: Signed-off-by: Andrew Morton --- arch/x86/include/asm/crash_reserve.h | 2 ++ kernel/crash_reserve.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/crash_reserve.h b/arch/x86/include/asm/crash_reserve.h index 152239f95541..7835b2cdff04 100644 --- a/arch/x86/include/asm/crash_reserve.h +++ b/arch/x86/include/asm/crash_reserve.h @@ -39,4 +39,6 @@ static inline unsigned long crash_low_size_default(void) #endif } +#define HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + #endif /* _X86_CRASH_RESERVE_H */ diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index bbb6c3cb00e4..066668799f75 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -366,7 +366,9 @@ static int __init reserve_crashkernel_low(unsigned long long low_size) crashk_low_res.start = low_base; crashk_low_res.end = low_base + low_size - 1; +#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY insert_resource(&iomem_resource, &crashk_low_res); +#endif #endif return 0; } @@ -448,8 +450,12 @@ retry: crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; +#ifdef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY + insert_resource(&iomem_resource, &crashk_res); +#endif } +#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { if (crashk_res.start < crashk_res.end) @@ -462,3 +468,4 @@ static __init int insert_crashkernel_resources(void) } early_initcall(insert_crashkernel_resources); #endif +#endif From 7608a971fdeb4c3eefa522d1bfe8d4bc6b2481cc Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:45 +0100 Subject: [PATCH 474/496] tls: recv: process_rx_list shouldn't use an offset with kvec Only MSG_PEEK needs to copy from an offset during the final process_rx_list call, because the bytes we copied at the beginning of tls_sw_recvmsg were left on the rx_list. In the KVEC case, we removed data from the rx_list as we were copying it, so there's no need to use an offset, just like in the normal case. 
Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/e5487514f828e0347d2b92ca40002c62b58af73d.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 211f57164cb6..3cdc6bc9fba6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2152,7 +2152,7 @@ recv_end: } /* Drain records from the rx_list & copy if required */ - if (is_peek || is_kvec) + if (is_peek) err = process_rx_list(ctx, msg, &control, copied + peeked, decrypted - peeked, is_peek, NULL); else From 85eef9a41d019b59be7bc91793f26251909c0710 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:46 +0100 Subject: [PATCH 475/496] tls: adjust recv return with async crypto and failed copy to userspace process_rx_list may not copy as many bytes as we want to the userspace buffer, for example in case we hit an EFAULT during the copy. If this happens, we should only count the bytes that were actually copied, which may be 0. Subtracting async_copy_bytes is correct in both peek and !peek cases, because decrypted == async_copy_bytes + peeked for the peek case: peek is always !ZC, and we can go through either the sync or async path. In the async case, we add chunk to both decrypted and async_copy_bytes. In the sync case, we add chunk to both decrypted and peeked. I missed that in commit 6caaf104423d ("tls: fix peeking with sync+async decryption"). Fixes: 4d42cd6bc2ac ("tls: rx: fix return value for async crypto") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/1b5a1eaab3c088a9dd5d9f1059ceecd7afe888d1.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3cdc6bc9fba6..14faf6189eb1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2158,6 +2158,9 @@ recv_end: else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); + + /* we could have copied less than we wanted, and possibly nothing */ + decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; From dc54b813df63020e946ccdef35b64d4fa99fd622 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:47 +0100 Subject: [PATCH 476/496] selftests: tls: add test with a partially invalid iov Make sure that we don't return more bytes than we actually received if the userspace buffer was bogus. We expect to receive at least the rest of rec1, and possibly some of rec2 (currently, we don't, but that would be ok). 
Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/720e61b3d3eab40af198a58ce2cd1ee019f0ceb1.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 34 +++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index c6eda21cefb6..f27a12d2a2c9 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -1615,6 +1615,40 @@ TEST_F(tls, getsockopt) EXPECT_EQ(errno, EINVAL); } +TEST_F(tls, recv_efault) +{ + char *rec1 = "1111111111"; + char *rec2 = "2222222222"; + struct msghdr hdr = {}; + struct iovec iov[2]; + char recv_mem[12]; + int ret; + + if (self->notls) + SKIP(return, "no TLS support"); + + EXPECT_EQ(send(self->fd, rec1, 10, 0), 10); + EXPECT_EQ(send(self->fd, rec2, 10, 0), 10); + + iov[0].iov_base = recv_mem; + iov[0].iov_len = sizeof(recv_mem); + iov[1].iov_base = NULL; /* broken iov to make process_rx_list fail */ + iov[1].iov_len = 1; + + hdr.msg_iovlen = 2; + hdr.msg_iov = iov; + + EXPECT_EQ(recv(self->cfd, recv_mem, 1, 0), 1); + EXPECT_EQ(recv_mem[0], rec1[0]); + + ret = recvmsg(self->cfd, &hdr, 0); + EXPECT_LE(ret, sizeof(recv_mem)); + EXPECT_GE(ret, 9); + EXPECT_EQ(memcmp(rec1, recv_mem, 9), 0); + if (ret > 9) + EXPECT_EQ(memcmp(rec2, recv_mem + 9, ret - 9), 0); +} + FIXTURE(tls_err) { int fd, cfd; From 417e91e856099e9b8a42a2520e2255e6afe024be Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:48 +0100 Subject: [PATCH 477/496] tls: get psock ref after taking rxlock to avoid leak At the start of tls_sw_recvmsg, we take a reference on the psock, and then call tls_rx_reader_lock. If that fails, we return directly without releasing the reference. Instead of adding a new label, just take the reference after locking has succeeded, since we don't need it before. Fixes: 4cbc325ed6b4 ("tls: rx: allow only one reader at a time") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/fe2ade22d030051ce4c3638704ed58b67d0df643.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 14faf6189eb1..b783231668c6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1976,10 +1976,10 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); - psock = sk_psock_get(sk); err = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT); if (err < 0) return err; + psock = sk_psock_get(sk); bpf_strp_enabled = sk_psock_strp_enabled(psock); /* If crypto failed the connection is broken */ From f7442a634ac06b953fc1f7418f307b25acd4cfbc Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 25 Mar 2024 14:36:27 -0400 Subject: [PATCH 478/496] mlxbf_gige: call request_irq() after NAPI initialized The mlxbf_gige driver encounters a NULL pointer exception in mlxbf_gige_open() when kdump is enabled. 
The sequence to reproduce the exception is as follows: a) enable kdump b) trigger kdump via "echo c > /proc/sysrq-trigger" c) kdump kernel executes d) kdump kernel loads mlxbf_gige module e) the mlxbf_gige module runs its open() as the the "oob_net0" interface is brought up f) mlxbf_gige module will experience an exception during its open(), something like: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000004 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=00000000e29a4000 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000086000004 [#1] SMP CPU: 0 PID: 812 Comm: NetworkManager Tainted: G OE 5.15.0-1035-bluefield #37-Ubuntu Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.6.0.13024 Jan 19 2024 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : 0x0 lr : __napi_poll+0x40/0x230 sp : ffff800008003e00 x29: ffff800008003e00 x28: 0000000000000000 x27: 00000000ffffffff x26: ffff000066027238 x25: ffff00007cedec00 x24: ffff800008003ec8 x23: 000000000000012c x22: ffff800008003eb7 x21: 0000000000000000 x20: 0000000000000001 x19: ffff000066027238 x18: 0000000000000000 x17: ffff578fcb450000 x16: ffffa870b083c7c0 x15: 0000aaab010441d0 x14: 0000000000000001 x13: 00726f7272655f65 x12: 6769675f6662786c x11: 0000000000000000 x10: 0000000000000000 x9 : ffffa870b0842398 x8 : 0000000000000004 x7 : fe5a48b9069706ea x6 : 17fdb11fc84ae0d2 x5 : d94a82549d594f35 x4 : 0000000000000000 x3 : 0000000000400100 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000066027238 Call trace: 0x0 net_rx_action+0x178/0x360 __do_softirq+0x15c/0x428 __irq_exit_rcu+0xac/0xec irq_exit+0x18/0x2c handle_domain_irq+0x6c/0xa0 gic_handle_irq+0xec/0x1b0 call_on_irq_stack+0x20/0x2c do_interrupt_handler+0x5c/0x70 el1_interrupt+0x30/0x50 el1h_64_irq_handler+0x18/0x2c el1h_64_irq+0x7c/0x80 __setup_irq+0x4c0/0x950 request_threaded_irq+0xf4/0x1bc mlxbf_gige_request_irqs+0x68/0x110 [mlxbf_gige] mlxbf_gige_open+0x5c/0x170 [mlxbf_gige] __dev_open+0x100/0x220 __dev_change_flags+0x16c/0x1f0 dev_change_flags+0x2c/0x70 do_setlink+0x220/0xa40 __rtnl_newlink+0x56c/0x8a0 rtnl_newlink+0x58/0x84 rtnetlink_rcv_msg+0x138/0x3c4 netlink_rcv_skb+0x64/0x130 rtnetlink_rcv+0x20/0x30 netlink_unicast+0x2ec/0x360 netlink_sendmsg+0x278/0x490 __sock_sendmsg+0x5c/0x6c ____sys_sendmsg+0x290/0x2d4 ___sys_sendmsg+0x84/0xd0 __sys_sendmsg+0x70/0xd0 __arm64_sys_sendmsg+0x2c/0x40 invoke_syscall+0x78/0x100 el0_svc_common.constprop.0+0x54/0x184 do_el0_svc+0x30/0xac el0_svc+0x48/0x160 el0t_64_sync_handler+0xa4/0x12c el0t_64_sync+0x1a4/0x1a8 Code: bad PC value ---[ end trace 7d1c3f3bf9d81885 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Kernel Offset: 0x2870a7a00000 from 0xffff800008000000 PHYS_OFFSET: 0x80000000 CPU features: 0x0,000005c1,a3332a5a Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- The exception happens because there is a pending RX interrupt before the call to request_irq(RX IRQ) executes. Then, the RX IRQ handler fires immediately after this request_irq() completes. The RX IRQ handler runs "napi_schedule()" before NAPI is fully initialized via "netif_napi_add()" and "napi_enable()", both which happen later in the open() logic. 
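Schematically (driver flow paraphrased, not the literal code), the window in the old open() ordering was:

	mlxbf_gige_open():
		mlxbf_gige_request_irqs(priv);	/* pending RX IRQ fires here */
			-> ISR: napi_schedule(&priv->napi);	/* napi->poll still unset */
		...
		netif_napi_add(...);
		napi_enable(&priv->napi);

which is consistent with net_rx_action() jumping through a NULL poll callback (pc : 0x0 under __napi_poll in the trace above).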
The logic in mlxbf_gige_open() must fully initialize NAPI before any calls to request_irq() execute. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Reviewed-by: Asmaa Mnebhi Link: https://lore.kernel.org/r/20240325183627.7641-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index cef0e2d3f1a7..77134ca92938 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -139,13 +139,10 @@ static int mlxbf_gige_open(struct net_device *netdev) control |= MLXBF_GIGE_CONTROL_PORT_EN; writeq(control, priv->base + MLXBF_GIGE_CONTROL); - err = mlxbf_gige_request_irqs(priv); - if (err) - return err; mlxbf_gige_cache_stats(priv); err = mlxbf_gige_clean_port(priv); if (err) - goto free_irqs; + return err; /* Clear driver's valid_polarity to match hardware, * since the above call to clean_port() resets the @@ -166,6 +163,10 @@ static int mlxbf_gige_open(struct net_device *netdev) napi_enable(&priv->napi); netif_start_queue(netdev); + err = mlxbf_gige_request_irqs(priv); + if (err) + goto napi_deinit; + /* Set bits in INT_EN that we care about */ int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR | MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS | @@ -182,14 +183,17 @@ static int mlxbf_gige_open(struct net_device *netdev) return 0; +napi_deinit: + netif_stop_queue(netdev); + napi_disable(&priv->napi); + netif_napi_del(&priv->napi); + mlxbf_gige_rx_deinit(priv); + tx_deinit: mlxbf_gige_tx_deinit(priv); phy_deinit: phy_stop(phydev); - -free_irqs: - mlxbf_gige_free_irqs(priv); return err; } From ea2c09283b44d1a3732a195a9b257d56779c8863 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 25 Mar 2024 09:25:05 +0100 Subject: [PATCH 479/496] net: wan: framer: Add missing static inline qualifiers Compilation with CONFIG_GENERIC_FRAMER disabled lead to the following warnings: framer.h:184:16: warning: no previous prototype for function 'framer_get' [-Wmissing-prototypes] 184 | struct framer *framer_get(struct device *dev, const char *con_id) framer.h:184:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 184 | struct framer *framer_get(struct device *dev, const char *con_id) framer.h:189:6: warning: no previous prototype for function 'framer_put' [-Wmissing-prototypes] 189 | void framer_put(struct device *dev, struct framer *framer) framer.h:189:1: note: declare 'static' if the function is not intended to be used outside of this translation unit 189 | void framer_put(struct device *dev, struct framer *framer) Add missing 'static inline' qualifiers for these functions. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403241110.hfJqeJRu-lkp@intel.com/ Fixes: 82c944d05b1a ("net: wan: Add framer framework support") Cc: stable@vger.kernel.org Signed-off-by: Herve Codina Reviewed-by: Andy Shevchenko Signed-off-by: David S. 
Miller --- include/linux/framer/framer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h index 9a9b88962c29..2b85fe9e7f9a 100644 --- a/include/linux/framer/framer.h +++ b/include/linux/framer/framer.h @@ -181,12 +181,12 @@ static inline int framer_notifier_unregister(struct framer *framer, return -ENOSYS; } -struct framer *framer_get(struct device *dev, const char *con_id) +static inline struct framer *framer_get(struct device *dev, const char *con_id) { return ERR_PTR(-ENOSYS); } -void framer_put(struct device *dev, struct framer *framer) +static inline void framer_put(struct device *dev, struct framer *framer) { } From afbf75e8da8ce8a0698212953d350697bb4355a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 25 Mar 2024 08:56:11 -0700 Subject: [PATCH 480/496] selftests: netdevsim: set test timeout to 10 minutes The longest running netdevsim test, nexthop.sh, currently takes 5 min to finish. Around 260s to be exact, and 310s on a debug kernel. The default timeout in selftest is 45sec, so we need an explicit config. Give ourselves some headroom and use 10min. Commit under Fixes isn't really to "blame" but prior to that netdevsim tests weren't integrated with kselftest infra so blaming the tests themselves doesn't seem right, either. Fixes: 8ff25dac88f6 ("netdevsim: add Makefile for selftests") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- tools/testing/selftests/drivers/net/netdevsim/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/drivers/net/netdevsim/settings diff --git a/tools/testing/selftests/drivers/net/netdevsim/settings b/tools/testing/selftests/drivers/net/netdevsim/settings new file mode 100644 index 000000000000..a62d2fa1275c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/settings @@ -0,0 +1 @@ +timeout=600 From 96b98a6552a90690d7bc18dd71b66312c9ded1fb Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 19 Mar 2024 13:31:52 +0530 Subject: [PATCH 481/496] bpf: fix warning for crash_kexec With [1], crash dump specific code is moved out of CONFIG_KEXEC_CORE and placed under CONFIG_CRASH_DUMP, where it is more appropriate. And since CONFIG_KEXEC & !CONFIG_CRASH_DUMP build option is supported with that, it led to the below warning: "WARN: resolve_btfids: unresolved symbol crash_kexec" Fix it by using the appropriate #ifdef. 
[1] https://lore.kernel.org/all/20240124051254.67105-1-bhe@redhat.com/ Acked-by: Baoquan He Fixes: 02aff8480533 ("crash: split crash dumping code out from kexec_core.c") Acked-by: Jiri Olsa Acked-by: Stanislav Fomichev Signed-off-by: Hari Bathini Link: https://lore.kernel.org/r/20240319080152.36987-1-hbathini@linux.ibm.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a89587859571..449b9a5d3fe3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2548,7 +2548,7 @@ __bpf_kfunc void bpf_throw(u64 cookie) __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) From 5b4cdd9c5676559b8a7c944ac5269b914b8c0bb8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 26 Mar 2024 14:59:48 -0700 Subject: [PATCH 482/496] Fix memory leak in posix_clock_open() If the clk ops.open() function returns an error, we don't release the pccontext we allocated for this clock. Re-organize the code slightly to make it all more obvious. Reported-by: Rohit Keshri Acked-by: Oleg Nesterov Fixes: 60c6946675fc ("posix-clock: introduce posix_clock_context concept") Cc: Jakub Kicinski Cc: David S. Miller Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/time/posix-clock.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9de66bbbb3d1..4782edcbe7b9 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -129,15 +129,17 @@ static int posix_clock_open(struct inode *inode, struct file *fp) goto out; } pccontext->clk = clk; - fp->private_data = pccontext; - if (clk->ops.open) + if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); - else - err = 0; - - if (!err) { - get_device(clk->dev); + if (err) { + kfree(pccontext); + goto out; + } } + + fp->private_data = pccontext; + get_device(clk->dev); + err = 0; out: up_read(&clk->rwsem); return err; From 498e47cd1d1f3e0a870a29d1b28093e64db52fd2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 27 Mar 2024 09:48:47 -0700 Subject: [PATCH 483/496] Fix build errors due to new UIO_MEM_DMA_COHERENT mess Commit 576882ef5e7f ("uio: introduce UIO_MEM_DMA_COHERENT type") introduced a new use-case for 'struct uio_mem' where the 'mem' field now contains a kernel virtual address when 'memtype' is set to UIO_MEM_DMA_COHERENT. That in turn causes build errors, because 'mem' is of type 'phys_addr_t', and a virtual address is a pointer type. When the code just blindly uses cast to mix the two, it caused problems when phys_addr_t isn't the same size as a pointer - notably on 32-bit architectures with PHYS_ADDR_T_64BIT. The proper thing to do would probably be to use a union member, and not have any casts, and make the 'mem' member be a union of 'mem.physaddr' and 'mem.vaddr', based on 'memtype'. This is not that proper thing. This is just fixing the ugly casts to be even uglier, but at least not cause build errors on 32-bit platforms with 64-bit physical addresses. 
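A minimal illustration of the failure mode being papered over (not taken from the patch): with CONFIG_PHYS_ADDR_T_64BIT on a 32-bit platform, phys_addr_t is 64 bits wide while void * is 32, so the straight cast

	void *addr = (void *)mem->addr;	/* cast to pointer from integer of different size */

warns (-Wint-to-pointer-cast), and -Werror configurations turn that into the reported build errors. Going through uintptr_t makes the truncation explicit and compiles on every configuration:

	void *addr = (void *)(uintptr_t)mem->addr;

This is tolerable for UIO_MEM_DMA_COHERENT only because the value stored in mem->addr there is a kernel virtual address, which by definition fit in a pointer when it was stored.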
Reported-by: Guenter Roeck Fixes: 576882ef5e7f ("uio: introduce UIO_MEM_DMA_COHERENT type") Fixes: 7722151e4651 ("uio_pruss: UIO_MEM_DMA_COHERENT conversion") Fixes: 019947805a8d ("uio_dmem_genirq: UIO_MEM_DMA_COHERENT conversion") Cc: Greg Kroah-Hartman Cc: Chris Leech Cc: Nilesh Javali Cc: Christoph Hellwig Signed-off-by: Linus Torvalds --- drivers/uio/uio.c | 2 +- drivers/uio/uio_dmem_genirq.c | 4 ++-- drivers/uio/uio_pruss.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index bb77de6fa067..009158fef2a8 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -792,7 +792,7 @@ static int uio_mmap_dma_coherent(struct vm_area_struct *vma) */ vma->vm_pgoff = 0; - addr = (void *)mem->addr; + addr = (void *)(uintptr_t)mem->addr; ret = dma_mmap_coherent(mem->dma_device, vma, addr, diff --git a/drivers/uio/uio_dmem_genirq.c b/drivers/uio/uio_dmem_genirq.c index d5f9384df125..13cc35ab5d29 100644 --- a/drivers/uio/uio_dmem_genirq.c +++ b/drivers/uio/uio_dmem_genirq.c @@ -60,7 +60,7 @@ static int uio_dmem_genirq_open(struct uio_info *info, struct inode *inode) addr = dma_alloc_coherent(&priv->pdev->dev, uiomem->size, &uiomem->dma_addr, GFP_KERNEL); - uiomem->addr = addr ? (phys_addr_t) addr : DMEM_MAP_ERROR; + uiomem->addr = addr ? (uintptr_t) addr : DMEM_MAP_ERROR; ++uiomem; } priv->refcnt++; @@ -89,7 +89,7 @@ static int uio_dmem_genirq_release(struct uio_info *info, struct inode *inode) break; if (uiomem->addr) { dma_free_coherent(uiomem->dma_device, uiomem->size, - (void *) uiomem->addr, + (void *) (uintptr_t) uiomem->addr, uiomem->dma_addr); } uiomem->addr = DMEM_MAP_ERROR; diff --git a/drivers/uio/uio_pruss.c b/drivers/uio/uio_pruss.c index 72b33f7d4c40..f67881cba645 100644 --- a/drivers/uio/uio_pruss.c +++ b/drivers/uio/uio_pruss.c @@ -191,7 +191,7 @@ static int pruss_probe(struct platform_device *pdev) p->mem[1].size = sram_pool_sz; p->mem[1].memtype = UIO_MEM_PHYS; - p->mem[2].addr = (phys_addr_t) gdev->ddr_vaddr; + p->mem[2].addr = (uintptr_t) gdev->ddr_vaddr; p->mem[2].dma_addr = gdev->ddr_paddr; p->mem[2].size = extram_pool_sz; p->mem[2].memtype = UIO_MEM_DMA_COHERENT; From a8d89feba7e54e691ca7c4efc2a6264fa83f3687 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 26 Mar 2024 22:42:44 -0400 Subject: [PATCH 484/496] bpf: Check bloom filter map value size This patch adds a missing check to bloom filter creating, rejecting values above KMALLOC_MAX_SIZE. This brings the bloom map in line with many other map types. The lack of this protection can cause kernel crashes for value sizes that overflow int's. Such a crash was caught by syzkaller. The next patch adds more guard-rails at a lower level. Signed-off-by: Andrei Matei Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240327024245.318299-2-andreimatei1@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bloom_filter.c | 13 +++++++++++++ .../selftests/bpf/prog_tests/bloom_filter_map.c | 6 ++++++ 2 files changed, 19 insertions(+) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index addf3dd57b59..35e1ddca74d2 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -80,6 +80,18 @@ static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key return -EOPNOTSUPP; } +/* Called from syscall */ +static int bloom_map_alloc_check(union bpf_attr *attr) +{ + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. 
+ */ + return -E2BIG; + + return 0; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -191,6 +203,7 @@ static u64 bloom_map_mem_usage(const struct bpf_map *map) BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter) const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bloom_map_alloc_check, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, .map_get_next_key = bloom_map_get_next_key, diff --git a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c index 053f4d6da77a..cc184e4420f6 100644 --- a/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c +++ b/tools/testing/selftests/bpf/prog_tests/bloom_filter_map.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021 Facebook */ #include +#include #include #include "bloom_filter_map.skel.h" @@ -21,6 +22,11 @@ static void test_fail_cases(void) if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value size 0")) close(fd); + /* Invalid value size: too big */ + fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, INT32_MAX, 100, NULL); + if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid value too large")) + close(fd); + /* Invalid max entries size */ fd = bpf_map_create(BPF_MAP_TYPE_BLOOM_FILTER, NULL, 0, sizeof(value), 0, NULL); if (!ASSERT_LT(fd, 0, "bpf_map_create bloom filter invalid max entries size")) From ecc6a2101840177e57c925c102d2d29f260d37c8 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 26 Mar 2024 22:42:45 -0400 Subject: [PATCH 485/496] bpf: Protect against int overflow for stack access size This patch re-introduces protection against the size of access to stack memory being negative; the access size can appear negative as a result of overflowing its signed int representation. This should not actually happen, as there are other protections along the way, but we should protect against it anyway. One code path was missing such protections (fixed in the previous patch in the series), causing out-of-bounds array accesses in check_stack_range_initialized(). This patch causes the verification of a program with such a non-sensical access size to fail. This check used to exist in a more indirect way, but was inadvertendly removed in a833a17aeac7. Fixes: a833a17aeac7 ("bpf: Fix verification of indirect var-off stack access") Reported-by: syzbot+33f4297b5f927648741a@syzkaller.appspotmail.com Reported-by: syzbot+aafd0513053a1cbf52ef@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/CAADnVQLORV5PT0iTAhRER+iLBTkByCYNBYyvBSgjN1T31K+gOw@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Andrei Matei Link: https://lore.kernel.org/r/20240327024245.318299-3-andreimatei1@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0bfc0050db28..353985b2b6a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6701,6 +6701,11 @@ static int check_stack_access_within_bounds( err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ + if (!err && access_size < 0) + /* access_size should not be negative (or overflow an int); others checks + * along the way should have prevented such an access. + */ + err = -EFAULT; /* invalid negative access size; integer overflow? 
*/ if (err) { if (tnum_is_const(reg->var_off)) { From 4dd651076ef0e5f09940f763a1b4e8a209dab7ab Mon Sep 17 00:00:00 2001 From: Matt Bobrowski Date: Tue, 26 Mar 2024 19:50:19 +0000 Subject: [PATCH 486/496] bpf: update BPF LSM designated reviewer list Adding myself in place of both Brendan and Florent as both have since moved on from working on the BPF LSM and will no longer be devoting their time to maintaining the BPF LSM. Signed-off-by: Matt Bobrowski Acked-by: KP Singh Link: https://lore.kernel.org/r/ZgMhWF_egdYF8t4D@google.com Signed-off-by: Alexei Starovoitov --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 272dba71f6d9..9372dcff0f35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3941,8 +3941,7 @@ F: kernel/bpf/ringbuf.c BPF [SECURITY & LSM] (Security Audit and Enforcement using BPF) M: KP Singh -R: Florent Revest -R: Brendan Jackman +R: Matt Bobrowski L: bpf@vger.kernel.org S: Maintained F: Documentation/bpf/prog_lsm.rst From b32ca27fa238ff83427d23bef2a5b741e2a88a1e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:27:50 +0100 Subject: [PATCH 487/496] netfilter: nf_tables: reject destroy command to remove basechain hooks Report EOPNOTSUPP if NFT_MSG_DESTROYCHAIN is used to delete hooks in an existing netdev basechain, thus, only NFT_MSG_DELCHAIN is allowed. Fixes: 7d937b107108f ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5fa3d3540c93..a1a8030e16a5 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2944,7 +2944,8 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { From 1e1fb6f00f52812277963365d9bd835b9b0ea4e0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:27:59 +0100 Subject: [PATCH 488/496] netfilter: nf_tables: reject table flag and netdev basechain updates netdev basechain updates are stored in the transaction object hook list. When setting on the table dormant flag, it iterates over the existing hooks in the basechain. Thus, skipping the hooks that are being added/deleted in this transaction, which leaves hook registration in inconsistent state. Reject table flag updates in combination with netdev basechain updates in the same batch: - Update table flags and add/delete basechain: Check from basechain update path if there are pending flag updates for this table. - add/delete basechain and update table flags: Iterate over the transaction list to search for basechain updates from the table update path. In both cases, the batch is rejected. Based on suggestion from Florian Westphal. 
Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Fixes: 7d937b107108f ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a1a8030e16a5..086d85fffeb1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1200,6 +1200,25 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) __NFT_TABLE_F_WAS_AWAKEN | \ __NFT_TABLE_F_WAS_ORPHAN) +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if ((trans->msg_type == NFT_MSG_NEWCHAIN || + trans->msg_type == NFT_MSG_DELCHAIN) && + trans->ctx.table == ctx->table && + nft_trans_chain_update(trans)) + return true; + } + + return false; +} + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -1226,7 +1245,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ - if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, @@ -2631,6 +2650,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } } + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { @@ -2860,6 +2886,9 @@ static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_trans *trans; int err; + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) From 216e7bf7402caf73f4939a8e0248392e96d7c0da Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:28:07 +0100 Subject: [PATCH 489/496] netfilter: nf_tables: skip netdev hook unregistration if table is dormant Skip hook unregistration when adding or deleting devices from an existing netdev basechain. Otherwise, commit/abort path try to unregister hooks which not enabled. 
Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 086d85fffeb1..fd86f2720c9e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10212,9 +10212,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, &nft_trans_chain_hooks(trans)); - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } } else { nft_chain_del(trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, @@ -10490,9 +10492,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); From 15fba562f7a9f04322b8bfc8f392e04bb93d81be Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 25 Mar 2024 21:15:52 -0700 Subject: [PATCH 490/496] netfilter: arptables: Select NETFILTER_FAMILY_ARP when building arp_tables.c syzkaller started to report a warning below [0] after consuming the commit 4654467dc7e1 ("netfilter: arptables: allow xtables-nft only builds"). The change accidentally removed the dependency on NETFILTER_FAMILY_ARP from IP_NF_ARPTABLES. If NF_TABLES_ARP is not enabled on Kconfig, NETFILTER_FAMILY_ARP will be removed and some code necessary for arptables will not be compiled. $ grep -E "(NETFILTER_FAMILY_ARP|IP_NF_ARPTABLES|NF_TABLES_ARP)" .config CONFIG_NETFILTER_FAMILY_ARP=y # CONFIG_NF_TABLES_ARP is not set CONFIG_IP_NF_ARPTABLES=y $ make olddefconfig $ grep -E "(NETFILTER_FAMILY_ARP|IP_NF_ARPTABLES|NF_TABLES_ARP)" .config # CONFIG_NF_TABLES_ARP is not set CONFIG_IP_NF_ARPTABLES=y So, when nf_register_net_hooks() is called for arptables, it will trigger the splat below. Now IP_NF_ARPTABLES is only enabled by IP_NF_ARPFILTER, so let's restore the dependency on NETFILTER_FAMILY_ARP in IP_NF_ARPFILTER. 
[0]: WARNING: CPU: 0 PID: 242 at net/netfilter/core.c:316 nf_hook_entry_head+0x1e1/0x2c0 net/netfilter/core.c:316 Modules linked in: CPU: 0 PID: 242 Comm: syz-executor.0 Not tainted 6.8.0-12821-g537c2e91d354 #10 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_hook_entry_head+0x1e1/0x2c0 net/netfilter/core.c:316 Code: 83 fd 04 0f 87 bc 00 00 00 e8 5b 84 83 fd 4d 8d ac ec a8 0b 00 00 e8 4e 84 83 fd 4c 89 e8 5b 5d 41 5c 41 5d c3 e8 3f 84 83 fd <0f> 0b e8 38 84 83 fd 45 31 ed 5b 5d 4c 89 e8 41 5c 41 5d c3 e8 26 RSP: 0018:ffffc90000b8f6e8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000003 RCX: ffffffff83c42164 RDX: ffff888106851180 RSI: ffffffff83c42321 RDI: 0000000000000005 RBP: 0000000000000000 R08: 0000000000000005 R09: 000000000000000a R10: 0000000000000003 R11: ffff8881055c2f00 R12: ffff888112b78000 R13: 0000000000000000 R14: ffff8881055c2f00 R15: ffff8881055c2f00 FS: 00007f377bd78800(0000) GS:ffff88811b000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000496068 CR3: 000000011298b003 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: __nf_register_net_hook+0xcd/0x7a0 net/netfilter/core.c:428 nf_register_net_hook+0x116/0x170 net/netfilter/core.c:578 nf_register_net_hooks+0x5d/0xc0 net/netfilter/core.c:594 arpt_register_table+0x250/0x420 net/ipv4/netfilter/arp_tables.c:1553 arptable_filter_table_init+0x41/0x60 net/ipv4/netfilter/arptable_filter.c:39 xt_find_table_lock+0x2e9/0x4b0 net/netfilter/x_tables.c:1260 xt_request_find_table_lock+0x2b/0xe0 net/netfilter/x_tables.c:1285 get_info+0x169/0x5c0 net/ipv4/netfilter/arp_tables.c:808 do_arpt_get_ctl+0x3f9/0x830 net/ipv4/netfilter/arp_tables.c:1444 nf_getsockopt+0x76/0xd0 net/netfilter/nf_sockopt.c:116 ip_getsockopt+0x17d/0x1c0 net/ipv4/ip_sockglue.c:1777 tcp_getsockopt+0x99/0x100 net/ipv4/tcp.c:4373 do_sock_getsockopt+0x279/0x360 net/socket.c:2373 __sys_getsockopt+0x115/0x1e0 net/socket.c:2402 __do_sys_getsockopt net/socket.c:2412 [inline] __se_sys_getsockopt net/socket.c:2409 [inline] __x64_sys_getsockopt+0xbd/0x150 net/socket.c:2409 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x4f/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x46/0x4e RIP: 0033:0x7f377beca6fe Code: 1f 44 00 00 48 8b 15 01 97 0a 00 f7 d8 64 89 02 b8 ff ff ff ff eb b8 0f 1f 44 00 00 f3 0f 1e fa 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 c9 RSP: 002b:00000000005df728 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 00000000004966e0 RCX: 00007f377beca6fe RDX: 0000000000000060 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 000000000042938a R08: 00000000005df73c R09: 00000000005df800 R10: 00000000004966e8 R11: 0000000000000246 R12: 0000000000000003 R13: 0000000000496068 R14: 0000000000000003 R15: 00000000004bc9d8 Fixes: 4654467dc7e1 ("netfilter: arptables: allow xtables-nft only builds") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 8f6e950163a7..1b991b889506 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -329,6 +329,7 @@ config NFT_COMPAT_ARP config IP_NF_ARPFILTER tristate "arptables-legacy packet filtering support" select IP_NF_ARPTABLES + select NETFILTER_FAMILY_ARP depends on NETFILTER_XTABLES 
 	help
 	  ARP packet filtering defines a table `filter', which has a series of

From 6a4aee277740d04ac0fd54cfa17cc28261932ddc Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Mon, 25 Mar 2024 20:06:19 +0100
Subject: [PATCH 491/496] net: phy: qcom: at803x: fix kernel panic with at8031_probe

While reworking and splitting the at803x driver, the split of the at803x
PHY probe functions introduced a NULL dereference bug: in at8031_probe(),
priv is read from phydev->priv before it is actually allocated, and the
is_1000basex and is_fiber flags are then written through that stale
pointer, i.e. to the wrong address.

Fix this by setting the priv local variable only after at803x_probe() is
called and has actually allocated priv in the phydev struct.

Reported-by: William Wortel
Cc:
Fixes: 25d2ba94005f ("net: phy: at803x: move specific at8031 probe mode check to dedicated probe")
Signed-off-by: Christian Marangi
Reviewed-by: Andrew Lunn
Link: https://lore.kernel.org/r/20240325190621.2665-1-ansuelsmth@gmail.com
Signed-off-by: Paolo Abeni
---
 drivers/net/phy/qcom/at803x.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c
index 4717c59d51d0..e79657f76bea 100644
--- a/drivers/net/phy/qcom/at803x.c
+++ b/drivers/net/phy/qcom/at803x.c
@@ -797,7 +797,7 @@ static int at8031_parse_dt(struct phy_device *phydev)
 
 static int at8031_probe(struct phy_device *phydev)
 {
-	struct at803x_priv *priv = phydev->priv;
+	struct at803x_priv *priv;
 	int mode_cfg;
 	int ccr;
 	int ret;
@@ -806,6 +806,8 @@ static int at8031_probe(struct phy_device *phydev)
 	if (ret)
 		return ret;
 
+	priv = phydev->priv;
+
 	/* Only supported on AR8031/AR8033, the AR8030/AR8035 use strapping
 	 * options.
 	 */

From dfd222e2aef68818320a57b13a1c52a44c22bc80 Mon Sep 17 00:00:00 2001
From: Justin Chen
Date: Mon, 25 Mar 2024 12:30:24 -0700
Subject: [PATCH 492/496] net: bcmasp: Bring up unimac after PHY link up

The unimac requires the PHY RX clk during reset or it may be put into a
bad state. Bring up the unimac after link up to ensure the PHY RX clk
exists.

Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller")
Signed-off-by: Justin Chen
Signed-off-by: Paolo Abeni
---
 .../net/ethernet/broadcom/asp2/bcmasp_intf.c | 28 +++++++++++++------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
index dd06b68b33ed..34e5156762a8 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
@@ -392,7 +392,9 @@ static void umac_reset(struct bcmasp_intf *intf)
 	umac_wl(intf, 0x0, UMC_CMD);
 	umac_wl(intf, UMC_CMD_SW_RESET, UMC_CMD);
 	usleep_range(10, 100);
-	umac_wl(intf, 0x0, UMC_CMD);
+	/* We hold the umac in reset and bring it out of
+	 * reset when phy link is up.
+ */ } static void umac_set_hw_addr(struct bcmasp_intf *intf, @@ -412,6 +414,8 @@ static void umac_enable_set(struct bcmasp_intf *intf, u32 mask, u32 reg; reg = umac_rl(intf, UMC_CMD); + if (reg & UMC_CMD_SW_RESET) + return; if (enable) reg |= mask; else @@ -430,7 +434,6 @@ static void umac_init(struct bcmasp_intf *intf) umac_wl(intf, 0x800, UMC_FRM_LEN); umac_wl(intf, 0xffff, UMC_PAUSE_CNTRL); umac_wl(intf, 0x800, UMC_RX_MAX_PKT_SZ); - umac_enable_set(intf, UMC_CMD_PROMISC, 1); } static int bcmasp_tx_poll(struct napi_struct *napi, int budget) @@ -658,6 +661,12 @@ static void bcmasp_adj_link(struct net_device *dev) UMC_CMD_HD_EN | UMC_CMD_RX_PAUSE_IGNORE | UMC_CMD_TX_PAUSE_IGNORE); reg |= cmd_bits; + if (reg & UMC_CMD_SW_RESET) { + reg &= ~UMC_CMD_SW_RESET; + umac_wl(intf, reg, UMC_CMD); + udelay(2); + reg |= UMC_CMD_TX_EN | UMC_CMD_RX_EN | UMC_CMD_PROMISC; + } umac_wl(intf, reg, UMC_CMD); active = phy_init_eee(phydev, 0) >= 0; @@ -1045,9 +1054,6 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) umac_init(intf); - /* Disable the UniMAC RX/TX */ - umac_enable_set(intf, (UMC_CMD_RX_EN | UMC_CMD_TX_EN), 0); - umac_set_hw_addr(intf, dev->dev_addr); intf->old_duplex = -1; @@ -1062,9 +1068,6 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) netif_napi_add(intf->ndev, &intf->rx_napi, bcmasp_rx_poll); bcmasp_enable_rx(intf, 1); - /* Turn on UniMAC TX/RX */ - umac_enable_set(intf, (UMC_CMD_RX_EN | UMC_CMD_TX_EN), 1); - intf->crc_fwd = !!(umac_rl(intf, UMC_CMD) & UMC_CMD_CRC_FWD); bcmasp_netif_start(dev); @@ -1306,7 +1309,14 @@ static void bcmasp_suspend_to_wol(struct bcmasp_intf *intf) if (intf->wolopts & WAKE_FILTER) bcmasp_netfilt_suspend(intf); - /* UniMAC receive needs to be turned on */ + /* Bring UniMAC out of reset if needed and enable RX */ + reg = umac_rl(intf, UMC_CMD); + if (reg & UMC_CMD_SW_RESET) + reg &= ~UMC_CMD_SW_RESET; + + reg |= UMC_CMD_RX_EN | UMC_CMD_PROMISC; + umac_wl(intf, reg, UMC_CMD); + umac_enable_set(intf, UMC_CMD_RX_EN, 1); if (intf->parent->wol_irq > 0) { From 4494c10e007121de6d3fbef909d38b4a64087239 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Mon, 25 Mar 2024 12:30:25 -0700 Subject: [PATCH 493/496] net: bcmasp: Remove phy_{suspend/resume} phy_{suspend/resume} is redundant. It gets called from phy_{stop/start}. 
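For illustration, a minimal sketch of the pattern this relies on (the ndo
callbacks are made up and this is not part of the patch): with
phydev->mac_managed_pm set, the MAC driver's open/close paths can lean on
phylib alone, since phy_start() resumes the PHY and phy_stop() normally
leaves it suspended via the PHY state machine, so explicit
phy_resume()/phy_suspend() calls next to them add nothing.

  #include <linux/netdevice.h>
  #include <linux/phy.h>

  /* Hypothetical ndo callbacks, shown only to illustrate the point above. */
  static int example_ndo_open(struct net_device *dev)
  {
  	/* Resumes the PHY and starts the phylib state machine. */
  	phy_start(dev->phydev);
  	return 0;
  }

  static int example_ndo_stop(struct net_device *dev)
  {
  	/* Halts the state machine; phylib suspends the PHY for us. */
  	phy_stop(dev->phydev);
  	return 0;
  }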
Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller") Signed-off-by: Justin Chen Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 34e5156762a8..72ea97c5d5d4 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -1044,10 +1044,6 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) /* Indicate that the MAC is responsible for PHY PM */ phydev->mac_managed_pm = true; - } else if (!intf->wolopts) { - ret = phy_resume(dev->phydev); - if (ret) - goto err_phy_disable; } umac_reset(intf); @@ -1334,7 +1330,6 @@ int bcmasp_interface_suspend(struct bcmasp_intf *intf) { struct device *kdev = &intf->parent->pdev->dev; struct net_device *dev = intf->ndev; - int ret = 0; if (!netif_running(dev)) return 0; @@ -1344,10 +1339,6 @@ int bcmasp_interface_suspend(struct bcmasp_intf *intf) bcmasp_netif_deinit(dev); if (!intf->wolopts) { - ret = phy_suspend(dev->phydev); - if (ret) - goto out; - if (intf->internal_phy) bcmasp_ephy_enable_set(intf, false); else @@ -1364,11 +1355,7 @@ int bcmasp_interface_suspend(struct bcmasp_intf *intf) clk_disable_unprepare(intf->parent->clk); - return ret; - -out: - bcmasp_netif_init(dev, false); - return ret; + return 0; } static void bcmasp_resume_from_wol(struct bcmasp_intf *intf) From e4a58989f5c839316ac63675e8800b9eed7dbe96 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Tue, 26 Mar 2024 12:28:05 +0530 Subject: [PATCH 494/496] net: lan743x: Add set RFE read fifo threshold for PCI1x1x chips PCI11x1x Rev B0 devices might drop packets when receiving back to back frames at 2.5G link speed. Change the B0 Rev device's Receive filtering Engine FIFO threshold parameter from its hardware default of 4 to 3 dwords to prevent the problem. Rev C0 and later hardware already defaults to 3 dwords. 
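A small, hedged illustration of the register update described above (the
helper name is made up; the field macros are the ones added by the patch,
and the real driver additionally gates this on the B0 chip revision): the
threshold occupies bits 6:4 of MISC_CTL_0, so programming 3 dwords is a
read-modify-write that leaves 0x3 << 4 in that field.

  #include <linux/bitfield.h>
  #include <linux/bits.h>
  #include <linux/types.h>

  #define MISC_CTL_0_RFE_READ_FIFO_MASK_	GENMASK(6, 4)	/* bits 6:4 */
  #define RFE_RD_FIFO_TH_3_DWORDS		0x3

  /* Return the new MISC_CTL_0 value with the RFE read FIFO threshold set
   * to 3 dwords; e.g. an input of 0x0 becomes 0x30.
   */
  static u32 rfe_rd_fifo_threshold_update(u32 misc_ctl)
  {
  	misc_ctl &= ~MISC_CTL_0_RFE_READ_FIFO_MASK_;
  	misc_ctl |= FIELD_PREP(MISC_CTL_0_RFE_READ_FIFO_MASK_,
  			       RFE_RD_FIFO_TH_3_DWORDS);
  	return misc_ctl;
  }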
Fixes: bb4f6bffe33c ("net: lan743x: Add PCI11010 / PCI11414 device IDs")
Signed-off-by: Raju Lakkaraju
Reviewed-by: Simon Horman
Link: https://lore.kernel.org/r/20240326065805.686128-1-Raju.Lakkaraju@microchip.com
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/microchip/lan743x_main.c | 18 ++++++++++++++++++
 drivers/net/ethernet/microchip/lan743x_main.h |  4 ++++
 2 files changed, 22 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index bd8aa83b47e5..75a988c0bd79 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -25,6 +25,8 @@
 #define PCS_POWER_STATE_DOWN	0x6
 #define PCS_POWER_STATE_UP	0x4
 
+#define RFE_RD_FIFO_TH_3_DWORDS	0x3
+
 static void pci11x1x_strap_get_status(struct lan743x_adapter *adapter)
 {
 	u32 chip_rev;
@@ -3272,6 +3274,21 @@ static void lan743x_full_cleanup(struct lan743x_adapter *adapter)
 	lan743x_pci_cleanup(adapter);
 }
 
+static void pci11x1x_set_rfe_rd_fifo_threshold(struct lan743x_adapter *adapter)
+{
+	u16 rev = adapter->csr.id_rev & ID_REV_CHIP_REV_MASK_;
+
+	if (rev == ID_REV_CHIP_REV_PCI11X1X_B0_) {
+		u32 misc_ctl;
+
+		misc_ctl = lan743x_csr_read(adapter, MISC_CTL_0);
+		misc_ctl &= ~MISC_CTL_0_RFE_READ_FIFO_MASK_;
+		misc_ctl |= FIELD_PREP(MISC_CTL_0_RFE_READ_FIFO_MASK_,
+				       RFE_RD_FIFO_TH_3_DWORDS);
+		lan743x_csr_write(adapter, MISC_CTL_0, misc_ctl);
+	}
+}
+
 static int lan743x_hardware_init(struct lan743x_adapter *adapter,
 				 struct pci_dev *pdev)
 {
@@ -3287,6 +3304,7 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter,
 		pci11x1x_strap_get_status(adapter);
 		spin_lock_init(&adapter->eth_syslock_spinlock);
 		mutex_init(&adapter->sgmii_rw_lock);
+		pci11x1x_set_rfe_rd_fifo_threshold(adapter);
 	} else {
 		adapter->max_tx_channels = LAN743X_MAX_TX_CHANNELS;
 		adapter->used_tx_channels = LAN743X_USED_TX_CHANNELS;
diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h
index be79cb0ae5af..645bc048e52e 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.h
+++ b/drivers/net/ethernet/microchip/lan743x_main.h
@@ -26,6 +26,7 @@
 #define ID_REV_CHIP_REV_MASK_		(0x0000FFFF)
 #define ID_REV_CHIP_REV_A0_		(0x00000000)
 #define ID_REV_CHIP_REV_B0_		(0x00000010)
+#define ID_REV_CHIP_REV_PCI11X1X_B0_	(0x000000B0)
 
 #define FPGA_REV			(0x04)
 #define FPGA_REV_GET_MINOR_(fpga_rev)	(((fpga_rev) >> 8) & 0x000000FF)
@@ -311,6 +312,9 @@
 #define SGMII_CTL_LINK_STATUS_SOURCE_	BIT(8)
 #define SGMII_CTL_SGMII_POWER_DN_	BIT(1)
 
+#define MISC_CTL_0			(0x920)
+#define MISC_CTL_0_RFE_READ_FIFO_MASK_	GENMASK(6, 4)
+
 /* Vendor Specific SGMII MMD details */
 #define SR_VSMMD_PCS_ID1		0x0004
 #define SR_VSMMD_PCS_ID2		0x0005

From 40d4b4807cadd83fb3f46cc8cd67a945b5b25461 Mon Sep 17 00:00:00 2001
From: Hariprasad Kelam
Date: Tue, 26 Mar 2024 10:57:20 +0530
Subject: [PATCH 495/496] Octeontx2-af: fix pause frame configuration in GMP mode

The Octeontx2 MAC block (CGX) has separate data paths (SMU and GMP) for
different speeds, allowing for efficient data transfer.

The previous patch, which added pause frame configuration, has a bug that
keeps the pause frame feature from working in GMP mode. This patch fixes
the issue by configuring the appropriate registers.
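A hedged sketch of the idea behind the fix (not the actual CGX code: the
CGXX_*/CGX_* register and bit names and the cgx_read()/cgx_write()
accessors are the driver's own, while the helpers below are made up): the
receive back-pressure enable bit has to be toggled in the RX frame-control
register of each data path, so the GMP register needs the same treatment
the SMU register already gets.

  /* Apply the rx_pause setting to one data path's RX frame-control register. */
  static void cgx_rx_pause_cfg_one(struct cgx *cgx, int lmac_id,
  				 u64 frm_ctl_reg, u64 ctl_bck_bit,
  				 bool rx_pause)
  {
  	u64 cfg = cgx_read(cgx, lmac_id, frm_ctl_reg);

  	cfg &= ~ctl_bck_bit;
  	if (rx_pause)
  		cfg |= ctl_bck_bit;
  	cgx_write(cgx, lmac_id, frm_ctl_reg, cfg);
  }

  static void cgx_rx_pause_cfg(struct cgx *cgx, int lmac_id, bool rx_pause)
  {
  	/* GMP path (lower speeds) ... */
  	cgx_rx_pause_cfg_one(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL,
  			     CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK, rx_pause);
  	/* ... and SMU path (higher speeds). */
  	cgx_rx_pause_cfg_one(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL,
  			     CGX_SMUX_RX_FRM_CTL_CTL_BCK, rx_pause);
  }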
Fixes: f7e086e754fe ("octeontx2-af: Pause frame configuration at cgx")
Signed-off-by: Hariprasad Kelam
Reviewed-by: Simon Horman
Link: https://lore.kernel.org/r/20240326052720.4441-1-hkelam@marvell.com
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
index 3c0f55b3e48e..b86f3224f0b7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
@@ -808,6 +808,11 @@ static int cgx_lmac_enadis_pause_frm(void *cgxd, int lmac_id,
 	if (!is_lmac_valid(cgx, lmac_id))
 		return -ENODEV;
 
+	cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL);
+	cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK;
+	cfg |= rx_pause ? CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK : 0x0;
+	cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg);
+
 	cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL);
 	cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK;
 	cfg |= rx_pause ? CGX_SMUX_RX_FRM_CTL_CTL_BCK : 0x0;

From 18685451fc4e546fc0e718580d32df3c0e5c8272 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Tue, 26 Mar 2024 11:18:41 +0100
Subject: [PATCH 496/496] inet: inet_defrag: prevent sk release while still in use

ip_local_out() and other functions can pass skb->sk as a function
argument.

If the skb is a fragment and reassembly happens before such a function
call returns, the sk must not be released.

This affects skb fragments reassembled via netfilter or similar modules,
e.g. openvswitch or act_ct.c, when run as part of the tx pipeline.

Eric Dumazet made an initial analysis of this bug. Quoting Eric:

  Calling ip_defrag() in output path is also implying skb_orphan(),
  which is buggy because output path relies on sk not disappearing.

  A relevant old patch about the issue was:
  8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()")

  [..]

  net/ipv4/ip_output.c depends on skb->sk being set, and probably to an
  inet socket, not an arbitrary one.

  If we orphan the packet in ipvlan, then downstream things like FQ
  packet scheduler will not work properly.

  We need to change ip_defrag() to only use skb_orphan() when really
  needed, ie whenever frag_list is going to be used.

Eric suggested stashing the sk in the fragment queue and made an initial
patch. However, there is a problem with this: if the skb is refragmented
again right after, ip_do_fragment() will copy head->sk to the new
fragments and set up their destructor to sock_wfree. IOW, we have no
choice but to fix up the sk_wmem accounting to reflect the fully
reassembled skb, else wmem will underflow.

This change moves the orphan down into the core, to the last possible
moment. As ip_defrag_offset is aliased with the sk_buff->sk member, we
must move the offset into the FRAG_CB, else skb->sk gets clobbered.

This allows delaying the orphaning long enough to learn whether the skb
has to be queued or whether it completes the reasm queue.

In the former case, things work as before: the skb is orphaned. This is
safe because the skb gets queued/stolen and won't continue past the
reasm engine.

In the latter case, we steal the skb->sk reference, reattach it to the
head skb, and fix up the wmem accounting when inet_frag inflates
truesize.
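A condensed, hedged restatement of the accounting rule this enforces (the
helper name is made up; the real logic lives in inet_frag_reasm_prepare()
and inet_frag_reasm_finish() in the diff below): an skb charged to a
socket on tx holds skb->truesize in sk->sk_wmem_alloc and gives it back
through its destructor, so when reassembly inflates the head skb's
truesize, the charge must grow by the same delta or the later
sock_wfree() underflows the counter.

  #include <linux/skbuff.h>
  #include <net/sock.h>

  static void reasm_reattach_sk(struct sock *sk, struct sk_buff *head,
  			      unsigned int orig_truesize)
  {
  	int ts_delta = head->truesize - orig_truesize;

  	/* The reassembled head now owns the tx charge of the original skb. */
  	head->sk = sk;
  	head->destructor = sock_wfree;	/* subtracts head->truesize on free */
  	refcount_add(ts_delta, &sk->sk_wmem_alloc);
  }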
Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Diagnosed-by: Eric Dumazet Reported-by: xingwei lee Reported-by: yue sun Reported-by: syzbot+e5167d7144a62715044c@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240326101845.30836-1-fw@strlen.de Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 7 +-- net/ipv4/inet_fragment.c | 70 ++++++++++++++++++++----- net/ipv4/ip_fragment.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0c7c67b3a87b..9d24aec064e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -753,8 +753,6 @@ typedef unsigned char *sk_buff_data_t; * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by - * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in - * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here @@ -875,10 +873,7 @@ struct sk_buff { struct llist_node ll_node; }; - union { - struct sock *sk; - int ip_defrag_offset; - }; + struct sock *sk; union { ktime_t tstamp; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 7072fc0783ef..c88c9034d630 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -24,6 +24,8 @@ #include #include +#include "../core/sock_destructor.h" + /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. @@ -39,6 +41,7 @@ struct ipfrag_skb_cb { }; struct sk_buff *next_frag; int frag_run_len; + int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) @@ -396,12 +399,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, */ if (!last) fragrun_create(q, skb); /* First fragment. */ - else if (last->ip_defrag_offset + last->len < end) { + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. 
*/ - if (offset < last->ip_defrag_offset + last->len) + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; - if (offset == last->ip_defrag_offset + last->len) + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); @@ -418,13 +421,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, parent = *rbn; curr = rb_to_skb(parent); - curr_run_end = curr->ip_defrag_offset + + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; - if (end <= curr->ip_defrag_offset) + if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; - else if (offset >= curr->ip_defrag_offset && + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else @@ -438,7 +441,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, rb_insert_color(&skb->rbnode, &q->rb_fragments); } - skb->ip_defrag_offset = offset; + FRAG_CB(skb)->ip_defrag_offset = offset; return IPFRAG_OK; } @@ -448,13 +451,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); - struct sk_buff **nextp; + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; int delta; + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. + */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } + if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - return NULL; + if (!fp) { + head = skb; + goto out_restore_sk; + } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; @@ -463,6 +481,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; + } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, @@ -470,13 +494,13 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, consume_skb(head); head = skb; } - WARN_ON(head->ip_defrag_offset != 0); + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) - return NULL; + goto out_restore_sk; delta += head->truesize; if (delta) @@ -492,7 +516,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) - return NULL; + goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) @@ -509,6 +533,21 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, nextp = &skb_shinfo(head)->frag_list; } +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. 
+ * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } + return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); @@ -516,6 +555,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; @@ -579,6 +620,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, head->prev = NULL; head->tstamp = q->stamp; head->mono_delivery_time = q->mono_delivery_time; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a4941f53b523..fb947d1613fe 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -384,6 +384,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -487,7 +488,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 1a51a44571c3..d0dcbaca1994 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -294,6 +294,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -469,7 +470,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) {