From 93e3f45a26310e3f3f8558be40df411e23ab742c Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:39 +0530 Subject: [PATCH 001/181] powerpc: Fix __WARN_FLAGS() for use with Objtool Commit 1e688dd2a3d675 ("powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto") updated __WARN_FLAGS() to use asm goto, and added a call to 'unreachable()' after the asm goto for optimal code generation. With CONFIG_OBJTOOL enabled, 'annotate_unreachable()' statement in 'unreachable()' tries to note down the location of the subsequent instruction in a separate elf section to aid code flow analysis. However, on powerpc, this results in gcc emitting a call to a symbol of size 0. This results in objtool complaining of "unannotated intra-function call" since the target symbol is not a valid function call destination. Objtool wants this annotation for code flow analysis, which we are not yet enabling on powerpc. As such, expand the call to 'unreachable()' in __WARN_FLAGS() without annotate_unreachable(): barrier_before_unreachable(); __builtin_unreachable(); This still results in optimal code generation for __WARN_FLAGS(), while getting rid of the objtool warning. We still need barrier_before_unreachable() to work around gcc bugs 82365 and 106751: - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82365 - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106751 Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-2-sv@linux.ibm.com --- arch/powerpc/include/asm/bug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 61a4736355c2..ef42adb44aa3 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -99,7 +99,8 @@ __label__ __label_warn_on; \ \ WARN_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags), __label_warn_on); \ - unreachable(); \ + barrier_before_unreachable(); \ + __builtin_unreachable(); \ \ __label_warn_on: \ break; \ From 01f2cf0b990e58ae89142f57c7e02d33621311d2 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:40 +0530 Subject: [PATCH 002/181] powerpc: Override __ALIGN and __ALIGN_STR macros In a subsequent patch, we would want to annotate powerpc assembly functions with SYM_FUNC_START_LOCAL macro. This macro depends on __ALIGN macro. The default expansion of __ALIGN macro is: #define __ALIGN .align 4,0x90 So, override __ALIGN and __ALIGN_STR macros to use the same alignment as that of the existing _GLOBAL macro. Also, do not pad with 0x90, because repeated 0x90s are not a nop or trap on powerpc. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-3-sv@linux.ibm.com --- arch/powerpc/include/asm/linkage.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/linkage.h b/arch/powerpc/include/asm/linkage.h index b71b9582e754..b88d1d2cf304 100644 --- a/arch/powerpc/include/asm/linkage.h +++ b/arch/powerpc/include/asm/linkage.h @@ -4,6 +4,9 @@ #include <asm/types.h> +#define __ALIGN .align 2 +#define __ALIGN_STR ".align 2" + #ifdef CONFIG_PPC64_ELF_ABI_V1 #define cond_syscall(x) \ asm ("\t.weak " #x "\n\t.set " #x ", sys_ni_syscall\n" \ From 29a011fc79e625b2b02f25262657f7c4c59ae9f7 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:41 +0530 Subject: [PATCH 003/181] powerpc: Fix objtool unannotated intra-function call warnings Objtool throws unannotated intra-function call warnings in the following assembly files: arch/powerpc/kernel/vector.o: warning: objtool: .text+0x53c: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x60: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x124: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x5d4: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x5dc: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0xcb8: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0xd0c: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0x1030: unannotated intra-function call arch/powerpc/kernel/head_64.o: warning: objtool: .text+0x358: unannotated intra-function call arch/powerpc/kernel/head_64.o: warning: objtool: .text+0x728: unannotated intra-function call arch/powerpc/kernel/head_64.o: warning: objtool: .text+0x4d94: unannotated intra-function call arch/powerpc/kernel/head_64.o: warning: objtool: .text+0x4ec4: unannotated intra-function call arch/powerpc/kvm/book3s_hv_interrupts.o: warning: objtool: .text+0x6c: unannotated intra-function call arch/powerpc/kernel/misc_64.o: warning: objtool: .text+0x64: unannotated intra-function call Objtool does not add STT_NOTYPE symbols with size 0 to the rbtree, which is why find_call_destination() function is not able to find the destination symbol for 'bl' instruction. For such symbols, objtool is throwing unannotated intra-function call warnings in assembly files. Fix these warnings by annotating those symbols with SYM_FUNC_START_LOCAL and SYM_FUNC_END macros, inorder to set symbol type to STT_FUNC and symbol size accordingly. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-4-sv@linux.ibm.com --- arch/powerpc/kernel/exceptions-64s.S | 4 +++- arch/powerpc/kernel/head_64.S | 7 +++++-- arch/powerpc/kernel/misc_64.S | 4 +++- arch/powerpc/kernel/vector.S | 4 +++- arch/powerpc/kvm/book3s_hv_interrupts.S | 4 +++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 22 +++++++++++++++------- 6 files changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..77201ad9f329 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -13,6 +13,7 @@ * */ +#include <linux/linkage.h> #include <asm/hw_irq.h> #include <asm/exception-64s.h> #include <asm/ptrace.h> @@ -3112,7 +3113,7 @@ _GLOBAL(enable_machine_check) blr /* MSR[RI] should be clear because this uses SRR[01] */ -disable_machine_check: +SYM_FUNC_START_LOCAL(disable_machine_check) mflr r0 bcl 20,31,$+4 0: mflr r3 @@ -3125,3 +3126,4 @@ disable_machine_check: RFI_TO_KERNEL 1: mtlr r0 blr +SYM_FUNC_END(disable_machine_check) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index dedcc6fe2263..874efd25cc45 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -18,6 +18,7 @@ * variants. */ +#include <linux/linkage.h> #include <linux/threads.h> #include <linux/init.h> #include <asm/reg.h> @@ -462,7 +463,7 @@ generic_secondary_common_init: * Assumes we're mapped EA == RA if the MMU is on. */ #ifdef CONFIG_PPC_BOOK3S -__mmu_off: +SYM_FUNC_START_LOCAL(__mmu_off) mfmsr r3 andi. r0,r3,MSR_IR|MSR_DR beqlr @@ -473,6 +474,7 @@ __mmu_off: sync rfid b . /* prevent speculative execution */ +SYM_FUNC_END(__mmu_off) #endif @@ -869,7 +871,7 @@ _GLOBAL(start_secondary_resume) /* * This subroutine clobbers r11 and r12 */ -enable_64b_mode: +SYM_FUNC_START_LOCAL(enable_64b_mode) mfmsr r11 /* grab the current MSR */ #ifdef CONFIG_PPC_BOOK3E_64 oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ @@ -881,6 +883,7 @@ enable_64b_mode: isync #endif blr +SYM_FUNC_END(enable_64b_mode) /* * This puts the TOC pointer into r2, offset by 0x8000 (as expected diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 36184cada00b..c61a7ba446a8 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -9,6 +9,7 @@ * PPC64 updates by Dave Engebretsen (engebret@us.ibm.com) */ +#include <linux/linkage.h> #include <linux/sys.h> #include <asm/unistd.h> #include <asm/errno.h> @@ -353,7 +354,7 @@ _GLOBAL(kexec_smp_wait) * * don't overwrite r3 here, it is live for kexec_wait above. */ -real_mode: /* assume normal blr return */ +SYM_FUNC_START_LOCAL(real_mode) /* assume normal blr return */ #ifdef CONFIG_PPC_BOOK3E_64 /* Create an identity mapping. */ b kexec_create_tlb @@ -370,6 +371,7 @@ real_mode: /* assume normal blr return */ mtspr SPRN_SRR0,r11 rfid #endif +SYM_FUNC_END(real_mode) /* * kexec_sequence(newstack, start, image, control, clear_all(), diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 5cf64740edb8..ffe5d90abe17 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> #include <asm/processor.h> #include <asm/ppc_asm.h> #include <asm/reg.h> @@ -185,7 +186,7 @@ fphalf: * Internal routine to enable floating point and set FPSCR to 0. * Don't call it from C; it doesn't use the normal calling convention. */ -fpenable: +SYM_FUNC_START_LOCAL(fpenable) #ifdef CONFIG_PPC32 stwu r1,-64(r1) #else @@ -202,6 +203,7 @@ fpenable: mffs fr31 MTFSF_L(fr1) blr +SYM_FUNC_END(fpenable) fpdisable: mtlr r12 diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 59d89e4b154a..c0deeea7eef3 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -9,6 +9,7 @@ * Authors: Alexander Graf <agraf@suse.de> */ +#include <linux/linkage.h> #include <asm/ppc_asm.h> #include <asm/kvm_asm.h> #include <asm/reg.h> @@ -107,7 +108,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* * void kvmhv_save_host_pmu(void) */ -kvmhv_save_host_pmu: +SYM_FUNC_START_LOCAL(kvmhv_save_host_pmu) BEGIN_FTR_SECTION /* Work around P8 PMAE bug */ li r3, -1 @@ -154,3 +155,4 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stw r8, HSTATE_PMC5(r13) stw r9, HSTATE_PMC6(r13) 31: blr +SYM_FUNC_END(kvmhv_save_host_pmu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 37f50861dd98..a69d36cbf43b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -10,6 +10,7 @@ * Authors: Alexander Graf <agraf@suse.de> */ +#include <linux/linkage.h> #include <asm/ppc_asm.h> #include <asm/code-patching-asm.h> #include <asm/kvm_asm.h> @@ -2358,7 +2359,7 @@ hmi_realmode: * This routine calls kvmppc_read_intr, a C function, if an external * interrupt is pending. */ -kvmppc_check_wake_reason: +SYM_FUNC_START_LOCAL(kvmppc_check_wake_reason) mfspr r6, SPRN_SRR1 BEGIN_FTR_SECTION rlwinm r6, r6, 45-31, 0xf /* extract wake reason field (P8) */ @@ -2427,6 +2428,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addi r1, r1, PPC_MIN_STKFRM mtlr r0 blr +SYM_FUNC_END(kvmppc_check_wake_reason) /* * Save away FP, VMX and VSX registers. @@ -2434,7 +2436,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * N.B. r30 and r31 are volatile across this function, * thus it is not callable from C. */ -kvmppc_save_fp: +SYM_FUNC_START_LOCAL(kvmppc_save_fp) mflr r30 mr r31,r3 mfmsr r5 @@ -2462,6 +2464,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) stw r6,VCPU_VRSAVE(r31) mtlr r30 blr +SYM_FUNC_END(kvmppc_save_fp) /* * Load up FP, VMX and VSX registers @@ -2469,7 +2472,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * N.B. r30 and r31 are volatile across this function, * thus it is not callable from C. */ -kvmppc_load_fp: +SYM_FUNC_START_LOCAL(kvmppc_load_fp) mflr r30 mr r31,r4 mfmsr r9 @@ -2498,6 +2501,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) mtlr r30 mr r4,r31 blr +SYM_FUNC_END(kvmppc_load_fp) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* @@ -2746,7 +2750,7 @@ kvmppc_bad_host_intr: * r9 has a vcpu pointer (in) * r0 is used as a scratch register */ -kvmppc_msr_interrupt: +SYM_FUNC_START_LOCAL(kvmppc_msr_interrupt) rldicl r0, r11, 64 - MSR_TS_S_LG, 62 cmpwi r0, 2 /* Check if we are in transactional state.. */ ld r11, VCPU_INTR_MSR(r9) @@ -2755,13 +2759,14 @@ kvmppc_msr_interrupt: li r0, 1 1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG blr +SYM_FUNC_END(kvmppc_msr_interrupt) /* * void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu) * * Load up guest PMU state. R3 points to the vcpu struct. */ -kvmhv_load_guest_pmu: +SYM_FUNC_START_LOCAL(kvmhv_load_guest_pmu) mr r4, r3 mflr r0 li r3, 1 @@ -2811,13 +2816,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync mtlr r0 blr +SYM_FUNC_END(kvmhv_load_guest_pmu) /* * void kvmhv_load_host_pmu(void) * * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu. */ -kvmhv_load_host_pmu: +SYM_FUNC_START_LOCAL(kvmhv_load_host_pmu) mflr r0 lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ cmpwi r4, 0 @@ -2859,6 +2865,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync mtlr r0 23: blr +SYM_FUNC_END(kvmhv_load_host_pmu) /* * void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use) @@ -2866,7 +2873,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * Save guest PMU state into the vcpu struct. * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA) */ -kvmhv_save_guest_pmu: +SYM_FUNC_START_LOCAL(kvmhv_save_guest_pmu) mr r9, r3 mr r8, r4 BEGIN_FTR_SECTION @@ -2942,6 +2949,7 @@ BEGIN_FTR_SECTION mtspr SPRN_MMCRS, r4 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 22: blr +SYM_FUNC_END(kvmhv_save_guest_pmu) /* * This works around a hardware bug on POWER8E processors, where From 8d0c21b50655bfe136a76cf384495ba1f9c87224 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:42 +0530 Subject: [PATCH 004/181] powerpc: Curb objtool unannotated intra-function call warnings objtool throws the following unannotated intra-function call warnings: arch/powerpc/kernel/entry_64.o: warning: objtool: .text+0x4: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0xe64: unannotated intra-function call arch/powerpc/kvm/book3s_hv_rmhandlers.o: warning: objtool: .text+0xee4: unannotated intra-function call Fix these warnings by annotating intra-function calls, using ANNOTATE_INTRA_FUNCTION_CALL macro, to indicate that the branch targets are valid. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-5-sv@linux.ibm.com --- arch/powerpc/kernel/entry_64.S | 2 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +++ 2 files changed, 5 insertions(+) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 3e2e37e6ecab..1bf1121e17f1 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -14,6 +14,7 @@ * code, and exception/interrupt return code for PowerPC. */ +#include <linux/objtool.h> #include <linux/errno.h> #include <linux/err.h> #include <asm/cache.h> @@ -73,6 +74,7 @@ flush_branch_caches: // Flush the link stack .rept 64 + ANNOTATE_INTRA_FUNCTION_CALL bl .+4 .endr b 1f diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a69d36cbf43b..96b65b530156 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -11,6 +11,7 @@ */ #include <linux/linkage.h> +#include <linux/objtool.h> #include <asm/ppc_asm.h> #include <asm/code-patching-asm.h> #include <asm/kvm_asm.h> @@ -1523,12 +1524,14 @@ kvm_flush_link_stack: /* Flush the link stack. On Power8 it's up to 32 entries in size. */ .rept 32 + ANNOTATE_INTRA_FUNCTION_CALL bl .+4 .endr /* And on Power9 it's up to 64. */ BEGIN_FTR_SECTION .rept 32 + ANNOTATE_INTRA_FUNCTION_CALL bl .+4 .endr END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) From 1c137323e9a2a970b4a5bf8cf3c50e0ea1cefbeb Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:43 +0530 Subject: [PATCH 005/181] crypto: vmx: Skip objtool from running on aesp8-ppc.o With objtool enabled, below warnings are seen when trying to build: drivers/crypto/vmx/aesp8-ppc.o: warning: objtool: aes_p8_set_encrypt_key+0x44: unannotated intra-function call drivers/crypto/vmx/aesp8-ppc.o: warning: objtool: .text+0x2448: unannotated intra-function call drivers/crypto/vmx/aesp8-ppc.o: warning: objtool: .text+0x2d68: unannotated intra-function call Skip objtool from running on drivers/crypto/vmx/aesp8-ppc.o file for the following reasons: - Since this file comes from OpenSSL, and since it is a perl file which generates a .S file, it may not be the best choice to make too many code changes to such files, unless absolutely necessary. - As far as the objtool --mcount functionality is concerned, we do not have to run objtool on this file because there are no calls to _mcount(). Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-6-sv@linux.ibm.com --- drivers/crypto/vmx/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/crypto/vmx/Makefile b/drivers/crypto/vmx/Makefile index 2560cfea1dec..7b41f0da6807 100644 --- a/drivers/crypto/vmx/Makefile +++ b/drivers/crypto/vmx/Makefile @@ -9,3 +9,5 @@ targets += aesp8-ppc.S ghashp8-ppc.S $(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE $(call if_changed,perl) + +OBJECT_FILES_NON_STANDARD_aesp8-ppc.o := y From 2da37761671b5bdedbe04e6469cfa57cd6b6ae45 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Mon, 14 Nov 2022 23:27:44 +0530 Subject: [PATCH 006/181] powerpc/32: Fix objtool unannotated intra-function call warnings Fix several annotations in assembly files on PPC32. [Sathvika Vasireddy: Changed subject line and removed Kconfig change to enable objtool, as it is a part of "objtool/powerpc: Enable objtool to be built on ppc" patch in this series.] Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-7-sv@linux.ibm.com --- arch/powerpc/kernel/cpu_setup_6xx.S | 26 ++++++++++++------ arch/powerpc/kernel/cpu_setup_e500.S | 8 ++++-- arch/powerpc/kernel/entry_32.S | 9 ++++-- arch/powerpc/kernel/head_40x.S | 5 +++- arch/powerpc/kernel/head_85xx.S | 5 +++- arch/powerpc/kernel/head_8xx.S | 5 +++- arch/powerpc/kernel/head_book3s_32.S | 29 ++++++++++++++------ arch/powerpc/kernel/swsusp_32.S | 5 +++- arch/powerpc/kvm/fpu.S | 17 ++++++++---- arch/powerpc/platforms/52xx/lite5200_sleep.S | 15 +++++++--- 10 files changed, 89 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index f8b5ff64b604..f29ce3dd6140 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -4,6 +4,8 @@ * Copyright (C) 2003 Benjamin Herrenschmidt (benh@kernel.crashing.org) */ +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/cputable.h> @@ -81,7 +83,7 @@ _GLOBAL(__setup_cpu_745x) blr /* Enable caches for 603's, 604, 750 & 7400 */ -setup_common_caches: +SYM_FUNC_START_LOCAL(setup_common_caches) mfspr r11,SPRN_HID0 andi. r0,r11,HID0_DCE ori r11,r11,HID0_ICE|HID0_DCE @@ -95,11 +97,12 @@ setup_common_caches: sync isync blr +SYM_FUNC_END(setup_common_caches) /* 604, 604e, 604ev, ... * Enable superscalar execution & branch history table */ -setup_604_hid0: +SYM_FUNC_START_LOCAL(setup_604_hid0) mfspr r11,SPRN_HID0 ori r11,r11,HID0_SIED|HID0_BHTE ori r8,r11,HID0_BTCD @@ -110,6 +113,7 @@ setup_604_hid0: sync isync blr +SYM_FUNC_END(setup_604_hid0) /* 7400 <= rev 2.7 and 7410 rev = 1.0 suffer from some * erratas we work around here. @@ -125,13 +129,14 @@ setup_604_hid0: * needed once we have applied workaround #5 (though it's * not set by Apple's firmware at least). */ -setup_7400_workarounds: +SYM_FUNC_START_LOCAL(setup_7400_workarounds) mfpvr r3 rlwinm r3,r3,0,20,31 cmpwi 0,r3,0x0207 ble 1f blr -setup_7410_workarounds: +SYM_FUNC_END(setup_7400_workarounds) +SYM_FUNC_START_LOCAL(setup_7410_workarounds) mfpvr r3 rlwinm r3,r3,0,20,31 cmpwi 0,r3,0x0100 @@ -151,6 +156,7 @@ setup_7410_workarounds: sync isync blr +SYM_FUNC_END(setup_7410_workarounds) /* 740/750/7400/7410 * Enable Store Gathering (SGE), Address Broadcast (ABE), @@ -158,7 +164,7 @@ setup_7410_workarounds: * Dynamic Power Management (DPM), Speculative (SPD) * Clear Instruction cache throttling (ICTC) */ -setup_750_7400_hid0: +SYM_FUNC_START_LOCAL(setup_750_7400_hid0) mfspr r11,SPRN_HID0 ori r11,r11,HID0_SGE | HID0_ABE | HID0_BHTE | HID0_BTIC oris r11,r11,HID0_DPM@h @@ -177,12 +183,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) sync isync blr +SYM_FUNC_END(setup_750_7400_hid0) /* 750cx specific * Looks like we have to disable NAP feature for some PLL settings... * (waiting for confirmation) */ -setup_750cx: +SYM_FUNC_START_LOCAL(setup_750cx) mfspr r10, SPRN_HID1 rlwinm r10,r10,4,28,31 cmpwi cr0,r10,7 @@ -196,11 +203,13 @@ setup_750cx: andc r6,r6,r7 stw r6,CPU_SPEC_FEATURES(r4) blr +SYM_FUNC_END(setup_750cx) /* 750fx specific */ -setup_750fx: +SYM_FUNC_START_LOCAL(setup_750fx) blr +SYM_FUNC_END(setup_750fx) /* MPC 745x * Enable Store Gathering (SGE), Branch Folding (FOLD) @@ -212,7 +221,7 @@ setup_750fx: * Clear Instruction cache throttling (ICTC) * Enable L2 HW prefetch */ -setup_745x_specifics: +SYM_FUNC_START_LOCAL(setup_745x_specifics) /* We check for the presence of an L3 cache setup by * the firmware. If any, we disable NAP capability as * it's known to be bogus on rev 2.1 and earlier @@ -270,6 +279,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_NO_DPM) sync isync blr +SYM_FUNC_END(setup_745x_specifics) /* * Initialize the FPU registers. This is needed to work around an errata diff --git a/arch/powerpc/kernel/cpu_setup_e500.S b/arch/powerpc/kernel/cpu_setup_e500.S index 2ab25161b0ad..077cfccc3461 100644 --- a/arch/powerpc/kernel/cpu_setup_e500.S +++ b/arch/powerpc/kernel/cpu_setup_e500.S @@ -8,6 +8,8 @@ * Benjamin Herrenschmidt <benh@kernel.crashing.org> */ +#include <linux/linkage.h> + #include <asm/page.h> #include <asm/processor.h> #include <asm/cputable.h> @@ -274,7 +276,7 @@ _GLOBAL(flush_dcache_L1) blr -has_L2_cache: +SYM_FUNC_START_LOCAL(has_L2_cache) /* skip L2 cache on P2040/P2040E as they have no L2 cache */ mfspr r3, SPRN_SVR /* shift right by 8 bits and clear E bit of SVR */ @@ -290,9 +292,10 @@ has_L2_cache: 1: li r3, 0 blr +SYM_FUNC_END(has_L2_cache) /* flush backside L2 cache */ -flush_backside_L2_cache: +SYM_FUNC_START_LOCAL(flush_backside_L2_cache) mflr r10 bl has_L2_cache mtlr r10 @@ -313,6 +316,7 @@ flush_backside_L2_cache: bne 1b 2: blr +SYM_FUNC_END(flush_backside_L2_cache) _GLOBAL(cpu_down_flush_e500v2) mflr r0 diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3fc7c9886bb7..5e0763be1549 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -18,6 +18,8 @@ #include <linux/err.h> #include <linux/sys.h> #include <linux/threads.h> +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -74,17 +76,18 @@ _ASM_NOKPROBE_SYMBOL(prepare_transfer_to_handler) #endif /* CONFIG_PPC_BOOK3S_32 || CONFIG_PPC_E500 */ #if defined(CONFIG_PPC_KUEP) && defined(CONFIG_PPC_BOOK3S_32) - .globl __kuep_lock -__kuep_lock: +SYM_FUNC_START(__kuep_lock) lwz r9, THREAD+THSR0(r2) update_user_segments_by_4 r9, r10, r11, r12 blr +SYM_FUNC_END(__kuep_lock) -__kuep_unlock: +SYM_FUNC_START_LOCAL(__kuep_unlock) lwz r9, THREAD+THSR0(r2) rlwinm r9,r9,0,~SR_NX update_user_segments_by_4 r9, r10, r11, r12 blr +SYM_FUNC_END(__kuep_unlock) .macro kuep_lock bl __kuep_lock diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 088f500896c7..9110fe9d6747 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -28,6 +28,8 @@ #include <linux/init.h> #include <linux/pgtable.h> #include <linux/sizes.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -662,7 +664,7 @@ start_here: * kernel initialization. This maps the first 32 MBytes of memory 1:1 * virtual to physical and more importantly sets the cache mode. */ -initial_mmu: +SYM_FUNC_START_LOCAL(initial_mmu) tlbia /* Invalidate all TLB entries */ isync @@ -711,6 +713,7 @@ initial_mmu: mtspr SPRN_EVPR,r0 blr +SYM_FUNC_END(initial_mmu) _GLOBAL(abort) mfspr r13,SPRN_DBCR0 diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 52c0ab416326..6be3cc36b716 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -29,6 +29,8 @@ #include <linux/init.h> #include <linux/threads.h> #include <linux/pgtable.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -885,7 +887,7 @@ KernelSPE: * Translate the effec addr in r3 to phys addr. The phys addr will be put * into r3(higher 32bit) and r4(lower 32bit) */ -get_phys_addr: +SYM_FUNC_START_LOCAL(get_phys_addr) mfmsr r8 mfspr r9,SPRN_PID rlwinm r9,r9,16,0x3fff0000 /* turn PID into MAS6[SPID] */ @@ -907,6 +909,7 @@ get_phys_addr: mfspr r3,SPRN_MAS7 #endif blr +SYM_FUNC_END(get_phys_addr) /* * Global functions diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 0b05f2be66b9..c94ed5a08c93 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -18,6 +18,8 @@ #include <linux/magic.h> #include <linux/pgtable.h> #include <linux/sizes.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/mmu.h> @@ -625,7 +627,7 @@ start_here: * 24 Mbytes of data, and the 512k IMMR space. Anything not covered by * these mappings is mapped by page tables. */ -initial_mmu: +SYM_FUNC_START_LOCAL(initial_mmu) li r8, 0 mtspr SPRN_MI_CTR, r8 /* remove PINNED ITLB entries */ lis r10, MD_TWAM@h @@ -686,6 +688,7 @@ initial_mmu: #endif mtspr SPRN_DER, r8 blr +SYM_FUNC_END(initial_mmu) _GLOBAL(mmu_pin_tlb) lis r9, (1f - PAGE_OFFSET)@h diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 519b60695167..4af12447dc0b 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -18,6 +18,8 @@ #include <linux/init.h> #include <linux/pgtable.h> +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -877,7 +879,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_HPTE_TABLE) * Load stuff into the MMU. Intended to be called with * IR=0 and DR=0. */ -early_hash_table: +SYM_FUNC_START_LOCAL(early_hash_table) sync /* Force all PTE updates to finish */ isync tlbia /* Clear all TLB entries */ @@ -888,8 +890,9 @@ early_hash_table: ori r6, r6, 3 /* 256kB table */ mtspr SPRN_SDR1, r6 blr +SYM_FUNC_END(early_hash_table) -load_up_mmu: +SYM_FUNC_START_LOCAL(load_up_mmu) sync /* Force all PTE updates to finish */ isync tlbia /* Clear all TLB entries */ @@ -918,6 +921,7 @@ BEGIN_MMU_FTR_SECTION LOAD_BAT(7,r3,r4,r5) END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +SYM_FUNC_END(load_up_mmu) _GLOBAL(load_segment_registers) li r0, NUM_USER_SEGMENTS /* load up user segment register values */ @@ -1028,7 +1032,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_HPTE_TABLE) * this makes sure it's done. * -- Cort */ -clear_bats: +SYM_FUNC_START_LOCAL(clear_bats) li r10,0 mtspr SPRN_DBAT0U,r10 @@ -1072,6 +1076,7 @@ BEGIN_MMU_FTR_SECTION mtspr SPRN_IBAT7L,r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) blr +SYM_FUNC_END(clear_bats) _GLOBAL(update_bats) lis r4, 1f@h @@ -1108,15 +1113,16 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) mtspr SPRN_SRR1, r6 rfi -flush_tlbs: +SYM_FUNC_START_LOCAL(flush_tlbs) lis r10, 0x40 1: addic. r10, r10, -0x1000 tlbie r10 bgt 1b sync blr +SYM_FUNC_END(flush_tlbs) -mmu_off: +SYM_FUNC_START_LOCAL(mmu_off) addi r4, r3, __after_mmu_off - _start mfmsr r3 andi. r0,r3,MSR_DR|MSR_IR /* MMU enabled? */ @@ -1128,9 +1134,10 @@ mmu_off: mtspr SPRN_SRR1,r3 sync rfi +SYM_FUNC_END(mmu_off) /* We use one BAT to map up to 256M of RAM at _PAGE_OFFSET */ -initial_bats: +SYM_FUNC_START_LOCAL(initial_bats) lis r11,PAGE_OFFSET@h tophys(r8,r11) #ifdef CONFIG_SMP @@ -1146,9 +1153,10 @@ initial_bats: mtspr SPRN_IBAT0U,r11 isync blr +SYM_FUNC_END(initial_bats) #ifdef CONFIG_BOOTX_TEXT -setup_disp_bat: +SYM_FUNC_START_LOCAL(setup_disp_bat) /* * setup the display bat prepared for us in prom.c */ @@ -1164,10 +1172,11 @@ setup_disp_bat: mtspr SPRN_DBAT3L,r8 mtspr SPRN_DBAT3U,r11 blr +SYM_FUNC_END(setup_disp_bat) #endif /* CONFIG_BOOTX_TEXT */ #ifdef CONFIG_PPC_EARLY_DEBUG_CPM -setup_cpm_bat: +SYM_FUNC_START_LOCAL(setup_cpm_bat) lis r8, 0xf000 ori r8, r8, 0x002a mtspr SPRN_DBAT1L, r8 @@ -1177,10 +1186,11 @@ setup_cpm_bat: mtspr SPRN_DBAT1U, r11 blr +SYM_FUNC_END(setup_cpm_bat) #endif #ifdef CONFIG_PPC_EARLY_DEBUG_USBGECKO -setup_usbgecko_bat: +SYM_FUNC_START_LOCAL(setup_usbgecko_bat) /* prepare a BAT for early io */ #if defined(CONFIG_GAMECUBE) lis r8, 0x0c00 @@ -1199,6 +1209,7 @@ setup_usbgecko_bat: mtspr SPRN_DBAT1L, r8 mtspr SPRN_DBAT1U, r11 blr +SYM_FUNC_END(setup_usbgecko_bat) #endif .data diff --git a/arch/powerpc/kernel/swsusp_32.S b/arch/powerpc/kernel/swsusp_32.S index e0cbd63007f2..ffb79326483c 100644 --- a/arch/powerpc/kernel/swsusp_32.S +++ b/arch/powerpc/kernel/swsusp_32.S @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/threads.h> +#include <linux/linkage.h> + #include <asm/processor.h> #include <asm/page.h> #include <asm/cputable.h> @@ -400,7 +402,7 @@ _ASM_NOKPROBE_SYMBOL(swsusp_arch_resume) /* FIXME:This construct is actually not useful since we don't shut * down the instruction MMU, we could just flip back MSR-DR on. */ -turn_on_mmu: +SYM_FUNC_START_LOCAL(turn_on_mmu) mflr r4 mtsrr0 r4 mtsrr1 r3 @@ -408,4 +410,5 @@ turn_on_mmu: isync rfi _ASM_NOKPROBE_SYMBOL(turn_on_mmu) +SYM_FUNC_END(turn_on_mmu) diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S index 315c94946bad..b68e7f26a81f 100644 --- a/arch/powerpc/kvm/fpu.S +++ b/arch/powerpc/kvm/fpu.S @@ -6,6 +6,8 @@ */ #include <linux/pgtable.h> +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/page.h> #include <asm/mmu.h> @@ -110,18 +112,22 @@ FPS_THREE_IN(fsel) * R8 = (double*)¶m3 [load_three] * LR = instruction call function */ -fpd_load_three: +SYM_FUNC_START_LOCAL(fpd_load_three) lfd 2,0(r8) /* load param3 */ -fpd_load_two: +SYM_FUNC_START_LOCAL(fpd_load_two) lfd 1,0(r7) /* load param2 */ -fpd_load_one: +SYM_FUNC_START_LOCAL(fpd_load_one) lfd 0,0(r6) /* load param1 */ -fpd_load_none: +SYM_FUNC_START_LOCAL(fpd_load_none) lfd 3,0(r3) /* load up fpscr value */ MTFSF_L(3) lwz r6, 0(r4) /* load cr */ mtcr r6 blr +SYM_FUNC_END(fpd_load_none) +SYM_FUNC_END(fpd_load_one) +SYM_FUNC_END(fpd_load_two) +SYM_FUNC_END(fpd_load_three) /* * End of double instruction processing @@ -131,13 +137,14 @@ fpd_load_none: * R5 = (double*)&result * LR = caller of instruction call function */ -fpd_return: +SYM_FUNC_START_LOCAL(fpd_return) mfcr r6 stfd 0,0(r5) /* save result */ mffs 0 stfd 0,0(r3) /* save new fpscr value */ stw r6,0(r4) /* save new cr value */ blr +SYM_FUNC_END(fpd_return) /* * Double operation with no input operand diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S index afee8b1515a8..0b12647e7b42 100644 --- a/arch/powerpc/platforms/52xx/lite5200_sleep.S +++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> + #include <asm/reg.h> #include <asm/ppc_asm.h> #include <asm/processor.h> @@ -178,7 +180,8 @@ sram_code: /* local udelay in sram is needed */ - udelay: /* r11 - tb_ticks_per_usec, r12 - usecs, overwrites r13 */ +SYM_FUNC_START_LOCAL(udelay) + /* r11 - tb_ticks_per_usec, r12 - usecs, overwrites r13 */ mullw r12, r12, r11 mftb r13 /* start */ add r12, r13, r12 /* end */ @@ -187,6 +190,7 @@ sram_code: cmp cr0, r13, r12 blt 1b blr +SYM_FUNC_END(udelay) sram_code_end: @@ -271,7 +275,7 @@ _ASM_NOKPROBE_SYMBOL(lite5200_wakeup) SAVE_SR(n+2, addr+2); \ SAVE_SR(n+3, addr+3); -save_regs: +SYM_FUNC_START_LOCAL(save_regs) stw r0, 0(r4) stw r1, 0x4(r4) stw r2, 0x8(r4) @@ -317,6 +321,7 @@ save_regs: SAVE_SPRN(TBRU, 0x5b) blr +SYM_FUNC_END(save_regs) /* restore registers */ @@ -336,7 +341,7 @@ save_regs: LOAD_SR(n+2, addr+2); \ LOAD_SR(n+3, addr+3); -restore_regs: +SYM_FUNC_START_LOCAL(restore_regs) lis r4, registers@h ori r4, r4, registers@l @@ -393,6 +398,7 @@ restore_regs: blr _ASM_NOKPROBE_SYMBOL(restore_regs) +SYM_FUNC_END(restore_regs) @@ -403,7 +409,7 @@ _ASM_NOKPROBE_SYMBOL(restore_regs) * Flush data cache * Do this by just reading lots of stuff into the cache. */ -flush_data_cache: +SYM_FUNC_START_LOCAL(flush_data_cache) lis r3,CONFIG_KERNEL_START@h ori r3,r3,CONFIG_KERNEL_START@l li r4,NUM_CACHE_LINES @@ -413,3 +419,4 @@ flush_data_cache: addi r3,r3,L1_CACHE_BYTES /* Next line, please */ bdnz 1b blr +SYM_FUNC_END(flush_data_cache) From d0160bd5d389da247fb5affb6a35ea393d22fedb Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:45 +0530 Subject: [PATCH 007/181] powerpc/vdso: Skip objtool from running on VDSO files Do not run objtool on VDSO files, by using OBJECT_FILES_NON_STANDARD. Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu> Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-8-sv@linux.ibm.com --- arch/powerpc/kernel/vdso/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index a2e7b0ce5b19..6a977b0d8ffc 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -102,3 +102,5 @@ quiet_cmd_vdso64ld_and_check = VDSO64L $@ cmd_vdso64ld_and_check = $(VDSOCC) $(c_flags) $(CC64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -z noexecstack ; $(cmd_vdso_check) quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(VDSOCC) $(a_flags) $(CC64FLAGS) $(AS64FLAGS) -c -o $@ $< + +OBJECT_FILES_NON_STANDARD := y From efb11fdb3e1a9f694fa12b70b21e69e55ec59c36 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Mon, 14 Nov 2022 23:27:46 +0530 Subject: [PATCH 008/181] objtool: Fix SEGFAULT find_insn() will return NULL in case of failure. Check insn in order to avoid a kernel Oops for NULL pointer dereference. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-9-sv@linux.ibm.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 43ec14c29a60..8427af808221 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -207,7 +207,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, return false; insn = find_insn(file, func->sec, func->offset); - if (!insn->func) + if (!insn || !insn->func) return false; func_for_each_insn(file, func, insn) { From 0646c28b417b7fe307c9da72ca1c508e43b57dc0 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Mon, 14 Nov 2022 23:27:47 +0530 Subject: [PATCH 009/181] objtool: Use target file endianness instead of a compiled constant Some architectures like powerpc support both endianness, it's therefore not possible to fix the endianness via arch/endianness.h because there is no easy way to get the target endianness at build time. Use the endianness recorded in the file objtool is working on. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-10-sv@linux.ibm.com --- .../arch/x86/include/arch/endianness.h | 9 ------ tools/objtool/check.c | 2 +- tools/objtool/include/objtool/endianness.h | 32 +++++++++---------- tools/objtool/orc_dump.c | 11 +++++-- tools/objtool/orc_gen.c | 4 +-- tools/objtool/special.c | 3 +- 6 files changed, 30 insertions(+), 31 deletions(-) delete mode 100644 tools/objtool/arch/x86/include/arch/endianness.h diff --git a/tools/objtool/arch/x86/include/arch/endianness.h b/tools/objtool/arch/x86/include/arch/endianness.h deleted file mode 100644 index 7c362527da20..000000000000 --- a/tools/objtool/arch/x86/include/arch/endianness.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ARCH_ENDIANNESS_H -#define _ARCH_ENDIANNESS_H - -#include <endian.h> - -#define __TARGET_BYTE_ORDER __LITTLE_ENDIAN - -#endif /* _ARCH_ENDIANNESS_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 8427af808221..ad5dab175701 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2100,7 +2100,7 @@ static int read_unwind_hints(struct objtool_file *file) return -1; } - cfi.cfa.offset = bswap_if_needed(hint->sp_offset); + cfi.cfa.offset = bswap_if_needed(file->elf, hint->sp_offset); cfi.type = hint->type; cfi.end = hint->end; diff --git a/tools/objtool/include/objtool/endianness.h b/tools/objtool/include/objtool/endianness.h index 10241341eff3..4d2aa9b0fe2f 100644 --- a/tools/objtool/include/objtool/endianness.h +++ b/tools/objtool/include/objtool/endianness.h @@ -2,33 +2,33 @@ #ifndef _OBJTOOL_ENDIANNESS_H #define _OBJTOOL_ENDIANNESS_H -#include <arch/endianness.h> #include <linux/kernel.h> #include <endian.h> - -#ifndef __TARGET_BYTE_ORDER -#error undefined arch __TARGET_BYTE_ORDER -#endif - -#if __BYTE_ORDER != __TARGET_BYTE_ORDER -#define __NEED_BSWAP 1 -#else -#define __NEED_BSWAP 0 -#endif +#include <objtool/elf.h> /* - * Does a byte swap if target endianness doesn't match the host, i.e. cross + * Does a byte swap if target file endianness doesn't match the host, i.e. cross * compilation for little endian on big endian and vice versa. * To be used for multi-byte values conversion, which are read from / about * to be written to a target native endianness ELF file. */ -#define bswap_if_needed(val) \ +static inline bool need_bswap(struct elf *elf) +{ + return (__BYTE_ORDER == __LITTLE_ENDIAN) ^ + (elf->ehdr.e_ident[EI_DATA] == ELFDATA2LSB); +} + +#define bswap_if_needed(elf, val) \ ({ \ __typeof__(val) __ret; \ + bool __need_bswap = need_bswap(elf); \ switch (sizeof(val)) { \ - case 8: __ret = __NEED_BSWAP ? bswap_64(val) : (val); break; \ - case 4: __ret = __NEED_BSWAP ? bswap_32(val) : (val); break; \ - case 2: __ret = __NEED_BSWAP ? bswap_16(val) : (val); break; \ + case 8: \ + __ret = __need_bswap ? bswap_64(val) : (val); break; \ + case 4: \ + __ret = __need_bswap ? bswap_32(val) : (val); break; \ + case 2: \ + __ret = __need_bswap ? bswap_16(val) : (val); break; \ default: \ BUILD_BUG(); break; \ } \ diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index f5a8508c42d6..4f1211fec82c 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -76,6 +76,7 @@ int orc_dump(const char *_objname) GElf_Rela rela; GElf_Sym sym; Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL; + struct elf dummy_elf = {}; objname = _objname; @@ -94,6 +95,12 @@ int orc_dump(const char *_objname) return -1; } + if (!elf64_getehdr(elf)) { + WARN_ELF("elf64_getehdr"); + return -1; + } + memcpy(&dummy_elf.ehdr, elf64_getehdr(elf), sizeof(dummy_elf.ehdr)); + if (elf_getshdrnum(elf, &nr_sections)) { WARN_ELF("elf_getshdrnum"); return -1; @@ -198,11 +205,11 @@ int orc_dump(const char *_objname) printf(" sp:"); - print_reg(orc[i].sp_reg, bswap_if_needed(orc[i].sp_offset)); + print_reg(orc[i].sp_reg, bswap_if_needed(&dummy_elf, orc[i].sp_offset)); printf(" bp:"); - print_reg(orc[i].bp_reg, bswap_if_needed(orc[i].bp_offset)); + print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset)); printf(" type:%s end:%d\n", orc_type_name(orc[i].type), orc[i].end); diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index dd3c64af9db2..1f22b7ebae58 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -97,8 +97,8 @@ static int write_orc_entry(struct elf *elf, struct section *orc_sec, /* populate ORC data */ orc = (struct orc_entry *)orc_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); - orc->sp_offset = bswap_if_needed(orc->sp_offset); - orc->bp_offset = bswap_if_needed(orc->bp_offset); + orc->sp_offset = bswap_if_needed(elf, orc->sp_offset); + orc->bp_offset = bswap_if_needed(elf, orc->bp_offset); /* populate reloc for ip */ if (elf_add_reloc_to_insn(elf, ip_sec, idx * sizeof(int), R_X86_64_PC32, diff --git a/tools/objtool/special.c b/tools/objtool/special.c index e2223dd91c37..9c8d827f69af 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -87,7 +87,8 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, if (entry->feature) { unsigned short feature; - feature = bswap_if_needed(*(unsigned short *)(sec->data->d_buf + + feature = bswap_if_needed(elf, + *(unsigned short *)(sec->data->d_buf + offset + entry->feature)); arch_handle_alternative(feature, alt); From 86ea7f361537f825a699e86fdc9e49be19f128d1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Mon, 14 Nov 2022 23:27:48 +0530 Subject: [PATCH 010/181] objtool: Use target file class size instead of a compiled constant In order to allow using objtool on cross-built kernels, determine size of long from elf data instead of using sizeof(long) at build time. For the time being this covers only mcount. [Sathvika Vasireddy: Rename variable "size" to "addrsize" and function "elf_class_size()" to "elf_class_addrsize()", and modify create_mcount_loc_sections() function to follow reverse christmas tree format to order local variable declarations.] Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-11-sv@linux.ibm.com --- tools/objtool/check.c | 18 ++++++++++-------- tools/objtool/elf.c | 8 ++++++-- tools/objtool/include/objtool/elf.h | 8 ++++++++ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ad5dab175701..b64518c7c7b4 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -852,9 +852,9 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) static int create_mcount_loc_sections(struct objtool_file *file) { - struct section *sec; - unsigned long *loc; + int addrsize = elf_class_addrsize(file->elf); struct instruction *insn; + struct section *sec; int idx; sec = find_section_by_name(file->elf, "__mcount_loc"); @@ -871,23 +871,25 @@ static int create_mcount_loc_sections(struct objtool_file *file) list_for_each_entry(insn, &file->mcount_loc_list, call_node) idx++; - sec = elf_create_section(file->elf, "__mcount_loc", 0, sizeof(unsigned long), idx); + sec = elf_create_section(file->elf, "__mcount_loc", 0, addrsize, idx); if (!sec) return -1; + sec->sh.sh_addralign = addrsize; + idx = 0; list_for_each_entry(insn, &file->mcount_loc_list, call_node) { + void *loc; - loc = (unsigned long *)sec->data->d_buf + idx; - memset(loc, 0, sizeof(unsigned long)); + loc = sec->data->d_buf + idx; + memset(loc, 0, addrsize); - if (elf_add_reloc_to_insn(file->elf, sec, - idx * sizeof(unsigned long), + if (elf_add_reloc_to_insn(file->elf, sec, idx, R_X86_64_64, insn->sec, insn->offset)) return -1; - idx++; + idx += addrsize; } return 0; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 7e24b09b1163..33739865735b 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -1129,6 +1129,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec { char *relocname; struct section *sec; + int addrsize = elf_class_addrsize(elf); relocname = malloc(strlen(base->name) + strlen(".rela") + 1); if (!relocname) { @@ -1138,7 +1139,10 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec strcpy(relocname, ".rela"); strcat(relocname, base->name); - sec = elf_create_section(elf, relocname, 0, sizeof(GElf_Rela), 0); + if (addrsize == sizeof(u32)) + sec = elf_create_section(elf, relocname, 0, sizeof(Elf32_Rela), 0); + else + sec = elf_create_section(elf, relocname, 0, sizeof(GElf_Rela), 0); free(relocname); if (!sec) return NULL; @@ -1147,7 +1151,7 @@ static struct section *elf_create_rela_reloc_section(struct elf *elf, struct sec sec->base = base; sec->sh.sh_type = SHT_RELA; - sec->sh.sh_addralign = 8; + sec->sh.sh_addralign = addrsize; sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; sec->sh.sh_info = base->idx; sec->sh.sh_flags = SHF_INFO_LINK; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 16f4067b82ae..78b3aa2e546d 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -142,6 +142,14 @@ static inline bool has_multiple_files(struct elf *elf) return elf->num_files > 1; } +static inline int elf_class_addrsize(struct elf *elf) +{ + if (elf->ehdr.e_ident[EI_CLASS] == ELFCLASS32) + return sizeof(u32); + else + return sizeof(u64); +} + struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, unsigned int sh_flags, size_t entsize, int nr); From 280981d6994e0700abd36647b141e73059851e66 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:49 +0530 Subject: [PATCH 011/181] objtool: Add --mnop as an option to --mcount Some architectures (powerpc) may not support ftrace locations being nop'ed out at build time. Introduce CONFIG_HAVE_OBJTOOL_NOP_MCOUNT for objtool, as a means for architectures to enable nop'ing of ftrace locations. Add --mnop as an option to objtool --mcount, to indicate support for the same. Also, make sure that --mnop can be passed as an option to objtool only when --mcount is passed. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-12-sv@linux.ibm.com --- Makefile | 4 +++- arch/x86/Kconfig | 1 + kernel/trace/Kconfig | 7 +++++++ scripts/Makefile.lib | 3 +++ tools/objtool/builtin-check.c | 14 ++++++++++++++ tools/objtool/check.c | 19 ++++++++++--------- tools/objtool/include/objtool/builtin.h | 1 + 7 files changed, 39 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index d148a55bfd0f..53c2b715d0bf 100644 --- a/Makefile +++ b/Makefile @@ -933,7 +933,9 @@ ifdef CONFIG_FTRACE_MCOUNT_USE_CC endif endif ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL - CC_FLAGS_USING += -DCC_USING_NOP_MCOUNT + ifdef CONFIG_HAVE_OBJTOOL_NOP_MCOUNT + CC_FLAGS_USING += -DCC_USING_NOP_MCOUNT + endif endif ifdef CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT ifdef CONFIG_HAVE_C_RECORDMCOUNT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67745ceab0db..4be7c06a5d18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -195,6 +195,7 @@ config X86 select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL + select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e9e95c790b8e..2b782321376a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -82,6 +82,13 @@ config HAVE_OBJTOOL_MCOUNT help Arch supports objtool --mcount +config HAVE_OBJTOOL_NOP_MCOUNT + bool + help + Arch supports the objtool options --mcount with --mnop. + An architecture can select this if it wants to enable nop'ing + of ftrace locations. + config HAVE_C_RECORDMCOUNT bool help diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3aa384cec76b..658f541c2782 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -256,6 +256,9 @@ objtool-args-$(CONFIG_HAVE_JUMP_LABEL_HACK) += --hacks=jump_label objtool-args-$(CONFIG_HAVE_NOINSTR_HACK) += --hacks=noinstr objtool-args-$(CONFIG_X86_KERNEL_IBT) += --ibt objtool-args-$(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL) += --mcount +ifdef CONFIG_FTRACE_MCOUNT_USE_OBJTOOL +objtool-args-$(CONFIG_HAVE_OBJTOOL_NOP_MCOUNT) += --mnop +endif objtool-args-$(CONFIG_UNWINDER_ORC) += --orc objtool-args-$(CONFIG_RETPOLINE) += --retpoline objtool-args-$(CONFIG_RETHUNK) += --rethunk diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 24fbe803a0d3..9bd347d3c244 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -82,6 +82,7 @@ const struct option check_options[] = { OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), + OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), @@ -150,6 +151,16 @@ static bool opts_valid(void) return false; } +static bool mnop_opts_valid(void) +{ + if (opts.mnop && !opts.mcount) { + ERROR("--mnop requires --mcount"); + return false; + } + + return true; +} + static bool link_opts_valid(struct objtool_file *file) { if (opts.link) @@ -198,6 +209,9 @@ int objtool_run(int argc, const char **argv) if (!file) return 1; + if (!mnop_opts_valid()) + return 1; + if (!link_opts_valid(file)) return 1; diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b64518c7c7b4..71cf4b4ba1da 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1256,18 +1256,19 @@ static void annotate_call_site(struct objtool_file *file, if (opts.mcount && sym->fentry) { if (sibling) WARN_FUNC("Tail call to __fentry__ !?!?", insn->sec, insn->offset); + if (opts.mnop) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } - if (reloc) { - reloc->type = R_NONE; - elf_write_reloc(file->elf, reloc); + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + + insn->type = INSN_NOP; } - elf_write_insn(file->elf, insn->sec, - insn->offset, insn->len, - arch_nop_insn(insn->len)); - - insn->type = INSN_NOP; - list_add_tail(&insn->call_node, &file->mcount_loc_list); return; } diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 42a52f1a0add..0785707c5a92 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -31,6 +31,7 @@ struct opts { bool backup; bool dryrun; bool link; + bool mnop; bool module; bool no_unreachable; bool sec_address; From de6fbcedf5abce4c321eeb15d7d286b79804b8b6 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:50 +0530 Subject: [PATCH 012/181] objtool: Read special sections with alts only when specific options are selected Call add_special_section_alts() only when stackval or orc or uaccess or noinstr options are passed to objtool. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-13-sv@linux.ibm.com --- tools/objtool/check.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 71cf4b4ba1da..752a6ffd5c4c 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2392,9 +2392,11 @@ static int decode_sections(struct objtool_file *file) * Must be before add_jump_destinations(), which depends on 'func' * being set for alternatives, to enable proper sibling call detection. */ - ret = add_special_section_alts(file); - if (ret) - return ret; + if (opts.stackval || opts.orc || opts.uaccess || opts.noinstr) { + ret = add_special_section_alts(file); + if (ret) + return ret; + } ret = add_jump_destinations(file); if (ret) From c1449735211dd8c4c2d54fa0ece6890ecbd74e24 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:51 +0530 Subject: [PATCH 013/181] objtool: Use macros to define arch specific reloc types Make relocation types architecture specific. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-14-sv@linux.ibm.com --- tools/objtool/arch/x86/include/arch/elf.h | 2 ++ tools/objtool/check.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/x86/include/arch/elf.h b/tools/objtool/arch/x86/include/arch/elf.h index 69cc4264b28a..ac14987cf687 100644 --- a/tools/objtool/arch/x86/include/arch/elf.h +++ b/tools/objtool/arch/x86/include/arch/elf.h @@ -2,5 +2,7 @@ #define _OBJTOOL_ARCH_ELF #define R_NONE R_X86_64_NONE +#define R_ABS64 R_X86_64_64 +#define R_ABS32 R_X86_64_32 #endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 752a6ffd5c4c..2d7153b5d5d1 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -885,7 +885,7 @@ static int create_mcount_loc_sections(struct objtool_file *file) memset(loc, 0, addrsize); if (elf_add_reloc_to_insn(file->elf, sec, idx, - R_X86_64_64, + addrsize == sizeof(u64) ? R_ABS64 : R_ABS32, insn->sec, insn->offset)) return -1; From 4ca993d498987332ceeedee5380101b84accaf35 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:52 +0530 Subject: [PATCH 014/181] objtool: Add arch specific function arch_ftrace_match() Add architecture specific function to look for relocation records pointing to architecture specific symbols. Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu> Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-15-sv@linux.ibm.com --- tools/objtool/arch/x86/decode.c | 5 +++++ tools/objtool/check.c | 2 +- tools/objtool/include/objtool/arch.h | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 1c253b4b7ce0..af7ad09c926c 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -23,6 +23,11 @@ #include <objtool/builtin.h> #include <arch/elf.h> +int arch_ftrace_match(char *name) +{ + return !strcmp(name, "__fentry__"); +} + static int is_x86_64(const struct elf *elf) { switch (elf->ehdr.e_machine) { diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2d7153b5d5d1..7580c66ca5c8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2316,7 +2316,7 @@ static int classify_symbols(struct objtool_file *file) if (arch_is_rethunk(func)) func->return_thunk = true; - if (!strcmp(func->name, "__fentry__")) + if (arch_ftrace_match(func->name)) func->fentry = true; if (is_profiling_func(func->name)) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index beb2f3aa94ff..5149330f400f 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -69,6 +69,8 @@ struct stack_op { struct instruction; +int arch_ftrace_match(char *name); + void arch_initial_func_cfi_state(struct cfi_init_state *state); int arch_decode_instruction(struct objtool_file *file, const struct section *sec, From e52ec98c5ab18c0710ea22bf52f45e60a725adaf Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:53 +0530 Subject: [PATCH 015/181] objtool/powerpc: Enable objtool to be built on ppc This patch adds [stub] implementations for required functions, inorder to enable objtool build on powerpc. [Christophe Leroy: powerpc: Add missing asm/asm.h for objtool, Use local variables for type and imm in arch_decode_instruction(), Adapt len for prefixed instructions.] Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-16-sv@linux.ibm.com --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/asm.h | 7 ++ tools/objtool/arch/powerpc/Build | 2 + tools/objtool/arch/powerpc/decode.c | 85 +++++++++++++++++++ .../arch/powerpc/include/arch/cfi_regs.h | 11 +++ tools/objtool/arch/powerpc/include/arch/elf.h | 8 ++ .../arch/powerpc/include/arch/special.h | 21 +++++ tools/objtool/arch/powerpc/special.c | 19 +++++ 8 files changed, 154 insertions(+) create mode 100644 arch/powerpc/include/asm/asm.h create mode 100644 tools/objtool/arch/powerpc/Build create mode 100644 tools/objtool/arch/powerpc/decode.c create mode 100644 tools/objtool/arch/powerpc/include/arch/cfi_regs.h create mode 100644 tools/objtool/arch/powerpc/include/arch/elf.h create mode 100644 tools/objtool/arch/powerpc/include/arch/special.h create mode 100644 tools/objtool/arch/powerpc/special.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 699df27b0e2f..12e6c16be54e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -238,6 +238,7 @@ config PPC select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_OPTPROBES + select HAVE_OBJTOOL if PPC32 || MPROFILE_KERNEL select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS diff --git a/arch/powerpc/include/asm/asm.h b/arch/powerpc/include/asm/asm.h new file mode 100644 index 000000000000..86f46b604e9a --- /dev/null +++ b/arch/powerpc/include/asm/asm.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_ASM_H +#define _ASM_POWERPC_ASM_H + +#define _ASM_PTR " .long " + +#endif /* _ASM_POWERPC_ASM_H */ diff --git a/tools/objtool/arch/powerpc/Build b/tools/objtool/arch/powerpc/Build new file mode 100644 index 000000000000..d24d5636a5b8 --- /dev/null +++ b/tools/objtool/arch/powerpc/Build @@ -0,0 +1,2 @@ +objtool-y += decode.o +objtool-y += special.o diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c new file mode 100644 index 000000000000..dcd0975cad6b --- /dev/null +++ b/tools/objtool/arch/powerpc/decode.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <stdio.h> +#include <stdlib.h> +#include <objtool/check.h> +#include <objtool/elf.h> +#include <objtool/arch.h> +#include <objtool/warn.h> +#include <objtool/builtin.h> +#include <objtool/endianness.h> + +unsigned long arch_dest_reloc_offset(int addend) +{ + return addend; +} + +bool arch_callee_saved_reg(unsigned char reg) +{ + return false; +} + +int arch_decode_hint_reg(u8 sp_reg, int *base) +{ + exit(-1); +} + +const char *arch_nop_insn(int len) +{ + exit(-1); +} + +const char *arch_ret_insn(int len) +{ + exit(-1); +} + +int arch_decode_instruction(struct objtool_file *file, const struct section *sec, + unsigned long offset, unsigned int maxlen, + unsigned int *len, enum insn_type *type, + unsigned long *immediate, + struct list_head *ops_list) +{ + unsigned int opcode; + enum insn_type typ; + unsigned long imm; + u32 insn; + + insn = bswap_if_needed(file->elf, *(u32 *)(sec->data->d_buf + offset)); + opcode = insn >> 26; + typ = INSN_OTHER; + imm = 0; + + if (opcode == 1) + *len = 8; + else + *len = 4; + + *type = typ; + *immediate = imm; + + return 0; +} + +unsigned long arch_jump_destination(struct instruction *insn) +{ + return insn->offset + insn->immediate; +} + +void arch_initial_func_cfi_state(struct cfi_init_state *state) +{ + int i; + + for (i = 0; i < CFI_NUM_REGS; i++) { + state->regs[i].base = CFI_UNDEFINED; + state->regs[i].offset = 0; + } + + /* initial CFA (call frame address) */ + state->cfa.base = CFI_SP; + state->cfa.offset = 0; + + /* initial LR (return address) */ + state->regs[CFI_RA].base = CFI_CFA; + state->regs[CFI_RA].offset = 0; +} diff --git a/tools/objtool/arch/powerpc/include/arch/cfi_regs.h b/tools/objtool/arch/powerpc/include/arch/cfi_regs.h new file mode 100644 index 000000000000..59638ebeafc8 --- /dev/null +++ b/tools/objtool/arch/powerpc/include/arch/cfi_regs.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_CFI_REGS_H +#define _OBJTOOL_CFI_REGS_H + +#define CFI_BP 1 +#define CFI_SP CFI_BP +#define CFI_RA 32 +#define CFI_NUM_REGS 33 + +#endif diff --git a/tools/objtool/arch/powerpc/include/arch/elf.h b/tools/objtool/arch/powerpc/include/arch/elf.h new file mode 100644 index 000000000000..3c8ebb7d2a6b --- /dev/null +++ b/tools/objtool/arch/powerpc/include/arch/elf.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NONE R_PPC_NONE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/arch/powerpc/include/arch/special.h b/tools/objtool/arch/powerpc/include/arch/special.h new file mode 100644 index 000000000000..ffef9ada7133 --- /dev/null +++ b/tools/objtool/arch/powerpc/include/arch/special.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _PPC_ARCH_SPECIAL_H +#define _PPC_ARCH_SPECIAL_H + +#define EX_ENTRY_SIZE 8 +#define EX_ORIG_OFFSET 0 +#define EX_NEW_OFFSET 4 + +#define JUMP_ENTRY_SIZE 16 +#define JUMP_ORIG_OFFSET 0 +#define JUMP_NEW_OFFSET 4 +#define JUMP_KEY_OFFSET 8 + +#define ALT_ENTRY_SIZE 12 +#define ALT_ORIG_OFFSET 0 +#define ALT_NEW_OFFSET 4 +#define ALT_FEATURE_OFFSET 8 +#define ALT_ORIG_LEN_OFFSET 10 +#define ALT_NEW_LEN_OFFSET 11 + +#endif /* _PPC_ARCH_SPECIAL_H */ diff --git a/tools/objtool/arch/powerpc/special.c b/tools/objtool/arch/powerpc/special.c new file mode 100644 index 000000000000..d33868147196 --- /dev/null +++ b/tools/objtool/arch/powerpc/special.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <string.h> +#include <stdlib.h> +#include <objtool/special.h> +#include <objtool/builtin.h> + + +bool arch_support_alt_relocation(struct special_alt *special_alt, + struct instruction *insn, + struct reloc *reloc) +{ + exit(-1); +} + +struct reloc *arch_find_switch_table(struct objtool_file *file, + struct instruction *insn) +{ + exit(-1); +} From c984aef8c8326035570ff6e01d0ff9e79a5dfa76 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy <sv@linux.ibm.com> Date: Mon, 14 Nov 2022 23:27:54 +0530 Subject: [PATCH 016/181] objtool/powerpc: Add --mcount specific implementation This patch enables objtool --mcount on powerpc, and adds implementation specific to powerpc. Tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Acked-by: Josh Poimboeuf <jpoimboe@kernel.org> Signed-off-by: Sathvika Vasireddy <sv@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114175754.1131267-17-sv@linux.ibm.com --- arch/powerpc/Kconfig | 1 + tools/objtool/arch/powerpc/decode.c | 16 ++++++++++++++++ tools/objtool/arch/powerpc/include/arch/elf.h | 2 ++ 3 files changed, 19 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 12e6c16be54e..9c07068ba5e5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -239,6 +239,7 @@ config PPC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) select HAVE_OPTPROBES select HAVE_OBJTOOL if PPC32 || MPROFILE_KERNEL + select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 select HAVE_PERF_REGS diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index dcd0975cad6b..01cade98b49e 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -9,6 +9,11 @@ #include <objtool/builtin.h> #include <objtool/endianness.h> +int arch_ftrace_match(char *name) +{ + return !strcmp(name, "_mcount"); +} + unsigned long arch_dest_reloc_offset(int addend) { return addend; @@ -50,6 +55,17 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec typ = INSN_OTHER; imm = 0; + switch (opcode) { + case 18: /* b[l][a] */ + if ((insn & 3) == 1) /* bl */ + typ = INSN_CALL; + + imm = insn & 0x3fffffc; + if (imm & 0x2000000) + imm -= 0x4000000; + break; + } + if (opcode == 1) *len = 8; else diff --git a/tools/objtool/arch/powerpc/include/arch/elf.h b/tools/objtool/arch/powerpc/include/arch/elf.h index 3c8ebb7d2a6b..73f9ae172fe5 100644 --- a/tools/objtool/arch/powerpc/include/arch/elf.h +++ b/tools/objtool/arch/powerpc/include/arch/elf.h @@ -4,5 +4,7 @@ #define _OBJTOOL_ARCH_ELF #define R_NONE R_PPC_NONE +#define R_ABS64 R_PPC64_ADDR64 +#define R_ABS32 R_PPC_ADDR32 #endif /* _OBJTOOL_ARCH_ELF */ From a39818a3fb2bf12ae945a7c5fba8c5d9048a0e96 Mon Sep 17 00:00:00 2001 From: Michael Ellerman <mpe@ellerman.id.au> Date: Wed, 23 Nov 2022 21:26:10 +1100 Subject: [PATCH 017/181] objtool/powerpc: Implement arch_pc_relative_reloc() Provide an implementation for arch_pc_relative_reloc(). It is needed to pass the build once 61c6065ef7ec ("objtool: Allow !PC relative relocations") is merged. Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> --- tools/objtool/arch/powerpc/decode.c | 9 +++++++++ tools/objtool/include/objtool/arch.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index 01cade98b49e..9c653805a08a 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -82,6 +82,15 @@ unsigned long arch_jump_destination(struct instruction *insn) return insn->offset + insn->immediate; } +bool arch_pc_relative_reloc(struct reloc *reloc) +{ + /* + * The powerpc build only allows certain relocation types, see + * relocs_check.sh, and none of those accepted are PC relative. + */ + return false; +} + void arch_initial_func_cfi_state(struct cfi_init_state *state) { int i; diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index 5149330f400f..4ecb480131c7 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -95,4 +95,6 @@ bool arch_is_rethunk(struct symbol *sym); int arch_rewrite_retpolines(struct objtool_file *file); +bool arch_pc_relative_reloc(struct reloc *reloc); + #endif /* _ARCH_H */ From 61119786de40f61b8843aa57217b678361763d67 Mon Sep 17 00:00:00 2001 From: XueBing Chen <chenxuebing@jari.cn> Date: Fri, 17 Jun 2022 23:50:19 +0800 Subject: [PATCH 018/181] KVM: PPC: Use __func__ to get function's name Prefer using '"%s...", __func__' to get current function's name in output messages. Signed-off-by: XueBing Chen <chenxuebing@jari.cn> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/13b2c857.beb.181725bad35.Coremail.chenxuebing@jari.cn --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index e9744b41a226..351ff0f89b00 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1202,7 +1202,7 @@ static int resize_hpt_allocate(struct kvm_resize_hpt *resize) if (rc < 0) return rc; - resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n", + resize_hpt_debug(resize, "%s(): HPT @ 0x%lx\n", __func__, resize->hpt.virt); return 0; @@ -1443,7 +1443,7 @@ static void resize_hpt_prepare_work(struct work_struct *work) */ mutex_unlock(&kvm->arch.mmu_setup_lock); - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", + resize_hpt_debug(resize, "%s(): order = %d\n", __func__, resize->order); err = resize_hpt_allocate(resize); @@ -1887,8 +1887,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf, ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, tmp); if (ret != H_SUCCESS) { - pr_err("kvm_htab_write ret %ld i=%ld v=%lx " - "r=%lx\n", ret, i, v, r); + pr_err("%s ret %ld i=%ld v=%lx r=%lx\n", __func__, ret, i, v, r); goto out; } if (!mmu_ready && is_vrma_hpte(v)) { From 392a58f1eaab0c90b80d7ba4a03dbf6eaaeabe60 Mon Sep 17 00:00:00 2001 From: Zhang Jiaming <jiaming@nfschina.com> Date: Thu, 23 Jun 2022 18:20:31 +0800 Subject: [PATCH 019/181] KVM: PPC: Book3S HV: XIVE: Fix spelling mistakes Change 'subsquent' to 'subsequent'. Change 'accross' to 'across'. Signed-off-by: Zhang Jiaming <jiaming@nfschina.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220623102031.15359-1-jiaming@nfschina.com --- arch/powerpc/kvm/book3s_xive.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 4ca23644f752..b4b680f2d853 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -539,7 +539,7 @@ static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) if (irq == XICS_IPI || irq == 0) { /* * This barrier orders the setting of xc->cppr vs. - * subsquent test of xc->mfrr done inside + * subsequent test of xc->mfrr done inside * scan_interrupts and push_pending_to_hw */ smp_mb(); @@ -563,7 +563,7 @@ static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) /* * This barrier orders both setting of in_eoi above vs, * subsequent test of guest_priority, and the setting - * of xc->cppr vs. subsquent test of xc->mfrr done inside + * of xc->cppr vs. subsequent test of xc->mfrr done inside * scan_interrupts and push_pending_to_hw */ smp_mb(); @@ -2392,7 +2392,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* * Now, we select a target if we have one. If we don't we * leave the interrupt untargetted. It means that an interrupt - * can become "untargetted" accross migration if it was masked + * can become "untargetted" across migration if it was masked * by set_xive() but there is little we can do about it. */ From 6fa1efeaa6671fb7339a6c62ceeec19e8e787963 Mon Sep 17 00:00:00 2001 From: Deming Wang <wangdeming@inspur.com> Date: Sun, 3 Jul 2022 13:29:32 -0400 Subject: [PATCH 020/181] KVM: PPC: Book3s: Use arg->size directly in kvm_vm_ioctl_create_spapr_tce() The size variable is just a copy of args->size, neither size nor args are modifed, so just use args->size directly. Signed-off-by: Deming Wang <wangdeming@inspur.com> [mpe: Reword change log] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220703172932.11329-1-wangdeming@inspur.com --- arch/powerpc/kvm/book3s_64_vio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 40864373ef87..95e738ef9062 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -294,14 +294,14 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; struct mm_struct *mm = kvm->mm; - unsigned long npages, size = args->size; + unsigned long npages; int ret; if (!args->size || args->page_shift < 12 || args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) return -EINVAL; - npages = kvmppc_tce_pages(size); + npages = kvmppc_tce_pages(args->size); ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -314,7 +314,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->liobn = args->liobn; stt->page_shift = args->page_shift; stt->offset = args->offset; - stt->size = size; + stt->size = args->size; stt->kvm = kvm; mutex_init(&stt->alloc_lock); INIT_LIST_HEAD_RCU(&stt->iommu_tables); From a96b20758b23be7e9f693218908228d6100c3c26 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Sat, 9 Jul 2022 17:56:43 +0200 Subject: [PATCH 021/181] KVM: PPC: Book3S HV: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/52e843a460bc374973149b8da0bd04f9761b80b7.1657382184.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/kvm/book3s_hv_uvmem.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index e2f11f9c3f2a..1d67baa5557a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -1190,8 +1190,7 @@ int kvmppc_uvmem_init(void) pfn_first = res->start >> PAGE_SHIFT; pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT); - kvmppc_uvmem_bitmap = kcalloc(BITS_TO_LONGS(pfn_last - pfn_first), - sizeof(unsigned long), GFP_KERNEL); + kvmppc_uvmem_bitmap = bitmap_zalloc(pfn_last - pfn_first, GFP_KERNEL); if (!kvmppc_uvmem_bitmap) { ret = -ENOMEM; goto out_unmap; @@ -1215,5 +1214,5 @@ void kvmppc_uvmem_free(void) memunmap_pages(&kvmppc_uvmem_pgmap); release_mem_region(kvmppc_uvmem_pgmap.range.start, range_len(&kvmppc_uvmem_pgmap.range)); - kfree(kvmppc_uvmem_bitmap); + bitmap_free(kvmppc_uvmem_bitmap); } From 8daa9c1dc9b4a3422801017ca46d935073dc14c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:23 +0100 Subject: [PATCH 022/181] macintosh/ams-i2c: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-290-uwe@kleine-koenig.org --- drivers/macintosh/ams/ams-i2c.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/macintosh/ams/ams-i2c.c b/drivers/macintosh/ams/ams-i2c.c index 3ded340699fb..a4a1035eb412 100644 --- a/drivers/macintosh/ams/ams-i2c.c +++ b/drivers/macintosh/ams/ams-i2c.c @@ -56,8 +56,7 @@ enum ams_i2c_cmd { AMS_CMD_START, }; -static int ams_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id); +static int ams_i2c_probe(struct i2c_client *client); static void ams_i2c_remove(struct i2c_client *client); static const struct i2c_device_id ams_id[] = { @@ -70,7 +69,7 @@ static struct i2c_driver ams_i2c_driver = { .driver = { .name = "ams", }, - .probe = ams_i2c_probe, + .probe_new = ams_i2c_probe, .remove = ams_i2c_remove, .id_table = ams_id, }; @@ -155,8 +154,7 @@ static void ams_i2c_get_xyz(s8 *x, s8 *y, s8 *z) *z = ams_i2c_read(AMS_DATAZ); } -static int ams_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ams_i2c_probe(struct i2c_client *client) { int vmaj, vmin; int result; From 0424113fed923a2fcb699b5f3aa335d16e092f3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:24 +0100 Subject: [PATCH 023/181] macintosh/therm_adt746x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-291-uwe@kleine-koenig.org --- drivers/macintosh/therm_adt746x.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c index b004ea2a1102..8f5db9093c9a 100644 --- a/drivers/macintosh/therm_adt746x.c +++ b/drivers/macintosh/therm_adt746x.c @@ -464,9 +464,9 @@ static void thermostat_remove_files(struct thermostat *th) } -static int probe_thermostat(struct i2c_client *client, - const struct i2c_device_id *id) +static int probe_thermostat(struct i2c_client *client) { + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct device_node *np = client->dev.of_node; struct thermostat* th; const __be32 *prop; @@ -598,7 +598,7 @@ static struct i2c_driver thermostat_driver = { .driver = { .name = "therm_adt746x", }, - .probe = probe_thermostat, + .probe_new = probe_thermostat, .remove = remove_thermostat, .id_table = therm_adt746x_id, }; From dc9be0735c3e245fe60775307cf7842b1f9b45a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:25 +0100 Subject: [PATCH 024/181] macintosh/therm_windtunnel: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-292-uwe@kleine-koenig.org --- drivers/macintosh/therm_windtunnel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c index b8228ca40454..22b15efcc025 100644 --- a/drivers/macintosh/therm_windtunnel.c +++ b/drivers/macintosh/therm_windtunnel.c @@ -411,8 +411,9 @@ static const struct i2c_device_id therm_windtunnel_id[] = { MODULE_DEVICE_TABLE(i2c, therm_windtunnel_id); static int -do_probe(struct i2c_client *cl, const struct i2c_device_id *id) +do_probe(struct i2c_client *cl) { + const struct i2c_device_id *id = i2c_client_get_device_id(cl); struct i2c_adapter *adapter = cl->adapter; int ret = 0; @@ -441,7 +442,7 @@ static struct i2c_driver g4fan_driver = { .driver = { .name = "therm_windtunnel", }, - .probe = do_probe, + .probe_new = do_probe, .remove = do_remove, .id_table = therm_windtunnel_id, }; From 9d533bdf4a582f037327f1a38ed8cf689d67cab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:26 +0100 Subject: [PATCH 025/181] macintosh/windfarm_ad7417_sensor: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-293-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_ad7417_sensor.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_ad7417_sensor.c b/drivers/macintosh/windfarm_ad7417_sensor.c index c5c54a4ce91f..33b4723d235e 100644 --- a/drivers/macintosh/windfarm_ad7417_sensor.c +++ b/drivers/macintosh/windfarm_ad7417_sensor.c @@ -229,8 +229,7 @@ static void wf_ad7417_init_chip(struct wf_ad7417_priv *pv) pv->config = config; } -static int wf_ad7417_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int wf_ad7417_probe(struct i2c_client *client) { struct wf_ad7417_priv *pv; const struct mpu_data *mpu; @@ -321,7 +320,7 @@ static struct i2c_driver wf_ad7417_driver = { .name = "wf_ad7417", .of_match_table = wf_ad7417_of_id, }, - .probe = wf_ad7417_probe, + .probe_new = wf_ad7417_probe, .remove = wf_ad7417_remove, .id_table = wf_ad7417_id, }; From 472e4c61d2bb4977ade8e2491953954bf9723563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:27 +0100 Subject: [PATCH 026/181] macintosh/windfarm_fcu_controls: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-294-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_fcu_controls.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_fcu_controls.c b/drivers/macintosh/windfarm_fcu_controls.c index c5b1ca5bcd73..e027d889d7e8 100644 --- a/drivers/macintosh/windfarm_fcu_controls.c +++ b/drivers/macintosh/windfarm_fcu_controls.c @@ -514,8 +514,7 @@ static int wf_fcu_init_chip(struct wf_fcu_priv *pv) return 0; } -static int wf_fcu_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int wf_fcu_probe(struct i2c_client *client) { struct wf_fcu_priv *pv; @@ -590,7 +589,7 @@ static struct i2c_driver wf_fcu_driver = { .name = "wf_fcu", .of_match_table = wf_fcu_of_id, }, - .probe = wf_fcu_probe, + .probe_new = wf_fcu_probe, .remove = wf_fcu_remove, .id_table = wf_fcu_id, }; From 51a9e1755cdd8b127191030d15b74b97f7d3ce75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:28 +0100 Subject: [PATCH 027/181] macintosh/windfarm_lm75_sensor: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit .probe_new() doesn't get the i2c_device_id * parameter, so determine that explicitly in the probe function. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-295-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_lm75_sensor.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/macintosh/windfarm_lm75_sensor.c b/drivers/macintosh/windfarm_lm75_sensor.c index 204661c8e918..24f0a444d312 100644 --- a/drivers/macintosh/windfarm_lm75_sensor.c +++ b/drivers/macintosh/windfarm_lm75_sensor.c @@ -87,9 +87,9 @@ static const struct wf_sensor_ops wf_lm75_ops = { .owner = THIS_MODULE, }; -static int wf_lm75_probe(struct i2c_client *client, - const struct i2c_device_id *id) -{ +static int wf_lm75_probe(struct i2c_client *client) +{ + const struct i2c_device_id *id = i2c_client_get_device_id(client); struct wf_lm75_sensor *lm; int rc, ds1775; const char *name, *loc; @@ -177,7 +177,7 @@ static struct i2c_driver wf_lm75_driver = { .name = "wf_lm75", .of_match_table = wf_lm75_of_id, }, - .probe = wf_lm75_probe, + .probe_new = wf_lm75_probe, .remove = wf_lm75_remove, .id_table = wf_lm75_id, }; From 0e2211b3373ea718d2161bcc360cd4d9a3bcebc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:29 +0100 Subject: [PATCH 028/181] macintosh/windfarm_lm87_sensor: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-296-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_lm87_sensor.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_lm87_sensor.c b/drivers/macintosh/windfarm_lm87_sensor.c index 40d25463346e..f37a32c2070c 100644 --- a/drivers/macintosh/windfarm_lm87_sensor.c +++ b/drivers/macintosh/windfarm_lm87_sensor.c @@ -95,8 +95,7 @@ static const struct wf_sensor_ops wf_lm87_ops = { .owner = THIS_MODULE, }; -static int wf_lm87_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int wf_lm87_probe(struct i2c_client *client) { struct wf_lm87_sensor *lm; const char *name = NULL, *loc; @@ -173,7 +172,7 @@ static struct i2c_driver wf_lm87_driver = { .name = "wf_lm87", .of_match_table = wf_lm87_of_id, }, - .probe = wf_lm87_probe, + .probe_new = wf_lm87_probe, .remove = wf_lm87_remove, .id_table = wf_lm87_id, }; From 2d7a9d780444c8f31ee6af522a92a99492d9eeb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:30 +0100 Subject: [PATCH 029/181] macintosh/windfarm_max6690_sensor: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-297-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_max6690_sensor.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_max6690_sensor.c b/drivers/macintosh/windfarm_max6690_sensor.c index c0d404ebc792..6c5ab657b6b3 100644 --- a/drivers/macintosh/windfarm_max6690_sensor.c +++ b/drivers/macintosh/windfarm_max6690_sensor.c @@ -60,8 +60,7 @@ static const struct wf_sensor_ops wf_max6690_ops = { .owner = THIS_MODULE, }; -static int wf_max6690_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int wf_max6690_probe(struct i2c_client *client) { const char *name, *loc; struct wf_6690_sensor *max; @@ -129,7 +128,7 @@ static struct i2c_driver wf_max6690_driver = { .name = "wf_max6690", .of_match_table = wf_max6690_of_id, }, - .probe = wf_max6690_probe, + .probe_new = wf_max6690_probe, .remove = wf_max6690_remove, .id_table = wf_max6690_id, }; From d05921a09a5a72805a1d669dce0fcbd66df86237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de> Date: Fri, 18 Nov 2022 23:40:31 +0100 Subject: [PATCH 030/181] macintosh/windfarm_smu_sat: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118224540.619276-298-uwe@kleine-koenig.org --- drivers/macintosh/windfarm_smu_sat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/windfarm_smu_sat.c b/drivers/macintosh/windfarm_smu_sat.c index be5d4593db93..ebc4256a9e4a 100644 --- a/drivers/macintosh/windfarm_smu_sat.c +++ b/drivers/macintosh/windfarm_smu_sat.c @@ -189,8 +189,7 @@ static const struct wf_sensor_ops wf_sat_ops = { .owner = THIS_MODULE, }; -static int wf_sat_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int wf_sat_probe(struct i2c_client *client) { struct device_node *dev = client->dev.of_node; struct wf_sat *sat; @@ -349,7 +348,7 @@ static struct i2c_driver wf_sat_driver = { .name = "wf_smu_sat", .of_match_table = wf_sat_of_id, }, - .probe = wf_sat_probe, + .probe_new = wf_sat_probe, .remove = wf_sat_remove, .id_table = wf_sat_id, }; From e0acfdd13474815696595206e11169736b4bca9d Mon Sep 17 00:00:00 2001 From: Haowen Bai <baihaowen@meizu.com> Date: Thu, 17 Mar 2022 10:32:39 +0800 Subject: [PATCH 031/181] macintosh/windfarm_pm81: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Signed-off-by: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1647484359-12402-1-git-send-email-baihaowen@meizu.com --- drivers/macintosh/windfarm_pm81.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c index e0f4743f21cc..257fb2c695c5 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -401,7 +401,7 @@ static void wf_smu_create_cpu_fans(void) /* First, locate the PID params in SMU SBD */ hdr = smu_get_sdb_partition(SMU_SDB_CPUPIDDATA_ID, NULL); - if (hdr == 0) { + if (!hdr) { printk(KERN_WARNING "windfarm: CPU PID fan config not found " "max fan speed\n"); goto fail; @@ -705,7 +705,7 @@ static int wf_init_pm(void) const struct smu_sdbp_header *hdr; hdr = smu_get_sdb_partition(SMU_SDB_SENSORTREE_ID, NULL); - if (hdr != 0) { + if (hdr) { struct smu_sdbp_sensortree *st = (struct smu_sdbp_sensortree *)&hdr[1]; wf_smu_mach_model = st->model_id; From 2f59562c140d3119328f869126e8e593a99a392f Mon Sep 17 00:00:00 2001 From: Haowen Bai <baihaowen@meizu.com> Date: Thu, 17 Mar 2022 10:35:54 +0800 Subject: [PATCH 032/181] macintosh/adb: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Signed-off-by: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1647484554-13258-1-git-send-email-baihaowen@meizu.com --- drivers/macintosh/adb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 1bbb9ca08d40..23bd0c77ac1a 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -478,7 +478,7 @@ adb_register(int default_id, int handler_id, struct adb_ids *ids, if ((adb_handler[i].original_address == default_id) && (!handler_id || (handler_id == adb_handler[i].handler_id) || try_handler_change(i, handler_id))) { - if (adb_handler[i].handler != 0) { + if (adb_handler[i].handler) { pr_err("Two handlers for ADB device %d\n", default_id); continue; @@ -673,7 +673,7 @@ static int adb_open(struct inode *inode, struct file *file) goto out; } state = kmalloc(sizeof(struct adbdev_state), GFP_KERNEL); - if (state == 0) { + if (!state) { ret = -ENOMEM; goto out; } From 88316944c3b3aa3ce3249c51689ef1621049df9d Mon Sep 17 00:00:00 2001 From: Haowen Bai <baihaowen@meizu.com> Date: Thu, 17 Mar 2022 10:42:33 +0800 Subject: [PATCH 033/181] macintosh/windfarm_pm91: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Signed-off-by: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1647484953-15249-1-git-send-email-baihaowen@meizu.com --- drivers/macintosh/windfarm_pm91.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/windfarm_pm91.c b/drivers/macintosh/windfarm_pm91.c index c8535855360d..120a9cfba0c5 100644 --- a/drivers/macintosh/windfarm_pm91.c +++ b/drivers/macintosh/windfarm_pm91.c @@ -150,7 +150,7 @@ static void wf_smu_create_cpu_fans(void) /* First, locate the PID params in SMU SBD */ hdr = smu_get_sdb_partition(SMU_SDB_CPUPIDDATA_ID, NULL); - if (hdr == 0) { + if (!hdr) { printk(KERN_WARNING "windfarm: CPU PID fan config not found " "max fan speed\n"); goto fail; From a823307bf0a3b79b27eea916bf6499ba4377cdf9 Mon Sep 17 00:00:00 2001 From: Haowen Bai <baihaowen@meizu.com> Date: Thu, 17 Mar 2022 10:57:02 +0800 Subject: [PATCH 034/181] macintosh/windfarm_pm121: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Signed-off-by: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1647485822-16717-1-git-send-email-baihaowen@meizu.com --- drivers/macintosh/windfarm_pm121.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c index 36312f163aac..82500417ebee 100644 --- a/drivers/macintosh/windfarm_pm121.c +++ b/drivers/macintosh/windfarm_pm121.c @@ -651,7 +651,7 @@ static void pm121_create_cpu_fans(void) /* First, locate the PID params in SMU SBD */ hdr = smu_get_sdb_partition(SMU_SDB_CPUPIDDATA_ID, NULL); - if (hdr == 0) { + if (!hdr) { printk(KERN_WARNING "pm121: CPU PID fan config not found.\n"); goto fail; } @@ -970,7 +970,7 @@ static int pm121_init_pm(void) const struct smu_sdbp_header *hdr; hdr = smu_get_sdb_partition(SMU_SDB_SENSORTREE_ID, NULL); - if (hdr != 0) { + if (hdr) { struct smu_sdbp_sensortree *st = (struct smu_sdbp_sensortree *)&hdr[1]; pm121_mach_model = st->model_id; From fc21ed8f26d980428f9b4e08e0fb72c7f7ffc9b8 Mon Sep 17 00:00:00 2001 From: Haowen Bai <baihaowen@meizu.com> Date: Thu, 17 Mar 2022 17:24:49 +0800 Subject: [PATCH 035/181] macintosh/macio-adb: Fix warning comparing pointer to 0 Avoid pointer type value compared with 0 to make code clear. Signed-off-by: Haowen Bai <baihaowen@meizu.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1647509089-4280-1-git-send-email-baihaowen@meizu.com --- drivers/macintosh/macio-adb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 9b63bd2551c6..3721402582b4 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -100,7 +100,7 @@ int macio_init(void) unsigned int irq; adbs = of_find_compatible_node(NULL, "adb", "chrp,adb0"); - if (adbs == 0) + if (!adbs) return -ENXIO; if (of_address_to_resource(adbs, 0, &r)) { @@ -183,7 +183,7 @@ static int macio_send_request(struct adb_request *req, int sync) req->reply_len = 0; spin_lock_irqsave(&macio_lock, flags); - if (current_req != 0) { + if (current_req) { last_req->next = req; last_req = req; } else { @@ -213,7 +213,8 @@ static irqreturn_t macio_adb_interrupt(int irq, void *arg) spin_lock(&macio_lock); if (in_8(&adb->intr.r) & TAG) { handled = 1; - if ((req = current_req) != 0) { + req = current_req; + if (req) { /* put the current request in */ for (i = 0; i < req->nbytes; ++i) out_8(&adb->data[i].r, req->data[i]); From 27f9690a81d7acf185b78be8d03d4b3a243116b1 Mon Sep 17 00:00:00 2001 From: Finn Thain <fthain@linux-m68k.org> Date: Mon, 21 Mar 2022 20:28:00 +1100 Subject: [PATCH 036/181] macintosh/via-pmu: Avoid compiler warnings when CONFIG_PROC_FS is disabled drivers/macintosh/via-pmu.c:897:12: warning: 'pmu_battery_proc_show' defined but not used [-Wunused-function] static int pmu_battery_proc_show(struct seq_file *m, void *v) ^~~~~~~~~~~~~~~~~~~~~ drivers/macintosh/via-pmu.c:871:12: warning: 'pmu_irqstats_proc_show' defined but not used [-Wunused-function] static int pmu_irqstats_proc_show(struct seq_file *m, void *v) ^~~~~~~~~~~~~~~~~~~~~~ drivers/macintosh/via-pmu.c:860:12: warning: 'pmu_info_proc_show' defined but not used [-Wunused-function] static int pmu_info_proc_show(struct seq_file *m, void *v) ^~~~~~~~~~~~~~~~~~ Add some #ifdefs to avoid unused code warnings when CONFIG_PROC_FS is disabled. Reported-by: Randy Dunlap <rdunlap@infradead.org> Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Finn Thain <fthain@linux-m68k.org> Tested-by: Randy Dunlap <rdunlap@infradead.org> Acked-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/0c11c0770fc4ec7e80a4b2e0ffce1055b792cfdb.1647854880.git.fthain@linux-m68k.org --- drivers/macintosh/via-pmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 49657962d892..e0cb8daf4f08 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -203,9 +203,11 @@ static int init_pmu(void); static void pmu_start(void); static irqreturn_t via_pmu_interrupt(int irq, void *arg); static irqreturn_t gpio1_interrupt(int irq, void *arg); +#ifdef CONFIG_PROC_FS static int pmu_info_proc_show(struct seq_file *m, void *v); static int pmu_irqstats_proc_show(struct seq_file *m, void *v); static int pmu_battery_proc_show(struct seq_file *m, void *v); +#endif static void pmu_pass_intr(unsigned char *data, int len); static const struct proc_ops pmu_options_proc_ops; @@ -852,6 +854,7 @@ query_battery_state(void) 2, PMU_SMART_BATTERY_STATE, pmu_cur_battery+1); } +#ifdef CONFIG_PROC_FS static int pmu_info_proc_show(struct seq_file *m, void *v) { seq_printf(m, "PMU driver version : %d\n", PMU_DRIVER_VERSION); @@ -972,6 +975,7 @@ static const struct proc_ops pmu_options_proc_ops = { .proc_release = single_release, .proc_write = pmu_options_proc_write, }; +#endif #ifdef CONFIG_ADB /* Send an ADB command */ From a0542d2c45a64162e63ad2d80684e57de0566271 Mon Sep 17 00:00:00 2001 From: Stephen Kitt <steve@sk2.org> Date: Thu, 16 Jun 2022 19:04:24 +0200 Subject: [PATCH 037/181] macintosh/via-pmu-backlight: Use backlight helper backlight_properties.fb_blank is deprecated. The states it represents are handled by other properties; but instead of accessing those properties directly, drivers should use the helpers provided by backlight.h. Instead of retrieving the backlight brightness in struct backlight_properties manually, and then checking whether the backlight should be on at all, use backlight_get_brightness() which does all this and insulates this from future changes. Signed-off-by: Stephen Kitt <steve@sk2.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220616170425.1346081-1-steve@sk2.org --- drivers/macintosh/via-pmu-backlight.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/macintosh/via-pmu-backlight.c b/drivers/macintosh/via-pmu-backlight.c index 2194016122d2..c2d87e7fa85b 100644 --- a/drivers/macintosh/via-pmu-backlight.c +++ b/drivers/macintosh/via-pmu-backlight.c @@ -71,12 +71,7 @@ static int pmu_backlight_get_level_brightness(int level) static int __pmu_backlight_update_status(struct backlight_device *bd) { struct adb_request req; - int level = bd->props.brightness; - - - if (bd->props.power != FB_BLANK_UNBLANK || - bd->props.fb_blank != FB_BLANK_UNBLANK) - level = 0; + int level = backlight_get_brightness(bd); if (level > 0) { int pmulevel = pmu_backlight_get_level_brightness(level); From 2dfcace75e1e1dfbd89af63fce1bfe8aebe38427 Mon Sep 17 00:00:00 2001 From: Li zeming <zeming@nfschina.com> Date: Thu, 7 Jul 2022 09:53:52 +0800 Subject: [PATCH 038/181] macintosh/ams/ams: Add header file macro definition Add header file macro definition. Signed-off-by: Li zeming <zeming@nfschina.com> [mpe: Add endif comment] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220707015352.3391-1-zeming@nfschina.com --- drivers/macintosh/ams/ams.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/macintosh/ams/ams.h b/drivers/macintosh/ams/ams.h index 935bdd9cd9a6..2c159c8844c1 100644 --- a/drivers/macintosh/ams/ams.h +++ b/drivers/macintosh/ams/ams.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _AMS_H +#define _AMS_H + #include <linux/i2c.h> #include <linux/input.h> #include <linux/kthread.h> @@ -69,3 +72,5 @@ extern int ams_i2c_init(struct device_node *np); extern int ams_input_init(void); extern void ams_input_exit(void); + +#endif /* _AMS_H */ From e3e528d29d13c01289f382a0d3ddb5312ac3dae3 Mon Sep 17 00:00:00 2001 From: Li zeming <zeming@nfschina.com> Date: Thu, 7 Jul 2022 09:59:49 +0800 Subject: [PATCH 039/181] macintosh/windfarm_pid: Add header file macro definition I think the header file could avoid redefinition errors at compile time by adding macro definitions. Signed-off-by: Li zeming <zeming@nfschina.com> [mpe: Add endif comment] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220707015949.3733-1-zeming@nfschina.com --- drivers/macintosh/windfarm_pid.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/macintosh/windfarm_pid.h b/drivers/macintosh/windfarm_pid.h index 83f747dbeafc..335613a200fb 100644 --- a/drivers/macintosh/windfarm_pid.h +++ b/drivers/macintosh/windfarm_pid.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _WINDFARM_PID_H +#define _WINDFARM_PID_H + /* * Windfarm PowerMac thermal control. Generic PID helpers * @@ -82,3 +85,5 @@ struct wf_cpu_pid_state { extern void wf_cpu_pid_init(struct wf_cpu_pid_state *st, struct wf_cpu_pid_param *param); extern s32 wf_cpu_pid_run(struct wf_cpu_pid_state *st, s32 power, s32 temp); + +#endif /* _WINDFARM_PID_H */ From 3aa16303dc98b7b8baa2adbc3210fd513ec0e810 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 16 Sep 2022 22:16:38 +0800 Subject: [PATCH 040/181] macintosh: Switch to use for_each_child_of_node() macro Use for_each_child_of_node() macro instead of open coding it. No functional change. Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220916141638.685575-1-yangyingliang@huawei.com --- drivers/macintosh/windfarm_smu_controls.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index e9957ad49a2a..bdd92b27da2a 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -266,12 +266,11 @@ static int __init smu_controls_init(void) return -ENODEV; /* Look for RPM fans */ - for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) + for_each_child_of_node(smu, fans) if (of_node_name_eq(fans, "rpm-fans") || of_device_is_compatible(fans, "smu-rpm-fans")) break; - for (fan = NULL; - fans && (fan = of_get_next_child(fans, fan)) != NULL;) { + for_each_child_of_node(fans, fan) { struct smu_fan_control *fct; fct = smu_fan_create(fan, 0); @@ -286,11 +285,10 @@ static int __init smu_controls_init(void) /* Look for PWM fans */ - for (fans = NULL; (fans = of_get_next_child(smu, fans)) != NULL;) + for_each_child_of_node(smu, fans) if (of_node_name_eq(fans, "pwm-fans")) break; - for (fan = NULL; - fans && (fan = of_get_next_child(fans, fan)) != NULL;) { + for_each_child_of_node(fans, fan) { struct smu_fan_control *fct; fct = smu_fan_create(fan, 1); From 5ca86eae55a2f006e6c1edd2029b2cacb6979515 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 4 Nov 2022 11:25:51 +0800 Subject: [PATCH 041/181] macintosh: fix possible memory leak in macio_add_one_device() Afer commit 1fa5ae857bb1 ("driver core: get rid of struct device's bus_id string array"), the name of device is allocated dynamically. It needs to be freed when of_device_register() fails. Call put_device() to give up the reference that's taken in device_initialize(), so that it can be freed in kobject_cleanup() when the refcount hits 0. macio device is freed in macio_release_dev(), so the kfree() can be removed. Fixes: 1fa5ae857bb1 ("driver core: get rid of struct device's bus_id string array") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221104032551.1075335-1-yangyingliang@huawei.com --- drivers/macintosh/macio_asic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 1ec1e5984563..3bc1f374e657 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -424,7 +424,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, if (of_device_register(&dev->ofdev) != 0) { printk(KERN_DEBUG"macio: device registration error for %s!\n", dev_name(&dev->ofdev.dev)); - kfree(dev); + put_device(&dev->ofdev.dev); return NULL; } From dbaa3105736d4d73063ea0a3b01cd7fafce924e6 Mon Sep 17 00:00:00 2001 From: Xie Shaowen <studentxswpy@163.com> Date: Tue, 2 Aug 2022 15:41:48 +0800 Subject: [PATCH 042/181] macintosh/macio-adb: check the return value of ioremap() The function ioremap() in macio_init() can fail, so its return value should be checked. Fixes: 36874579dbf4c ("[PATCH] powerpc: macio-adb build fix") Reported-by: Hacash Robot <hacashRobot@santino.com> Signed-off-by: Xie Shaowen <studentxswpy@163.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220802074148.3213659-1-studentxswpy@163.com --- drivers/macintosh/macio-adb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 3721402582b4..55a9f8c3a150 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -108,6 +108,10 @@ int macio_init(void) return -ENXIO; } adb = ioremap(r.start, sizeof(struct adb_regs)); + if (!adb) { + of_node_put(adbs); + return -ENOMEM; + } out_8(&adb->ctrl.r, 0); out_8(&adb->intr.r, 0); From 5836947613ef33d311b4eff6a32d019580a214f5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Sat, 29 Jan 2022 08:16:04 +0100 Subject: [PATCH 043/181] powerpc/52xx: Fix a resource leak in an error handling path The error handling path of mpc52xx_lpbfifo_probe() has a request_irq() that is not balanced by a corresponding free_irq(). Add the missing call, as already done in the remove function. Fixes: 3c9059d79f5e ("powerpc/5200: add LocalPlus bus FIFO device driver") Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/dec1496d46ccd5311d0f6e9f9ca4238be11bf6a6.1643440531.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c index 48038aaedbd3..2875c206ac0f 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c @@ -531,6 +531,7 @@ static int mpc52xx_lpbfifo_probe(struct platform_device *op) err_bcom_rx_irq: bcom_gen_bd_rx_release(lpbfifo.bcom_rx_task); err_bcom_rx: + free_irq(lpbfifo.irq, &lpbfifo); err_irq: iounmap(lpbfifo.regs); lpbfifo.regs = NULL; From e75d07bd8303588c33e6f1f180a9081fb58c872e Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Wed, 9 Mar 2022 10:29:50 +0100 Subject: [PATCH 044/181] powerpc: Remove find_current_mm_pte() Last usage of find_current_mm_pte() was removed by commit 15759cb054ef ("powerpc/perf/callchain: Use __get_user_pages_fast in read_user_stack_slow") Remove it. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/ec79f462a3bfa8365b7df505e574d5d85246bc68.1646818177.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pte-walk.h | 25 ------------------------- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/pte-walk.h b/arch/powerpc/include/asm/pte-walk.h index 714a35f0d425..73c22c579a79 100644 --- a/arch/powerpc/include/asm/pte-walk.h +++ b/arch/powerpc/include/asm/pte-walk.h @@ -60,29 +60,4 @@ static inline phys_addr_t ppc_find_vmap_phys(unsigned long addr) return pa; } -/* - * This is what we should always use. Any other lockless page table lookup needs - * careful audit against THP split. - */ -static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea, - bool *is_thp, unsigned *hshift) -{ - pte_t *pte; - - VM_WARN(!arch_irqs_disabled(), "%s called with irq enabled\n", __func__); - VM_WARN(pgdir != current->mm->pgd, - "%s lock less page table lookup called on wrong mm\n", __func__); - pte = __find_linux_pte(pgdir, ea, is_thp, hshift); - -#if defined(CONFIG_DEBUG_VM) && \ - !(defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE)) - /* - * We should not find huge page if these configs are not enabled. - */ - if (hshift) - WARN_ON(*hshift); -#endif - return pte; -} - #endif /* _ASM_POWERPC_PTE_WALK_H */ diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index f6151a589298..85c84e89e3ea 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -100,14 +100,14 @@ static void do_serialize(void *arg) } /* - * Serialize against find_current_mm_pte which does lock-less + * Serialize against __find_linux_pte() which does lock-less * lookup in page tables with local interrupts disabled. For huge pages * it casts pmd_t to pte_t. Since format of pte_t is different from * pmd_t we want to prevent transit from pmd pointing to page table * to pmd pointing to huge page (and back) while interrupts are disabled. * We clear pmd to possibly replace it with page table pointer in * different code paths. So make sure we wait for the parallel - * find_current_mm_pte to finish. + * __find_linux_pte() to finish. */ void serialize_against_pte_lookup(struct mm_struct *mm) { From 4562bffb83b88e61ea9c9912e50efbd5a941f0b3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Wed, 23 Nov 2022 22:19:18 -0800 Subject: [PATCH 045/181] powerpc/mpc52xx_lpbfifo: fix all kernel-doc warnings Fix multiple kernel-doc warnings in mpc52xx_lpbfifo.c: arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c:377: warning: expecting prototype for mpc52xx_lpbfifo_bcom_poll(). Prototype was for mpc52xx_lpbfifo_poll() instead mpc52xx_lpbfifo.c:221: warning: No description found for return value of 'mpc52xx_lpbfifo_irq' mpc52xx_lpbfifo.c:327: warning: No description found for return value of 'mpc52xx_lpbfifo_bcom_irq' mpc52xx_lpbfifo.c:398: warning: No description found for return value of 'mpc52xx_lpbfifo_submit' mpc52xx_lpbfifo.c:64: warning: Function parameter or member 'req' not described in 'mpc52xx_lpbfifo_kick' mpc52xx_lpbfifo.c:220: warning: contents before sections mpc52xx_lpbfifo.c:223: warning: Function parameter or member 'irq' not described in 'mpc52xx_lpbfifo_irq' mpc52xx_lpbfifo.c:223: warning: Function parameter or member 'dev_id' not described in 'mpc52xx_lpbfifo_irq' mpc52xx_lpbfifo.c:328: warning: contents before sections mpc52xx_lpbfifo.c:331: warning: Function parameter or member 'irq' not described in 'mpc52xx_lpbfifo_bcom_irq' mpc52xx_lpbfifo.c:331: warning: Function parameter or member 'dev_id' not described in 'mpc52xx_lpbfifo_bcom_irq' Reported-by: kernel test robot <lkp@intel.com> Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221124061918.1967-1-rdunlap@infradead.org --- arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c index 2875c206ac0f..6d1dd6e87478 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_lpbfifo.c @@ -59,6 +59,8 @@ static struct mpc52xx_lpbfifo lpbfifo; /** * mpc52xx_lpbfifo_kick - Trigger the next block of data to be transferred + * + * @req: Pointer to request structure */ static void mpc52xx_lpbfifo_kick(struct mpc52xx_lpbfifo_request *req) { @@ -178,6 +180,8 @@ static void mpc52xx_lpbfifo_kick(struct mpc52xx_lpbfifo_request *req) /** * mpc52xx_lpbfifo_irq - IRQ handler for LPB FIFO + * @irq: IRQ number to be handled + * @dev_id: device ID cookie * * On transmit, the dma completion irq triggers before the fifo completion * triggers. Handle the dma completion here instead of the LPB FIFO Bestcomm @@ -216,6 +220,8 @@ static void mpc52xx_lpbfifo_kick(struct mpc52xx_lpbfifo_request *req) * or nested spinlock condition. The out path is non-trivial, so * extra fiddling is done to make sure all paths lead to the same * outbound code. + * + * Return: irqreturn code (%IRQ_HANDLED) */ static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id) { @@ -320,8 +326,12 @@ static irqreturn_t mpc52xx_lpbfifo_irq(int irq, void *dev_id) /** * mpc52xx_lpbfifo_bcom_irq - IRQ handler for LPB FIFO Bestcomm task + * @irq: IRQ number to be handled + * @dev_id: device ID cookie * * Only used when receiving data. + * + * Return: irqreturn code (%IRQ_HANDLED) */ static irqreturn_t mpc52xx_lpbfifo_bcom_irq(int irq, void *dev_id) { @@ -372,7 +382,7 @@ static irqreturn_t mpc52xx_lpbfifo_bcom_irq(int irq, void *dev_id) } /** - * mpc52xx_lpbfifo_bcom_poll - Poll for DMA completion + * mpc52xx_lpbfifo_poll - Poll for DMA completion */ void mpc52xx_lpbfifo_poll(void) { @@ -393,6 +403,8 @@ EXPORT_SYMBOL(mpc52xx_lpbfifo_poll); /** * mpc52xx_lpbfifo_submit - Submit an LPB FIFO transfer request. * @req: Pointer to request structure + * + * Return: %0 on success, -errno code on error */ int mpc52xx_lpbfifo_submit(struct mpc52xx_lpbfifo_request *req) { From 932c6dea4f32f7d71488137c475b60a77e56bb2a Mon Sep 17 00:00:00 2001 From: Deming Wang <wangdeming@inspur.com> Date: Wed, 13 Apr 2022 06:55:07 -0400 Subject: [PATCH 046/181] powerpc/xive: remove unused parameter The parameter xc to xive_cleanup_single_escalation() is unused, so we can remove it. Signed-off-by: Deming Wang <wangdeming@inspur.com> [mpe: Reword change log, unwrap lines < 90 columns] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220413105507.1729-1-wangdeming@inspur.com --- arch/powerpc/kvm/book3s_xive.c | 6 ++---- arch/powerpc/kvm/book3s_xive.h | 3 +-- arch/powerpc/kvm/book3s_xive_native.c | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 4ca23644f752..d64b2dcc0e7f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1785,8 +1785,7 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) * stale_p (because it has no easy way to address it). Hence we have * to adjust stale_p before shutting down the interrupt. */ -void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, - struct kvmppc_xive_vcpu *xc, int irq) +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int irq) { struct irq_data *d = irq_get_irq_data(irq); struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -1827,8 +1826,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { if (xc->esc_virq[i]) { if (kvmppc_xive_has_single_escalation(xc->xive)) - xive_cleanup_single_escalation(vcpu, xc, - xc->esc_virq[i]); + xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 1e48f72e8aa5..62bf39f53783 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -299,8 +299,7 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); -void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, - struct kvmppc_xive_vcpu *xc, int irq); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int irq); int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 5271c33fe79e..4f566bea5e10 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -93,8 +93,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Free the escalation irq */ if (xc->esc_virq[i]) { if (kvmppc_xive_has_single_escalation(xc->xive)) - xive_cleanup_single_escalation(vcpu, xc, - xc->esc_virq[i]); + xive_cleanup_single_escalation(vcpu, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); From 37195edebf479b94f1e20c2a83a29e4beebe7ff5 Mon Sep 17 00:00:00 2001 From: Julia Lawall <Julia.Lawall@inria.fr> Date: Sat, 21 May 2022 13:11:32 +0200 Subject: [PATCH 047/181] cxl: fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220521111145.81697-82-Julia.Lawall@inria.fr --- include/misc/cxl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/misc/cxl.h b/include/misc/cxl.h index 0410412de16b..d8044299d654 100644 --- a/include/misc/cxl.h +++ b/include/misc/cxl.h @@ -30,7 +30,7 @@ unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev); /* * Context lifetime overview: * - * An AFU context may be inited and then started and stoppped multiple times + * An AFU context may be inited and then started and stopped multiple times * before it's released. ie. * - cxl_dev_context_init() * - cxl_start_context() From 1d09697ff22908ae487fc8c4fbde1811732be523 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin <linmq006@gmail.com> Date: Sun, 5 Jun 2022 10:00:38 +0400 Subject: [PATCH 048/181] cxl: Fix refcount leak in cxl_calc_capp_routing of_get_next_parent() returns a node pointer with refcount incremented, we should use of_node_put() on it when not need anymore. This function only calls of_node_put() in normal path, missing it in the error path. Add missing of_node_put() to avoid refcount leak. Fixes: f24be42aab37 ("cxl: Add psl9 specific code") Signed-off-by: Miaoqian Lin <linmq006@gmail.com> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220605060038.62217-1-linmq006@gmail.com --- drivers/misc/cxl/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 3de0aea62ade..62385a529d86 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -387,6 +387,7 @@ int cxl_calc_capp_routing(struct pci_dev *dev, u64 *chipid, rc = get_phb_index(np, phb_index); if (rc) { pr_err("cxl: invalid phb index\n"); + of_node_put(np); return rc; } From f949ccee1dde970bc77dc871b4f0b5e651577344 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 11 Nov 2022 22:54:39 +0800 Subject: [PATCH 049/181] cxl: fix possible null-ptr-deref in cxl_guest_init_afu|adapter() If device_register() fails in cxl_register_afu|adapter(), the device is not added, device_unregister() can not be called in the error path, otherwise it will cause a null-ptr-deref because of removing not added device. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So split device_unregister() into device_del() and put_device(), then goes to put dev when register fails. Fixes: 14baf4d9c739 ("cxl: Add guest-specific code") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221111145440.2426970-1-yangyingliang@huawei.com --- drivers/misc/cxl/guest.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 375f692ae9d6..fb95a2d5cef4 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -965,10 +965,10 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; /* * pHyp doesn't expose the programming models supported by the @@ -984,7 +984,7 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n afu->modes_supported = CXL_MODE_DIRECTED; if ((rc = cxl_afu_select_best_mode(afu))) - goto err_put2; + goto err_remove_sysfs; adapter->afu[afu->slice] = afu; @@ -1004,10 +1004,12 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return 0; -err_put2: +err_remove_sysfs: cxl_sysfs_afu_remove(afu); -err_put1: - device_unregister(&afu->dev); +err_del_dev: + device_del(&afu->dev); +err_put_dev: + put_device(&afu->dev); free = false; guest_release_serr_irq(afu); err2: @@ -1141,18 +1143,20 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* release the context lock as the adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: - device_unregister(&adapter->dev); +err_del_dev: + device_del(&adapter->dev); +err_put_dev: + put_device(&adapter->dev); free = false; cxl_guest_remove_chardev(adapter); err1: From 8bf03f557d6c6e108cf47bea32f4a68e276e1157 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 11 Nov 2022 22:54:40 +0800 Subject: [PATCH 050/181] cxl: fix possible null-ptr-deref in cxl_pci_init_afu|adapter() If device_register() fails in cxl_pci_afu|adapter(), the device is not added, device_unregister() can not be called in the error path, otherwise it will cause a null-ptr-deref because of removing not added device. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So split device_unregister() into device_del() and put_device(), then goes to put dev when register fails. Fixes: f204e0b8cedd ("cxl: Driver code for powernv PCIe based cards for userspace access") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221111145440.2426970-2-yangyingliang@huawei.com --- drivers/misc/cxl/pci.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 62385a529d86..0ff944860dda 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1165,10 +1165,10 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; adapter->afu[afu->slice] = afu; @@ -1177,10 +1177,12 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) return 0; -err_put1: +err_del_dev: + device_del(&afu->dev); +err_put_dev: pci_deconfigure_afu(afu); cxl_debugfs_afu_remove(afu); - device_unregister(&afu->dev); + put_device(&afu->dev); return rc; err_free_native: @@ -1668,23 +1670,25 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev) * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* Release the context lock as adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: +err_del_dev: + device_del(&adapter->dev); +err_put_dev: /* This should mirror cxl_remove_adapter, except without the * sysfs parts */ cxl_debugfs_adapter_remove(adapter); cxl_deconfigure_adapter(adapter); - device_unregister(&adapter->dev); + put_device(&adapter->dev); return ERR_PTR(rc); err_release: From 295faa17722a11cac8dbf51e4c9f9405a5e07ef1 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Fri, 11 Nov 2022 22:59:29 +0800 Subject: [PATCH 051/181] ocxl: fix possible name leak in ocxl_file_register_afu() If device_register() returns error in ocxl_file_register_afu(), the name allocated by dev_set_name() need be freed. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So fix this by calling put_device(), then the name can be freed in kobject_cleanup(), and info is freed in info_release(). Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221111145929.2429271-1-yangyingliang@huawei.com --- drivers/misc/ocxl/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index d46dba2df5a1..452d5777a0e4 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -541,8 +541,11 @@ int ocxl_file_register_afu(struct ocxl_afu *afu) goto err_put; rc = device_register(&info->dev); - if (rc) - goto err_put; + if (rc) { + free_minor(info); + put_device(&info->dev); + return rc; + } rc = ocxl_sysfs_register_afu(info); if (rc) From 5f58cad1e4c65bebee34292696c6d2105eeb2027 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 21 Nov 2022 23:43:39 +0800 Subject: [PATCH 052/181] ocxl: fix pci device refcount leak when calling get_function_0() get_function_0() calls pci_get_domain_bus_and_slot(), as comment says, it returns a pci device with refcount increment, so after using it, pci_dev_put() needs be called. Get the device reference when get_function_0() is not called, so pci_dev_put() can be called in the error path and callers unconditionally. And add comment above get_dvsec_vendor0() to tell callers to call pci_dev_put(). Fixes: 87db7579ebd5 ("ocxl: control via sysfs whether the FPGA is reloaded on a link reset") Suggested-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Acked-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221121154339.4088935-1-yangyingliang@huawei.com --- drivers/misc/ocxl/config.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index e401a51596b9..92ab49705f64 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -193,6 +193,18 @@ static int read_dvsec_vendor(struct pci_dev *dev) return 0; } +/** + * get_dvsec_vendor0() - Find a related PCI device (function 0) + * @dev: PCI device to match + * @dev0: The PCI device (function 0) found + * @out_pos: The position of PCI device (function 0) + * + * Returns 0 on success, negative on failure. + * + * NOTE: If it's successful, the reference of dev0 is increased, + * so after using it, the callers must call pci_dev_put() to give + * up the reference. + */ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, int *out_pos) { @@ -202,10 +214,14 @@ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, dev = get_function_0(dev); if (!dev) return -1; + } else { + dev = pci_dev_get(dev); } pos = find_dvsec(dev, OCXL_DVSEC_VENDOR_ID); - if (!pos) + if (!pos) { + pci_dev_put(dev); return -1; + } *dev0 = dev; *out_pos = pos; return 0; @@ -222,6 +238,7 @@ int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val) pci_read_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, &reset_reload); + pci_dev_put(dev0); *val = !!(reset_reload & BIT(0)); return 0; } @@ -243,6 +260,7 @@ int ocxl_config_set_reset_reload(struct pci_dev *dev, int val) reset_reload &= ~BIT(0); pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, reset_reload); + pci_dev_put(dev0); return 0; } From 14b5d59a261b1947db287b3b52f4bb1dc496dede Mon Sep 17 00:00:00 2001 From: Deming Wang <wangdeming@inspur.com> Date: Fri, 1 Jul 2022 05:45:53 -0400 Subject: [PATCH 053/181] powerpc/pseries: Fix formatting to make code look more beautiful Operators should be separated by spaces in tce_buildmulti_pSeriesLP() Signed-off-by: Deming Wang <wangdeming@inspur.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220701094553.1722-1-wangdeming@inspur.com --- arch/powerpc/platforms/pseries/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 561adac69022..c74b71d4733d 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -248,7 +248,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, * Set up the page with TCE data, looping through and setting * the values. */ - limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); + limit = min_t(long, npages, 4096 / TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); From 7af82ff90a2b0690c2c45818fcce4c4ac3b187f3 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Tue, 9 Aug 2022 16:24:25 +0530 Subject: [PATCH 054/181] powerpc/ftrace: Ignore weak functions Extend commit b39181f7c6907d ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function") to ppc32 and ppc64 -mprofile-kernel by defining FTRACE_MCOUNT_MAX_OFFSET. For ppc64 -mprofile-kernel ABI, we can have two instructions at function entry for TOC setup followed by 'mflr r0' and 'bl _mcount'. So, the mcount location is at most the 4th instruction in a function. For ppc32, mcount location is always the 3rd instruction in a function, preceded by 'mflr r0' and 'stw r0,4(r1)'. With this patch, and with ppc64le_guest_defconfig and some ftrace/bpf config items enabled: # grep __ftrace_invalid_address available_filter_functions | wc -l 79 Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220809105425.424045-1-naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ftrace.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index 3cee7115441b..ade406dc6504 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -10,6 +10,13 @@ #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +/* Ignore unused weak functions which will have larger offsets */ +#ifdef CONFIG_MPROFILE_KERNEL +#define FTRACE_MCOUNT_MAX_OFFSET 12 +#elif defined(CONFIG_PPC32) +#define FTRACE_MCOUNT_MAX_OFFSET 8 +#endif + #ifndef __ASSEMBLY__ extern void _mcount(void); From addebe1cfa71eb29caa9d5c6bc719171e4e76414 Mon Sep 17 00:00:00 2001 From: Nicholas Miehlbradt <nicholas@linux.ibm.com> Date: Wed, 10 Aug 2022 04:03:21 +0000 Subject: [PATCH 055/181] docs: powerpc: add POWER9 and POWER10 to CPU families Add POWER9 and POWER10 to CPU families and list Radix MMU. Signed-off-by: Nicholas Miehlbradt <nicholas@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220810040321.375396-1-nicholas@linux.ibm.com --- Documentation/powerpc/cpu_families.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Documentation/powerpc/cpu_families.rst b/Documentation/powerpc/cpu_families.rst index 9b84e045e713..eb7e60649b43 100644 --- a/Documentation/powerpc/cpu_families.rst +++ b/Documentation/powerpc/cpu_families.rst @@ -10,6 +10,7 @@ Book3S (aka sPAPR) ------------------ - Hash MMU (except 603 and e300) +- Radix MMU (POWER9 and later) - Software loaded TLB (603 and e300) - Selectable Software loaded TLB in addition to hash MMU (755, 7450, e600) - Mix of 32 & 64 bit:: @@ -100,6 +101,18 @@ Book3S (aka sPAPR) v +--------------+ | POWER8 | + +--------------+ + | + | + v + +--------------+ + | POWER9 | + +--------------+ + | + | + v + +--------------+ + | POWER10 | +--------------+ From ff8fae94e26f5cd2779ceda0ee6d714a10501abd Mon Sep 17 00:00:00 2001 From: Shaomin Deng <dengshaomin@cdjrlc.com> Date: Sun, 4 Sep 2022 11:51:02 -0400 Subject: [PATCH 056/181] drivers/ps3: Fix double word in comments Drop the repeated word "when" in comments. Signed-off-by: Shaomin Deng <dengshaomin@cdjrlc.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220904155102.26957-1-dengshaomin@cdjrlc.com --- drivers/ps3/ps3-lpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ps3/ps3-lpm.c b/drivers/ps3/ps3-lpm.c index 65512b6cc6fd..200ad8751860 100644 --- a/drivers/ps3/ps3-lpm.c +++ b/drivers/ps3/ps3-lpm.c @@ -1066,7 +1066,7 @@ EXPORT_SYMBOL_GPL(ps3_disable_pm_interrupts); * instance, specified by one of enum ps3_lpm_tb_type. * @tb_cache: Optional user supplied buffer to use as the trace buffer cache. * If NULL, the driver will allocate and manage an internal buffer. - * Unused when when @tb_type is PS3_LPM_TB_TYPE_NONE. + * Unused when @tb_type is PS3_LPM_TB_TYPE_NONE. * @tb_cache_size: The size in bytes of the user supplied @tb_cache buffer. * Unused when @tb_cache is NULL or @tb_type is PS3_LPM_TB_TYPE_NONE. */ From b86cf14f240e002e001fd4f2bf49114c7836fd5c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Fri, 9 Sep 2022 15:23:12 +1000 Subject: [PATCH 057/181] powerpc: add compile-time support for lbarx, lharx ISA v2.06 (POWER7 and up) as well as e6500 support lbarx and lharx. Add a compile option that allows code to use it, and add support in cmpxchg and xchg 8 and 16 bit values without shifting and masking. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220909052312.63916-1-npiggin@gmail.com --- arch/powerpc/Kconfig | 3 + arch/powerpc/include/asm/cmpxchg.h | 231 ++++++++++++++++++++++++- arch/powerpc/lib/sstep.c | 21 +-- arch/powerpc/platforms/Kconfig.cputype | 5 + 4 files changed, 249 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 699df27b0e2f..4fd4924f6d50 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -293,6 +293,9 @@ config PPC_BARRIER_NOSPEC default y depends on PPC_BOOK3S_64 || PPC_E500 +config PPC_HAS_LBARX_LHARX + bool + config EARLY_PRINTK bool default y diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index 05f246c0e36e..d0ea0571e79a 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -77,10 +77,76 @@ u32 __cmpxchg_##type##sfx(volatile void *p, u32 old, u32 new) \ * the previous value stored there. */ +#ifndef CONFIG_PPC_HAS_LBARX_LHARX XCHG_GEN(u8, _local, "memory"); XCHG_GEN(u8, _relaxed, "cc"); XCHG_GEN(u16, _local, "memory"); XCHG_GEN(u16, _relaxed, "cc"); +#else +static __always_inline unsigned long +__xchg_u8_local(volatile void *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lbarx %0,0,%2 # __xchg_u8_local\n" +" stbcx. %3,0,%2 \n" +" bne- 1b" + : "=&r" (prev), "+m" (*(volatile unsigned char *)p) + : "r" (p), "r" (val) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u8_relaxed(u8 *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lbarx %0,0,%2 # __xchg_u8_relaxed\n" +" stbcx. %3,0,%2\n" +" bne- 1b" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (val) + : "cc"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u16_local(volatile void *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lharx %0,0,%2 # __xchg_u16_local\n" +" sthcx. %3,0,%2\n" +" bne- 1b" + : "=&r" (prev), "+m" (*(volatile unsigned short *)p) + : "r" (p), "r" (val) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__xchg_u16_relaxed(u16 *p, unsigned long val) +{ + unsigned long prev; + + __asm__ __volatile__( +"1: lharx %0,0,%2 # __xchg_u16_relaxed\n" +" sthcx. %3,0,%2\n" +" bne- 1b" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (val) + : "cc"); + + return prev; +} +#endif static __always_inline unsigned long __xchg_u32_local(volatile void *p, unsigned long val) @@ -198,11 +264,12 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size) (__typeof__(*(ptr))) __xchg_relaxed((ptr), \ (unsigned long)_x_, sizeof(*(ptr))); \ }) + /* * Compare and exchange - if *p == old, set it to new, * and return the old value of *p. */ - +#ifndef CONFIG_PPC_HAS_LBARX_LHARX CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); CMPXCHG_GEN(u8, _local, , , "memory"); CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); @@ -211,6 +278,168 @@ CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory"); CMPXCHG_GEN(u16, _local, , , "memory"); CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory"); CMPXCHG_GEN(u16, _relaxed, , , "cc"); +#else +static __always_inline unsigned long +__cmpxchg_u8(volatile unsigned char *p, unsigned long old, unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( + PPC_ATOMIC_ENTRY_BARRIER +"1: lbarx %0,0,%2 # __cmpxchg_u8\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" stbcx. %4,0,%2\n" +" bne- 1b" + PPC_ATOMIC_EXIT_BARRIER + "\n\ +2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u8_local(volatile unsigned char *p, unsigned long old, + unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( +"1: lbarx %0,0,%2 # __cmpxchg_u8_local\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" stbcx. %4,0,%2\n" +" bne- 1b\n" +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u8_relaxed(u8 *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( +"1: lbarx %0,0,%2 # __cmpxchg_u8_relaxed\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" stbcx. %4,0,%2\n" +" bne- 1b\n" +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u8_acquire(u8 *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( +"1: lbarx %0,0,%2 # __cmpxchg_u8_acquire\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" stbcx. %4,0,%2\n" +" bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u16(volatile unsigned short *p, unsigned long old, unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( + PPC_ATOMIC_ENTRY_BARRIER +"1: lharx %0,0,%2 # __cmpxchg_u16\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" sthcx. %4,0,%2\n" +" bne- 1b\n" + PPC_ATOMIC_EXIT_BARRIER +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u16_local(volatile unsigned short *p, unsigned long old, + unsigned long new) +{ + unsigned int prev; + + __asm__ __volatile__ ( +"1: lharx %0,0,%2 # __cmpxchg_u16_local\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" sthcx. %4,0,%2\n" +" bne- 1b" +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u16_relaxed(u16 *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( +"1: lharx %0,0,%2 # __cmpxchg_u16_relaxed\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" sthcx. %4,0,%2\n" +" bne- 1b\n" +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc"); + + return prev; +} + +static __always_inline unsigned long +__cmpxchg_u16_acquire(u16 *p, unsigned long old, unsigned long new) +{ + unsigned long prev; + + __asm__ __volatile__ ( +"1: lharx %0,0,%2 # __cmpxchg_u16_acquire\n" +" cmpw 0,%0,%3\n" +" bne- 2f\n" +" sthcx. %4,0,%2\n" +" bne- 1b\n" + PPC_ACQUIRE_BARRIER +"2:" + : "=&r" (prev), "+m" (*p) + : "r" (p), "r" (old), "r" (new) + : "cc", "memory"); + + return prev; +} +#endif static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 398b5694aeb7..38158b77a801 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -2284,15 +2284,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->type = MKOP(STCX, 0, 4); break; -#ifdef __powerpc64__ - case 84: /* ldarx */ - op->type = MKOP(LARX, 0, 8); - break; - - case 214: /* stdcx. */ - op->type = MKOP(STCX, 0, 8); - break; - +#ifdef CONFIG_PPC_HAS_LBARX_LHARX case 52: /* lbarx */ op->type = MKOP(LARX, 0, 1); break; @@ -2308,6 +2300,15 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, case 726: /* sthcx. */ op->type = MKOP(STCX, 0, 2); break; +#endif +#ifdef __powerpc64__ + case 84: /* ldarx */ + op->type = MKOP(LARX, 0, 8); + break; + + case 214: /* stdcx. */ + op->type = MKOP(STCX, 0, 8); + break; case 276: /* lqarx */ if (!((rd & 1) || rd == ra || rd == rb)) @@ -3334,7 +3335,7 @@ int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op) err = 0; val = 0; switch (size) { -#ifdef __powerpc64__ +#ifdef CONFIG_PPC_HAS_LBARX_LHARX case 1: __get_user_asmx(val, ea, err, "lbarx"); break; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 0c4eed9aea80..7bac213b4125 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -135,6 +135,7 @@ config GENERIC_CPU depends on PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select ARCH_HAS_FAST_MULTIPLIER select PPC_64S_HASH_MMU + select PPC_HAS_LBARX_LHARX config POWERPC_CPU bool "Generic 32 bits powerpc" @@ -160,17 +161,20 @@ config POWER7_CPU depends on PPC_BOOK3S_64 select ARCH_HAS_FAST_MULTIPLIER select PPC_64S_HASH_MMU + select PPC_HAS_LBARX_LHARX config POWER8_CPU bool "POWER8" depends on PPC_BOOK3S_64 select ARCH_HAS_FAST_MULTIPLIER select PPC_64S_HASH_MMU + select PPC_HAS_LBARX_LHARX config POWER9_CPU bool "POWER9" depends on PPC_BOOK3S_64 select ARCH_HAS_FAST_MULTIPLIER + select PPC_HAS_LBARX_LHARX config POWER10_CPU bool "POWER10" @@ -184,6 +188,7 @@ config E5500_CPU config E6500_CPU bool "Freescale e6500" depends on PPC64 && PPC_E500 + select PPC_HAS_LBARX_LHARX config 405_CPU bool "40x family" From d87a233717da400792fa601b29fa74a7d28e03c2 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng <xiujianfeng@huawei.com> Date: Sun, 11 Sep 2022 16:43:44 +0800 Subject: [PATCH 058/181] powerpc/pasemi: Add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220911084344.196353-1-xiujianfeng@huawei.com --- arch/powerpc/platforms/pasemi/gpio_mdio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/gpio_mdio.c b/arch/powerpc/platforms/pasemi/gpio_mdio.c index bf300167ad6b..913b77b92cea 100644 --- a/arch/powerpc/platforms/pasemi/gpio_mdio.c +++ b/arch/powerpc/platforms/pasemi/gpio_mdio.c @@ -294,7 +294,7 @@ static struct platform_driver gpio_mdio_driver = }, }; -static int gpio_mdio_init(void) +static int __init gpio_mdio_init(void) { struct device_node *np; @@ -314,7 +314,7 @@ static int gpio_mdio_init(void) } module_init(gpio_mdio_init); -static void gpio_mdio_exit(void) +static void __exit gpio_mdio_exit(void) { platform_driver_unregister(&gpio_mdio_driver); if (gpio_regs) From 2223552256dfc48435e0699dbe1e9b8d2cd56b06 Mon Sep 17 00:00:00 2001 From: Disha Goel <disgoel@linux.vnet.ibm.com> Date: Fri, 16 Sep 2022 16:27:35 +0530 Subject: [PATCH 059/181] powerpc/kvm: Remove unused macros from asm-offset The kvm code was refactored to convert some of kvm assembly routines to C. This includes commits which moved code path for the kvm guest entry/exit for p7/8 from aseembly to C. As part of the code changes, usage of some of the macros were removed. But definitions still exist in the assembly files. Commits are listed below: Commit 2e1ae9cd56f8 ("KVM: PPC: Book3S HV: Implement radix prefetch workaround by disabling MMU") Commit 9769a7fd79b6 ("KVM: PPC: Book3S HV: Remove radix guest support from P7/8 path") Commit fae5c9f3664b ("KVM: PPC: Book3S HV: remove ISA v3.0 and v3.1 support from P7/8 path") Commit 57dc0eed73ca ("KVM: PPC: Book3S HV P9: Implement PMU save/restore in C") Many of the asm-offset macro definitions were missed to remove. Patch fixes by removing the unused macros. Signed-off-by: Disha Goel <disgoel@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220916105736.268153-2-disgoel@linux.vnet.ibm.com --- arch/powerpc/kernel/asm-offsets.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4ce2a4aa3985..b4b661f631f5 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -418,21 +418,18 @@ int main(void) /* book3s */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - OFFSET(KVM_TLB_SETS, kvm, arch.tlb_sets); OFFSET(KVM_SDR1, kvm, arch.sdr1); OFFSET(KVM_HOST_LPID, kvm, arch.host_lpid); OFFSET(KVM_HOST_LPCR, kvm, arch.host_lpcr); OFFSET(KVM_HOST_SDR1, kvm, arch.host_sdr1); OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); - OFFSET(KVM_RADIX, kvm, arch.radix); OFFSET(KVM_SECURE_GUEST, kvm, arch.secure_guest); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); - OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif @@ -449,16 +446,12 @@ int main(void) OFFSET(VCPU_DABRX, kvm_vcpu, arch.dabrx); OFFSET(VCPU_DAWR0, kvm_vcpu, arch.dawr0); OFFSET(VCPU_DAWRX0, kvm_vcpu, arch.dawrx0); - OFFSET(VCPU_DAWR1, kvm_vcpu, arch.dawr1); - OFFSET(VCPU_DAWRX1, kvm_vcpu, arch.dawrx1); OFFSET(VCPU_CIABR, kvm_vcpu, arch.ciabr); OFFSET(VCPU_HFLAGS, kvm_vcpu, arch.hflags); OFFSET(VCPU_DEC_EXPIRES, kvm_vcpu, arch.dec_expires); OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); - OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending); - OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_MMCRA, kvm_vcpu, arch.mmcra); OFFSET(VCPU_MMCRS, kvm_vcpu, arch.mmcrs); @@ -486,8 +479,6 @@ int main(void) OFFSET(VCPU_TCSCR, kvm_vcpu, arch.tcscr); OFFSET(VCPU_ACOP, kvm_vcpu, arch.acop); OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); - OFFSET(VCPU_TID, kvm_vcpu, arch.tid); - OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); @@ -582,8 +573,6 @@ int main(void) HSTATE_FIELD(HSTATE_HWTHREAD_STATE, hwthread_state); HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); - HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys); - HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); HSTATE_FIELD(HSTATE_FAKE_SUSPEND, fake_suspend); @@ -594,9 +583,6 @@ int main(void) HSTATE_FIELD(HSTATE_SDAR, host_mmcr[4]); HSTATE_FIELD(HSTATE_MMCR2, host_mmcr[5]); HSTATE_FIELD(HSTATE_SIER, host_mmcr[6]); - HSTATE_FIELD(HSTATE_MMCR3, host_mmcr[7]); - HSTATE_FIELD(HSTATE_SIER2, host_mmcr[8]); - HSTATE_FIELD(HSTATE_SIER3, host_mmcr[9]); HSTATE_FIELD(HSTATE_PMC1, host_pmc[0]); HSTATE_FIELD(HSTATE_PMC2, host_pmc[1]); HSTATE_FIELD(HSTATE_PMC3, host_pmc[2]); @@ -672,17 +658,6 @@ int main(void) OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6); #endif -#ifdef CONFIG_KVM_XICS - DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu, - arch.xive_saved_state)); - DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, - arch.xive_cam_word)); - DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); - DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on)); - DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr)); - DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr)); -#endif - #ifdef CONFIG_KVM_EXIT_TIMING OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu); OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl); From 4ac9d3187cc7ccba25f76a3faef3e08a366f77b9 Mon Sep 17 00:00:00 2001 From: Kajol Jain <kjain@linux.ibm.com> Date: Fri, 16 Sep 2022 16:27:36 +0530 Subject: [PATCH 060/181] powerpc/kvm: Remove unused references for MMCR3/SIER2/SIER3 registers Commit 57dc0eed73ca ("KVM: PPC: Book3S HV P9: Implement PMU save/restore in C") removed the PMU save/restore functions from assembly code and implemented these functions in C, for power9 and later platforms. After the code refactoring, Performance Monitoring Unit (PMU) registers became part of "p9_host_os_sprs" structure and now this structure is used to save/restore pmu host registers, for power9 and later platfroms. But we still have old unused registers references. Patch removes unused host_mmcr references for Monitor Mode Control Register 3 (MMCR3)/ Sampled Instruction Event Register 2 (SIER2)/ SIER3 registers from "struct kvmppc_host_state". Fixes: 57dc0eed73ca ("KVM: PPC: Book3S HV P9: Implement PMU save/restore in C") Signed-off-by: Kajol Jain <kjain@linux.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220916105736.268153-3-disgoel@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index c8882d9b86c2..a36797938620 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -105,7 +105,7 @@ struct kvmppc_host_state { void __iomem *xive_tima_virt; u32 saved_xirr; u64 dabr; - u64 host_mmcr[10]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER, MMCR3, SIER2/3 */ + u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ u32 host_pmc[8]; u64 host_purr; u64 host_spurr; From 1c4a4a4c8410be4a231a58b23e7a30923ff954ac Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" <gustavoars@kernel.org> Date: Fri, 16 Sep 2022 15:15:04 +0100 Subject: [PATCH 061/181] powerpc/xmon: Fix -Wswitch-unreachable warning in bpt_cmds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with automatic stack variable initialization, GCC 12 complains about variables defined outside of switch case statements. Move the variable into the case that uses it, which silences the warning: arch/powerpc/xmon/xmon.c: In function ‘bpt_cmds’: arch/powerpc/xmon/xmon.c:1529:13: warning: statement will never be executed [-Wswitch-unreachable] 1529 | int mode; | ^~~~ Fixes: 09b6c1129f89 ("powerpc/xmon: Fix compile error with PPC_8xx=y") Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/YySE6FHiOcbWWR+9@work --- arch/powerpc/xmon/xmon.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index f51c882bf902..e34d7809f6c9 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1525,9 +1525,9 @@ bpt_cmds(void) cmd = inchar(); switch (cmd) { - static const char badaddr[] = "Only kernel addresses are permitted for breakpoints\n"; - int mode; - case 'd': /* bd - hardware data breakpoint */ + case 'd': { /* bd - hardware data breakpoint */ + static const char badaddr[] = "Only kernel addresses are permitted for breakpoints\n"; + int mode; if (xmon_is_ro) { printf(xmon_ro_msg); break; @@ -1560,6 +1560,7 @@ bpt_cmds(void) force_enable_xmon(); break; + } case 'i': /* bi - hardware instr breakpoint */ if (xmon_is_ro) { From 1892e87a3e9170146549779622cb844582f1e2bb Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov <dmitry.torokhov@gmail.com> Date: Mon, 26 Sep 2022 23:03:25 -0700 Subject: [PATCH 062/181] powerpc/warp: switch to using gpiod API This switches PIKA Warp away from legacy gpio API and to newer gpiod API, so that we can eventually deprecate the former. Because LEDs are normally driven by leds-gpio driver, but the platform code also wants to access the LEDs during thermal shutdown, and gpiod API does not allow locating GPIO without requesting it, the platform code is now responsible for locating GPIOs through device tree and requesting them. It then constructs platform data for leds-gpio platform device and registers it. This allows platform code to retain access to LED GPIO descriptors and use them when needed. Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> Acked-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/YzKSLcrYmV5kjyeX@google.com --- arch/powerpc/boot/dts/warp.dts | 4 +- arch/powerpc/platforms/44x/warp.c | 105 ++++++++++++++++++++++++++---- 2 files changed, 94 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/boot/dts/warp.dts b/arch/powerpc/boot/dts/warp.dts index b4f32740870e..aa62d08e97c2 100644 --- a/arch/powerpc/boot/dts/warp.dts +++ b/arch/powerpc/boot/dts/warp.dts @@ -258,14 +258,12 @@ }; power-leds { - compatible = "gpio-leds"; + compatible = "warp-power-leds"; green { gpios = <&GPIO1 0 0>; - default-state = "keep"; }; red { gpios = <&GPIO1 1 0>; - default-state = "keep"; }; }; diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index f03432ef010b..cefa313c09f0 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -5,15 +5,17 @@ * Copyright (c) 2008-2009 PIKA Technologies * Sean MacLennan <smaclennan@pikatech.com> */ +#include <linux/err.h> #include <linux/init.h> #include <linux/of_platform.h> #include <linux/kthread.h> +#include <linux/leds.h> #include <linux/i2c.h> #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/of_address.h> #include <linux/of_irq.h> -#include <linux/of_gpio.h> +#include <linux/gpio/consumer.h> #include <linux/slab.h> #include <linux/export.h> @@ -92,8 +94,6 @@ static int __init warp_post_info(void) static LIST_HEAD(dtm_shutdown_list); static void __iomem *dtm_fpga; -static unsigned green_led, red_led; - struct dtm_shutdown { struct list_head list; @@ -101,7 +101,6 @@ struct dtm_shutdown { void *arg; }; - int pika_dtm_register_shutdown(void (*func)(void *arg), void *arg) { struct dtm_shutdown *shutdown; @@ -132,6 +131,35 @@ int pika_dtm_unregister_shutdown(void (*func)(void *arg), void *arg) return -EINVAL; } +#define WARP_GREEN_LED 0 +#define WARP_RED_LED 1 + +static struct gpio_led warp_gpio_led_pins[] = { + [WARP_GREEN_LED] = { + .name = "green", + .default_state = LEDS_DEFSTATE_KEEP, + .gpiod = NULL, /* to be filled by pika_setup_leds() */ + }, + [WARP_RED_LED] = { + .name = "red", + .default_state = LEDS_DEFSTATE_KEEP, + .gpiod = NULL, /* to be filled by pika_setup_leds() */ + }, +}; + +static struct gpio_led_platform_data warp_gpio_led_data = { + .leds = warp_gpio_led_pins, + .num_leds = ARRAY_SIZE(warp_gpio_led_pins), +}; + +static struct platform_device warp_gpio_leds = { + .name = "leds-gpio", + .id = -1, + .dev = { + .platform_data = &warp_gpio_led_data, + }, +}; + static irqreturn_t temp_isr(int irq, void *context) { struct dtm_shutdown *shutdown; @@ -139,7 +167,7 @@ static irqreturn_t temp_isr(int irq, void *context) local_irq_disable(); - gpio_set_value(green_led, 0); + gpiod_set_value(warp_gpio_led_pins[WARP_GREEN_LED].gpiod, 0); /* Run through the shutdown list. */ list_for_each_entry(shutdown, &dtm_shutdown_list, list) @@ -153,7 +181,7 @@ static irqreturn_t temp_isr(int irq, void *context) out_be32(dtm_fpga + 0x14, reset); } - gpio_set_value(red_led, value); + gpiod_set_value(warp_gpio_led_pins[WARP_RED_LED].gpiod, value); value ^= 1; mdelay(500); } @@ -162,25 +190,78 @@ static irqreturn_t temp_isr(int irq, void *context) return IRQ_HANDLED; } +/* + * Because green and red power LEDs are normally driven by leds-gpio driver, + * but in case of critical temperature shutdown we want to drive them + * ourselves, we acquire both and then create leds-gpio platform device + * ourselves, instead of doing it through device tree. This way we can still + * keep access to the gpios and use them when needed. + */ static int pika_setup_leds(void) { struct device_node *np, *child; + struct gpio_desc *gpio; + struct gpio_led *led; + int led_count = 0; + int error; + int i; - np = of_find_compatible_node(NULL, NULL, "gpio-leds"); + np = of_find_compatible_node(NULL, NULL, "warp-power-leds"); if (!np) { printk(KERN_ERR __FILE__ ": Unable to find leds\n"); return -ENOENT; } - for_each_child_of_node(np, child) - if (of_node_name_eq(child, "green")) - green_led = of_get_gpio(child, 0); - else if (of_node_name_eq(child, "red")) - red_led = of_get_gpio(child, 0); + for_each_child_of_node(np, child) { + for (i = 0; i < ARRAY_SIZE(warp_gpio_led_pins); i++) { + led = &warp_gpio_led_pins[i]; + + if (!of_node_name_eq(child, led->name)) + continue; + + if (led->gpiod) { + printk(KERN_ERR __FILE__ ": %s led has already been defined\n", + led->name); + continue; + } + + gpio = fwnode_gpiod_get_index(of_fwnode_handle(child), + NULL, 0, GPIOD_ASIS, + led->name); + error = PTR_ERR_OR_ZERO(gpio); + if (error) { + printk(KERN_ERR __FILE__ ": Failed to get %s led gpio: %d\n", + led->name, error); + of_node_put(child); + goto err_cleanup_pins; + } + + led->gpiod = gpio; + led_count++; + } + } of_node_put(np); + /* Skip device registration if no leds have been defined */ + if (led_count) { + error = platform_device_register(&warp_gpio_leds); + if (error) { + printk(KERN_ERR __FILE__ ": Unable to add leds-gpio: %d\n", + error); + goto err_cleanup_pins; + } + } + return 0; + +err_cleanup_pins: + for (i = 0; i < ARRAY_SIZE(warp_gpio_led_pins); i++) { + led = &warp_gpio_led_pins[i]; + gpiod_put(led->gpiod); + led->gpiod = NULL; + } + return error; } static void pika_setup_critical_temp(struct device_node *np, From 4e87bd14e501030619d1bad29b3ec1f947f84fc4 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov <dmitry.torokhov@gmail.com> Date: Tue, 27 Sep 2022 12:23:58 -0700 Subject: [PATCH 063/181] powerpc/sgy_cts1000: convert to using gpiod API and facelift This patch converts the driver to newer gpiod API, and away from OF-specific legacy gpio API that we want to stop using. While at it, let's address a few more issues: - switch to using dev_info()/pr_info() and friends - cancel work when unbinding the driver Note that the original code handled halt GPIO polarity incorrectly: in halt callback, when line polarity is "low" it would set trigger to "1" and drive halt line high, which is counter to the annotation. gpiod API will drive such line low. However I do not see any DTSes in mainline that have a DT node with "sgy,gpio-halt" compatible. Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/YzNNznewTyCJiGFz@google.com --- arch/powerpc/platforms/85xx/sgy_cts1000.c | 132 +++++++++------------- 1 file changed, 53 insertions(+), 79 deletions(-) diff --git a/arch/powerpc/platforms/85xx/sgy_cts1000.c b/arch/powerpc/platforms/85xx/sgy_cts1000.c index e14d1b74d4e4..751395cbf022 100644 --- a/arch/powerpc/platforms/85xx/sgy_cts1000.c +++ b/arch/powerpc/platforms/85xx/sgy_cts1000.c @@ -7,10 +7,13 @@ * Copyright 2012 by Servergy, Inc. */ +#define pr_fmt(fmt) "gpio-halt: " fmt + +#include <linux/err.h> #include <linux/platform_device.h> #include <linux/device.h> +#include <linux/gpio/consumer.h> #include <linux/module.h> -#include <linux/of_gpio.h> #include <linux/of_irq.h> #include <linux/workqueue.h> #include <linux/reboot.h> @@ -18,7 +21,8 @@ #include <asm/machdep.h> -static struct device_node *halt_node; +static struct gpio_desc *halt_gpio; +static int halt_irq; static const struct of_device_id child_match[] = { { @@ -36,23 +40,10 @@ static DECLARE_WORK(gpio_halt_wq, gpio_halt_wfn); static void __noreturn gpio_halt_cb(void) { - enum of_gpio_flags flags; - int trigger, gpio; - - if (!halt_node) - panic("No reset GPIO information was provided in DT\n"); - - gpio = of_get_gpio_flags(halt_node, 0, &flags); - - if (!gpio_is_valid(gpio)) - panic("Provided GPIO is invalid\n"); - - trigger = (flags == OF_GPIO_ACTIVE_LOW); - - printk(KERN_INFO "gpio-halt: triggering GPIO.\n"); + pr_info("triggering GPIO.\n"); /* Probably wont return */ - gpio_set_value(gpio, trigger); + gpiod_set_value(halt_gpio, 1); panic("Halt failed\n"); } @@ -61,95 +52,78 @@ static void __noreturn gpio_halt_cb(void) * to handle the shutdown/poweroff. */ static irqreturn_t gpio_halt_irq(int irq, void *__data) { - printk(KERN_INFO "gpio-halt: shutdown due to power button IRQ.\n"); + struct platform_device *pdev = __data; + + dev_info(&pdev->dev, "scheduling shutdown due to power button IRQ\n"); schedule_work(&gpio_halt_wq); return IRQ_HANDLED; }; -static int gpio_halt_probe(struct platform_device *pdev) +static int __gpio_halt_probe(struct platform_device *pdev, + struct device_node *halt_node) { - enum of_gpio_flags flags; - struct device_node *node = pdev->dev.of_node; - struct device_node *child_node; - int gpio, err, irq; - int trigger; + int err; - if (!node) - return -ENODEV; - - /* If there's no matching child, this isn't really an error */ - child_node = of_find_matching_node(node, child_match); - if (!child_node) - return 0; - - /* Technically we could just read the first one, but punish - * DT writers for invalid form. */ - if (of_gpio_count(child_node) != 1) { - err = -EINVAL; - goto err_put; - } - - /* Get the gpio number relative to the dynamic base. */ - gpio = of_get_gpio_flags(child_node, 0, &flags); - if (!gpio_is_valid(gpio)) { - err = -EINVAL; - goto err_put; - } - - err = gpio_request(gpio, "gpio-halt"); + halt_gpio = fwnode_gpiod_get_index(of_fwnode_handle(halt_node), + NULL, 0, GPIOD_OUT_LOW, "gpio-halt"); + err = PTR_ERR_OR_ZERO(halt_gpio); if (err) { - printk(KERN_ERR "gpio-halt: error requesting GPIO %d.\n", - gpio); - goto err_put; + dev_err(&pdev->dev, "failed to request halt GPIO: %d\n", err); + return err; } - trigger = (flags == OF_GPIO_ACTIVE_LOW); - - gpio_direction_output(gpio, !trigger); - /* Now get the IRQ which tells us when the power button is hit */ - irq = irq_of_parse_and_map(child_node, 0); - err = request_irq(irq, gpio_halt_irq, IRQF_TRIGGER_RISING | - IRQF_TRIGGER_FALLING, "gpio-halt", child_node); + halt_irq = irq_of_parse_and_map(halt_node, 0); + err = request_irq(halt_irq, gpio_halt_irq, + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + "gpio-halt", pdev); if (err) { - printk(KERN_ERR "gpio-halt: error requesting IRQ %d for " - "GPIO %d.\n", irq, gpio); - gpio_free(gpio); - goto err_put; + dev_err(&pdev->dev, "failed to request IRQ %d: %d\n", + halt_irq, err); + gpiod_put(halt_gpio); + halt_gpio = NULL; + return err; } /* Register our halt function */ ppc_md.halt = gpio_halt_cb; pm_power_off = gpio_halt_cb; - printk(KERN_INFO "gpio-halt: registered GPIO %d (%d trigger, %d" - " irq).\n", gpio, trigger, irq); + dev_info(&pdev->dev, "registered halt GPIO, irq: %d\n", halt_irq); - halt_node = child_node; return 0; +} -err_put: - of_node_put(child_node); - return err; +static int gpio_halt_probe(struct platform_device *pdev) +{ + struct device_node *halt_node; + int ret; + + if (!pdev->dev.of_node) + return -ENODEV; + + /* If there's no matching child, this isn't really an error */ + halt_node = of_find_matching_node(pdev->dev.of_node, child_match); + if (!halt_node) + return -ENODEV; + + ret = __gpio_halt_probe(pdev, halt_node); + of_node_put(halt_node); + + return ret; } static int gpio_halt_remove(struct platform_device *pdev) { - if (halt_node) { - int gpio = of_get_gpio(halt_node, 0); - int irq = irq_of_parse_and_map(halt_node, 0); + free_irq(halt_irq, pdev); + cancel_work_sync(&gpio_halt_wq); - free_irq(irq, halt_node); + ppc_md.halt = NULL; + pm_power_off = NULL; - ppc_md.halt = NULL; - pm_power_off = NULL; - - gpio_free(gpio); - - of_node_put(halt_node); - halt_node = NULL; - } + gpiod_put(halt_gpio); + halt_gpio = NULL; return 0; } From f2c45962cc618c12f69fd46e6ebc20b9cd7f15ac Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Wed, 28 Sep 2022 08:29:00 +0200 Subject: [PATCH 064/181] powerpc/8xx: Simplify pte_update() with 16k pages While looking at code generated for code patching, I saw that pte_clear generated: 2d8: 38 a0 00 00 li r5,0 2dc: 38 e0 10 00 li r7,4096 2e0: 39 00 20 00 li r8,8192 2e4: 39 40 30 00 li r10,12288 2e8: 90 a9 00 00 stw r5,0(r9) 2ec: 90 e9 00 04 stw r7,4(r9) 2f0: 91 09 00 08 stw r8,8(r9) 2f4: 91 49 00 0c stw r10,12(r9) With 16k pages, only the first entry is used by the kernel, so no need to adapt the address of other entries. Only duplicate the first entry for hardware. Now it is: 2cc: 39 40 00 00 li r10,0 2d0: 91 49 00 00 stw r10,0(r9) 2d4: 91 49 00 04 stw r10,4(r9) 2d8: 91 49 00 08 stw r10,8(r9) 2dc: 91 49 00 0c stw r10,12(r9) Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/65f76300de07091a59a042a3db2d0ce9b939a05c.1664346532.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 0d40b33184eb..0e861e59b769 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -256,8 +256,14 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p num = number_of_cells_per_pte(pmd, new, huge); - for (i = 0; i < num; i++, entry++, new += SZ_4K) - *entry = new; + for (i = 0; i < num; i += PAGE_SIZE / SZ_4K, new += PAGE_SIZE) { + *entry++ = new; + if (IS_ENABLED(CONFIG_PPC_16K_PAGES) && num != 1) { + *entry++ = new; + *entry++ = new; + *entry++ = new; + } + } return old; } From 0b4721815c5328e08c3acdee4a53890e012d830b Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Wed, 28 Sep 2022 08:29:22 +0200 Subject: [PATCH 065/181] powerpc/8xx: Reverse order entries are written by __set_pte_at() At the time being, with 16k pages __set_pte_at() writes table entries in reverse order: 294: 91 49 00 0c stw r10,12(r9) 298: 91 49 00 08 stw r10,8(r9) 29c: 91 49 00 04 stw r10,4(r9) 2a0: 91 49 00 00 stw r10,0(r9) Allthough there should be no impact at all as it stays in a single cacheline, reverse the writing in a more natural order. 288: 91 49 00 0c stw r10,0(r9) 28c: 91 49 00 08 stw r10,4(r9) 290: 91 49 00 04 stw r10,8(r9) 294: 91 49 00 00 stw r10,12(r9) Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/67c3b5d44edfec054234ea9b4d05fc4b4f7f8a0e.1664346554.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index d9067dfc531c..69c3a050a3d8 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -183,7 +183,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * cases, and 32-bit non-hash with 32-bit PTEs. */ #if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES) - ptep->pte = ptep->pte1 = ptep->pte2 = ptep->pte3 = pte_val(pte); + ptep->pte3 = ptep->pte2 = ptep->pte1 = ptep->pte = pte_val(pte); #else *ptep = pte; #endif From 5825603f67bc5ff445a1847302884154f0afa627 Mon Sep 17 00:00:00 2001 From: Joel Stanley <joel@jms.id.au> Date: Fri, 30 Sep 2022 16:20:12 +0930 Subject: [PATCH 066/181] powerpc/microwatt: Add litesd This is the register layout of the litesd peripheral for the fusesoc based Microwatt SoC. It requires a description of the system clock, which is hardcoded to 100MHz. Signed-off-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220930065012.2860577-1-joel@jms.id.au --- arch/powerpc/boot/dts/microwatt.dts | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index b69db1d275cd..269e930b3b0b 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -21,6 +21,14 @@ reg = <0x00000000 0x00000000 0x00000000 0x10000000>; }; + clocks { + sys_clk: litex_sys_clk { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <100000000>; + }; + }; + cpus { #size-cells = <0x00>; #address-cells = <0x01>; @@ -141,6 +149,20 @@ litex,slot-size = <0x800>; interrupts = <0x11 0x1>; }; + + mmc@8040000 { + compatible = "litex,mmc"; + reg = <0x8042800 0x800 + 0x8041000 0x800 + 0x8040800 0x800 + 0x8042000 0x800 + 0x8041800 0x800>; + reg-names = "phy", "core", "reader", "writer", "irq"; + bus-width = <4>; + interrupts = <0x13 1>; + cap-sd-highspeed; + clocks = <&sys_clk>; + }; }; chosen { From 3e65412709293d5fb65249408e8e801b23b72635 Mon Sep 17 00:00:00 2001 From: Michael Ellerman <mpe@ellerman.id.au> Date: Thu, 6 Oct 2022 14:20:18 +1100 Subject: [PATCH 067/181] powerpc: Make instruction dump work with scripts/decodecode Matt reported that scripts/decodecode doesn't work for the instruction dump in the powerpc oops output. Although there are scripts around that can decode it, it would be preferable if the standard in-tree script worked. All other arches prefix the instruction dump with "Code:", and that's what the script looks for, so use that. The script then works as expected: $ CROSS_COMPILE=powerpc64le-linux-gnu- ./scripts/decodecode Code: fbc1fff0 f821ffc1 7c7d1b78 7c9c2378 ebc30028 7fdff378 48000018 60000000 60000000 ebff0008 7c3ef840 41820048 <815f0060> e93f0000 5529077c 7d295378 ^D All code ======== 0: f0 ff c1 fb std r30,-16(r1) 4: c1 ff 21 f8 stdu r1,-64(r1) 8: 78 1b 7d 7c mr r29,r3 ... Note that the script doesn't cope well with printk timestamps or printk caller info. Reported-by: Matthew Wilcox <willy@infradead.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221006032019.1128624-1-mpe@ellerman.id.au --- arch/powerpc/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 67da147fe34d..3372b5c21168 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1359,7 +1359,7 @@ static void show_instructions(struct pt_regs *regs) unsigned long nip = regs->nip; unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - printk("Instruction dump:"); + printk("Code:"); /* * If we were executing with the MMU off for instructions, adjust pc From d90bb7b4fdaff3f2fa68c7af85de2ce9e70189b1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman <mpe@ellerman.id.au> Date: Thu, 6 Oct 2022 14:20:19 +1100 Subject: [PATCH 068/181] powerpc: Print instruction dump on a single line Although the previous commit made the powerpc instruction dump usable with scripts/decodecode, there are still some problems. Because the dump is split across multiple lines, the script doesn't cope with printk timestamps or caller info. That can be fixed by printing the entire dump on one line, eg: [ 12.016307][ T112] --- interrupt: c00 [ 12.016605][ T112] Code: 4b7aae15 60000000 3d22016e 3c62ffec 39291160 38639bc0 e8890000 4b7aadf9 60000000 4bfffee8 7c0802a6 60000000 <0fe00000> 60420000 3c4c008f 384268a0 [ 12.017655][ T112] ---[ end trace 0000000000000000 ]--- That output can then be piped directly into scripts/decodecode and interpreted correctly. Printing the dump on a single line does produce a very long line, about 173 characters. That is still shorter than x86, which prints nearly 200 characters even without timestamps etc. All consoles I'm aware of will wrap the line if it's too long, so the length should not be a functional problem. If anything it should help on consoles like VGA by using less vertical space. Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221006032019.1128624-2-mpe@ellerman.id.au --- arch/powerpc/kernel/process.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3372b5c21168..e3e1feaa536a 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1359,7 +1359,7 @@ static void show_instructions(struct pt_regs *regs) unsigned long nip = regs->nip; unsigned long pc = regs->nip - (NR_INSN_TO_PRINT * 3 / 4 * sizeof(int)); - printk("Code:"); + printk("Code: "); /* * If we were executing with the MMU off for instructions, adjust pc @@ -1373,9 +1373,6 @@ static void show_instructions(struct pt_regs *regs) for (i = 0; i < NR_INSN_TO_PRINT; i++) { int instr; - if (!(i % 8)) - pr_cont("\n"); - if (!__kernel_text_address(pc) || get_kernel_nofault(instr, (const void *)pc)) { pr_cont("XXXXXXXX "); From f985adaf2ff934ec869b32ca1f7f97e2825e3a49 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Thu, 6 Oct 2022 20:56:53 +1000 Subject: [PATCH 069/181] powerpc: remove the last remnants of cputime_t cputime_t was a core kernel type, removed by commits ed5c8c854f2b..b672592f0221. As explained in commit b672592f0221 ("sched/cputime: Remove generic asm headers"), the final cleanup is for the arch to provide cputime_to_nsec[s](). Commit ade7667a981b ("powerpc: Add cputime_to_nsecs()") did that, but justdidn't remove the then-unused cputime_to_usecs(), cputime_t type, and associated remnants. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221006105653.115829-1-npiggin@gmail.com --- arch/powerpc/include/asm/cputime.h | 17 +---------------- arch/powerpc/kernel/time.c | 23 ++--------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 431ae2343022..4961fb38e438 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -21,23 +21,8 @@ #include <asm/param.h> #include <asm/firmware.h> -typedef u64 __nocast cputime_t; -typedef u64 __nocast cputime64_t; - -#define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) - #ifdef __KERNEL__ -/* - * Convert cputime <-> microseconds - */ -extern u64 __cputime_usec_factor; - -static inline unsigned long cputime_to_usecs(const cputime_t ct) -{ - return mulhdu((__force u64) ct, __cputime_usec_factor); -} - -#define cputime_to_nsecs(cputime) tb_to_ns((__force u64)cputime) +#define cputime_to_nsecs(cputime) tb_to_ns(cputime) /* * PPC64 uses PACA which is task independent for storing accounting data while diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index a2ab397065c6..d68de3618741 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -130,7 +130,7 @@ unsigned long tb_ticks_per_jiffy; unsigned long tb_ticks_per_usec = 100; /* sane default */ EXPORT_SYMBOL(tb_ticks_per_usec); unsigned long tb_ticks_per_sec; -EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */ +EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime conversions */ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); @@ -150,21 +150,6 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq); bool tb_invalid; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -/* - * Factor for converting from cputime_t (timebase ticks) to - * microseconds. This is stored as 0.64 fixed-point binary fraction. - */ -u64 __cputime_usec_factor; -EXPORT_SYMBOL(__cputime_usec_factor); - -static void calc_cputime_factors(void) -{ - struct div_result res; - - div128_by_32(1000000, 0, tb_ticks_per_sec, &res); - __cputime_usec_factor = res.result_low; -} - /* * Read the SPURR on systems that have it, otherwise the PURR, * or if that doesn't exist return the timebase value passed in. @@ -369,10 +354,7 @@ void vtime_flush(struct task_struct *tsk) acct->hardirq_time = 0; acct->softirq_time = 0; } - -#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -#define calc_cputime_factors() -#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void __delay(unsigned long loops) { @@ -914,7 +896,6 @@ void __init time_init(void) tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; - calc_cputime_factors(); /* * Compute scale factor for sched_clock. From 2cb1dfac6f792f9e4a092793215f0d26e9f8d5b2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Date: Sun, 9 Oct 2022 12:49:50 +0200 Subject: [PATCH 070/181] powerpc/sysdev: Remove some duplicate prefix in some messages At the beginning of the file, we have: #define pr_fmt(fmt) "xive: " fmt So, there is no need to duplicate "XIVE:" in debug and error messages. For the records, these useless prefix have been added in commit 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/7b8b5915a2c7c1616b33e8433ebe0a0bf07070a2.1665312579.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/sysdev/xive/native.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 3925825954bc..19d880ebc5e6 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -535,13 +535,13 @@ static bool __init xive_parse_provisioning(struct device_node *np) static void __init xive_native_setup_pools(void) { /* Allocate a pool big enough */ - pr_debug("XIVE: Allocating VP block for pool size %u\n", nr_cpu_ids); + pr_debug("Allocating VP block for pool size %u\n", nr_cpu_ids); xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) - pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); + pr_err("Failed to allocate pool VP, KVM might not function\n"); - pr_debug("XIVE: Pool VPs allocated at 0x%x for %u max CPUs\n", + pr_debug("Pool VPs allocated at 0x%x for %u max CPUs\n", xive_pool_vps, nr_cpu_ids); } From 579aee9fc594af94c242068c011b0233563d4bbf Mon Sep 17 00:00:00 2001 From: Stephen Rothwell <sfr@canb.auug.org.au> Date: Mon, 10 Oct 2022 16:57:21 +1100 Subject: [PATCH 071/181] powerpc: suppress some linker warnings in recent linker versions This is a follow on from commit 0d362be5b142 ("Makefile: link with -z noexecstack --no-warn-rwx-segments") for arch/powerpc/boot to address wanrings like: ld: warning: opal-calls.o: missing .note.GNU-stack section implies executable stack ld: NOTE: This behaviour is deprecated and will be removed in a future version of the linker ld: warning: arch/powerpc/boot/zImage.epapr has a LOAD segment with RWX permissions This fixes issue https://github.com/linuxppc/issues/issues/417 Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221010165721.106267e6@canb.auug.org.au --- arch/powerpc/boot/wrapper | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 5bdd4dd20bbb..a8a87d7667f4 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -215,6 +215,11 @@ ld_version() }' } +ld_is_lld() +{ + ${CROSS}ld -V 2>&1 | grep -q LLD +} + # Do not include PT_INTERP segment when linking pie. Non-pie linking # just ignores this option. LD_VERSION=$(${CROSS}ld --version | ld_version) @@ -223,6 +228,14 @@ if [ "$LD_VERSION" -ge "$LD_NO_DL_MIN_VERSION" ] ; then nodl="--no-dynamic-linker" fi +# suppress some warnings in recent ld versions +nowarn="-z noexecstack" +if ! ld_is_lld; then + if [ "$LD_VERSION" -ge "$(echo 2.39 | ld_version)" ]; then + nowarn="$nowarn --no-warn-rwx-segments" + fi +fi + platformo=$object/"$platform".o lds=$object/zImage.lds ext=strip @@ -504,7 +517,7 @@ if [ "$platform" != "miboot" ]; then text_start="-Ttext $link_address" fi #link everything - ${CROSS}ld -m $format -T $lds $text_start $pie $nodl $rodynamic $notext -o "$ofile" $map \ + ${CROSS}ld -m $format -T $lds $text_start $pie $nodl $nowarn $rodynamic $notext -o "$ofile" $map \ $platformo $tmp $object/wrapper.a rm $tmp fi From 8b49670f3bb3f10cd4d5a6dca17f5a31b173ecdc Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 17 Oct 2022 11:23:33 +0800 Subject: [PATCH 072/181] powerpc/xive: add missing iounmap() in error path in xive_spapr_populate_irq_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If remapping 'data->trig_page' fails, the 'data->eoi_mmio' need be unmapped before returning from xive_spapr_populate_irq_data(). Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Cédric Le Goater <clg@kaod.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221017032333.1852406-1-yangyingliang@huawei.com --- arch/powerpc/sysdev/xive/spapr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index e2c8f93b535b..e45419264391 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -439,6 +439,7 @@ static int xive_spapr_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); if (!data->trig_mmio) { + iounmap(data->eoi_mmio); pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); return -ENOMEM; } From 16a3f41ff3322830683d3ccc14d77736829c61bf Mon Sep 17 00:00:00 2001 From: ruanjinjie <ruanjinjie@huawei.com> Date: Wed, 19 Oct 2022 14:34:14 +0800 Subject: [PATCH 073/181] powerpc/mpic_msgr: fix cast removes address space of expression warnings When build Linux kernel, encounter the following warnings: ./arch/powerpc/sysdev/mpic_msgr.c:230:38: warning: cast removes address space '__iomem' of expression ./arch/powerpc/sysdev/mpic_msgr.c:230:27: warning: incorrect type in assignment (different address spaces) The data type of msgr->mer and msgr->base are 'u32 __iomem *', but converted to 'u32 *' and 'u8 *' directly and cause above warnings, now instead of using a type cast, change the size of the pointer offset to fix these warnings. Signed-off-by: ruanjinjie <ruanjinjie@huawei.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221019063414.3758087-1-ruanjinjie@huawei.com --- arch/powerpc/sysdev/mpic_msgr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index a439e33eae06..d75064fb7d12 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -20,7 +20,7 @@ #define MPIC_MSGR_REGISTERS_PER_BLOCK 4 #define MPIC_MSGR_STRIDE 0x10 -#define MPIC_MSGR_MER_OFFSET 0x100 +#define MPIC_MSGR_MER_OFFSET (0x100 / sizeof(u32)) #define MSGR_INUSE 0 #define MSGR_FREE 1 @@ -234,7 +234,7 @@ static int mpic_msgr_probe(struct platform_device *dev) reg_number = block_number * MPIC_MSGR_REGISTERS_PER_BLOCK + i; msgr->base = msgr_block_addr + i * MPIC_MSGR_STRIDE; - msgr->mer = (u32 *)((u8 *)msgr->base + MPIC_MSGR_MER_OFFSET); + msgr->mer = msgr->base + MPIC_MSGR_MER_OFFSET; msgr->in_use = MSGR_FREE; msgr->num = i; raw_spin_lock_init(&msgr->lock); From 2fa9482334b0593b7edc371a13c0cca81daaa89e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Thu, 20 Oct 2022 22:58:57 +0530 Subject: [PATCH 074/181] powerpc/kprobes: Remove preempt disable around call to get_kprobe() in arch_prepare_kprobe() arch_prepare_kprobe() is called from register_kprobe() via prepare_kprobe(), or through register_aggr_kprobe(), both with the kprobe_mutex held. Per the comment for get_kprobe(): /* * This routine is called either: * - under the 'kprobe_mutex' - during kprobe_[un]register(). * OR * - with preemption disabled - from architecture specific code. */ As such, there is no need to disable preemption around the call to get_kprobe(). Drop the same. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1043d06a0affed83a4a46dd29466e72820ee215d.1666262278.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/kernel/kprobes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bd7b1a035459..88f42de681e1 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -158,9 +158,7 @@ int arch_prepare_kprobe(struct kprobe *p) printk("Cannot register a kprobe on the second word of prefixed instruction\n"); ret = -EINVAL; } - preempt_disable(); prev = get_kprobe(p->addr - 1); - preempt_enable_no_resched(); /* * When prev is a ftrace-based kprobe, we don't have an insn, and it From 04ec5d5782fb346c291a05a2efe59483d8ada4c4 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Thu, 20 Oct 2022 22:58:58 +0530 Subject: [PATCH 075/181] powerpc/kprobes: Have optimized_callback() use preempt_enable() Similar to x86 commit 2e62024c265aa6 ("kprobes/x86: Use preempt_enable() in optimized_callback()"), change powerpc optprobes to use preempt_enable() rather than preempt_enable_no_resched() since powerpc also removed irq disabling for optprobes in commit f72180cc93a2c6 ("powerpc/kprobes: Do not disable interrupts for optprobes and kprobes_on_ftrace"). Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1885bab182626c33d9bf6421f430abf924c521a5.1666262278.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/kernel/optprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 3b1c2236cbee..004fae2044a3 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -112,7 +112,7 @@ static void optimized_callback(struct optimized_kprobe *op, __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + preempt_enable(); } NOKPROBE_SYMBOL(optimized_callback); From 266b1991a433cd55bb86a933216b3f6762737d47 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Thu, 20 Oct 2022 22:58:59 +0530 Subject: [PATCH 076/181] powerpc/kprobes: Use preempt_enable() rather than the no_resched variant preempt_enable_no_resched() is just the same as preempt_enable() when we are in a irqs disabled context. kprobe_handler() and the post/fault handlers are all called with irqs disabled. As such, convert those to just use preempt_enable(). Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/72639f75fe66f931ec8c2165276ffbfb0fe1006f.1666262278.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/kernel/kprobes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 88f42de681e1..86ca5a61ea9a 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -369,7 +369,7 @@ int kprobe_handler(struct pt_regs *regs) if (ret > 0) { restore_previous_kprobe(kcb); - preempt_enable_no_resched(); + preempt_enable(); return 1; } } @@ -382,7 +382,7 @@ int kprobe_handler(struct pt_regs *regs) if (p->pre_handler && p->pre_handler(p, regs)) { /* handler changed execution path, so skip ss setup */ reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } @@ -395,7 +395,7 @@ int kprobe_handler(struct pt_regs *regs) kcb->kprobe_status = KPROBE_HIT_SSDONE; reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); return 1; } } @@ -404,7 +404,7 @@ int kprobe_handler(struct pt_regs *regs) return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } NOKPROBE_SYMBOL(kprobe_handler); @@ -490,7 +490,7 @@ int kprobe_post_handler(struct pt_regs *regs) } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, msr @@ -529,7 +529,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: From 04757c5e21ea17615b66f45e38f1cab32a7a0654 Mon Sep 17 00:00:00 2001 From: Colin Ian King <colin.i.king@gmail.com> Date: Fri, 21 Oct 2022 09:45:45 +0100 Subject: [PATCH 077/181] selftests/powerpc: Fix spelling mistake "mmaping" -> "mmapping" There is a spelling mistake in a perror message. Fix it. Signed-off-by: Colin Ian King <colin.i.king@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221021084545.65973-1-colin.i.king@gmail.com --- tools/testing/selftests/powerpc/ptrace/core-pkey.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index bbc05ffc5860..1a70a96f0bfe 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -329,7 +329,7 @@ static int parent(struct shared_info *info, pid_t pid) core = mmap(NULL, core_size, PROT_READ, MAP_PRIVATE, fd, 0); if (core == (void *) -1) { - perror("Error mmaping core file"); + perror("Error mmapping core file"); ret = TEST_FAIL; goto out; } From ad8284ead833379fc57d90e50dbae1352b116c2b Mon Sep 17 00:00:00 2001 From: Shaomin Deng <dengshaomin@cdjrlc.com> Date: Sat, 29 Oct 2022 05:46:43 -0400 Subject: [PATCH 078/181] selftests/powerpc: Remove repeated word in comments Remove the repeated word "not" in comments. Signed-off-by: Shaomin Deng <dengshaomin@cdjrlc.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221029094643.5595-1-dengshaomin@cdjrlc.com --- tools/testing/selftests/powerpc/include/pkeys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index 3312cb1b058d..51729d9a7111 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -24,7 +24,7 @@ #undef PKEY_DISABLE_EXECUTE #define PKEY_DISABLE_EXECUTE 0x4 -/* Older versions of libc do not not define this */ +/* Older versions of libc do not define this */ #ifndef SEGV_PKUERR #define SEGV_PKUERR 4 #endif From f668027521561d1071ccf54500c82a58a1918b2b Mon Sep 17 00:00:00 2001 From: Russell Currey <ruscur@russell.cc> Date: Mon, 24 Oct 2022 15:13:46 +1100 Subject: [PATCH 079/181] powerpc/8xx: Fix warning in hw_breakpoint_handler() In hw_breakpoint_handler(), ea is set by wp_get_instr_detail() except for 8xx, leading the variable to be passed uninitialised to wp_check_constraints(). This is safe as wp_check_constraints() returns early without using ea, so just set it to make the compiler happy. Signed-off-by: Russell Currey <ruscur@russell.cc> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221024041346.103608-1-ruscur@russell.cc --- arch/powerpc/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 8db1a15d7acb..e1b4e70c8fd0 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -646,7 +646,7 @@ int hw_breakpoint_handler(struct die_args *args) ppc_inst_t instr = ppc_inst(0); int type = 0; int size = 0; - unsigned long ea; + unsigned long ea = 0; /* Disable breakpoints during exception handling */ hw_breakpoint_disable(); From afa1cda4097077e37639ca7098c2147e1885b2df Mon Sep 17 00:00:00 2001 From: Bo Liu <liubo03@inspur.com> Date: Mon, 31 Oct 2022 02:37:06 -0400 Subject: [PATCH 080/181] powerpc/pseries/eeh: Fix some kernel-doc warnings Fixes the following W=1 kernel build warning(s): arch/powerpc/platforms/pseries/eeh_pseries.c:163: warning: Function parameter or member 'config_addr' not described in 'pseries_eeh_phb_reset' arch/powerpc/platforms/pseries/eeh_pseries.c:163: warning: Excess function parameter 'config_adddr' description in 'pseries_eeh_phb_reset' arch/powerpc/platforms/pseries/eeh_pseries.c:198: warning: Function parameter or member 'config_addr' not described in 'pseries_eeh_phb_configure_bridge' arch/powerpc/platforms/pseries/eeh_pseries.c:198: warning: Excess function parameter 'config_adddr' description in 'pseries_eeh_phb_configure_bridge' Signed-off-by: Bo Liu <liubo03@inspur.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221031063706.2770-1-liubo03@inspur.com --- arch/powerpc/platforms/pseries/eeh_pseries.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 8e40ccac0f44..ea890037843c 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -154,7 +154,7 @@ static int pseries_eeh_get_pe_config_addr(struct pci_dn *pdn) /** * pseries_eeh_phb_reset - Reset the specified PHB * @phb: PCI controller - * @config_adddr: the associated config address + * @config_addr: the associated config address * @option: reset option * * Reset the specified PHB/PE @@ -188,7 +188,7 @@ static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, in /** * pseries_eeh_phb_configure_bridge - Configure PCI bridges in the indicated PE * @phb: PCI controller - * @config_adddr: the associated config address + * @config_addr: the associated config address * * The function will be called to reconfigure the bridges included * in the specified PE so that the mulfunctional PE would be recovered From 59dc2d94bc12dac53a5d2368ad97ca24e7cc5682 Mon Sep 17 00:00:00 2001 From: Chen Lifu <chenlifu@huawei.com> Date: Thu, 3 Nov 2022 15:01:22 +0800 Subject: [PATCH 081/181] powerpc/powermac: Fix symbol not declared warnings 1. ppc_override_l2cr and ppc_override_l2cr_value are only used in l2cr_init() function, remove them and used *l2cr directly. 2. has_l2cache is not used outside of the file, so mark it static and do not initialise statics to 0. Fixes the following warnings: arch/powerpc/platforms/powermac/setup.c:73:5: warning: symbol 'ppc_override_l2cr' was not declared. Should it be static? arch/powerpc/platforms/powermac/setup.c:74:5: warning: symbol 'ppc_override_l2cr_value' was not declared. Should it be static? arch/powerpc/platforms/powermac/setup.c:75:5: warning: symbol 'has_l2cache' was not declared. Should it be static? Signed-off-by: Chen Lifu <chenlifu@huawei.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> [mpe: Unwrap printk string] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221103070122.340773-1-chenlifu@huawei.com --- arch/powerpc/platforms/powermac/setup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 04daa7f0a03c..4f7ee885a78f 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -70,9 +70,7 @@ #undef SHOW_GATWICK_IRQS -int ppc_override_l2cr = 0; -int ppc_override_l2cr_value; -int has_l2cache = 0; +static int has_l2cache; int pmac_newworld; @@ -236,22 +234,16 @@ static void __init l2cr_init(void) const unsigned int *l2cr = of_get_property(np, "l2cr-value", NULL); if (l2cr) { - ppc_override_l2cr = 1; - ppc_override_l2cr_value = *l2cr; _set_L2CR(0); - _set_L2CR(ppc_override_l2cr_value); + _set_L2CR(*l2cr); + pr_info("L2CR overridden (0x%x), backside cache is %s\n", + *l2cr, ((*l2cr) & 0x80000000) ? + "enabled" : "disabled"); } of_node_put(np); break; } } - - if (ppc_override_l2cr) - printk(KERN_INFO "L2CR overridden (0x%x), " - "backside cache is %s\n", - ppc_override_l2cr_value, - (ppc_override_l2cr_value & 0x80000000) - ? "enabled" : "disabled"); } #endif From 2330757e0be0acad88852e211dcd6106390a729b Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:34 -0500 Subject: [PATCH 082/181] powerpc/pseries: fix the object owners enum value in plpks driver OS_VAR_LINUX enum in PLPKS driver should be 0x02 instead of 0x01. Fixes: 2454a7af0f2a ("powerpc/pseries: define driver for Platform KeyStore") Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-2-nayna@linux.ibm.com --- arch/powerpc/platforms/pseries/plpks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/plpks.h b/arch/powerpc/platforms/pseries/plpks.h index c6a291367bb1..275ccd86bfb5 100644 --- a/arch/powerpc/platforms/pseries/plpks.h +++ b/arch/powerpc/platforms/pseries/plpks.h @@ -17,7 +17,7 @@ #define WORLDREADABLE 0x08000000 #define SIGNEDUPDATE 0x01000000 -#define PLPKS_VAR_LINUX 0x01 +#define PLPKS_VAR_LINUX 0x02 #define PLPKS_VAR_COMMON 0x04 struct plpks_var { From af223e1728c448073d1e12fe464bf344310edeba Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:35 -0500 Subject: [PATCH 083/181] powerpc/pseries: Fix the H_CALL error code in PLPKS driver PAPR Spec defines H_P1 actually as H_PARAMETER and maps H_ABORTED to a different numerical value. Fix the error codes as per PAPR Specification. Fixes: 2454a7af0f2a ("powerpc/pseries: define driver for Platform KeyStore") Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-3-nayna@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 3 +-- arch/powerpc/platforms/pseries/plpks.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 8abae463f6c1..95fd7f9485d5 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -79,7 +79,7 @@ #define H_NOT_ENOUGH_RESOURCES -44 #define H_R_STATE -45 #define H_RESCINDED -46 -#define H_P1 -54 +#define H_ABORTED -54 #define H_P2 -55 #define H_P3 -56 #define H_P4 -57 @@ -100,7 +100,6 @@ #define H_COP_HW -74 #define H_STATE -75 #define H_IN_USE -77 -#define H_ABORTED -78 #define H_UNSUPPORTED_FLAG_START -256 #define H_UNSUPPORTED_FLAG_END -511 #define H_MULTI_THREADS_ACTIVE -9005 diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index f4b5b5a64db3..32ce4d780d8f 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -75,7 +75,7 @@ static int pseries_status_to_err(int rc) case H_FUNCTION: err = -ENXIO; break; - case H_P1: + case H_PARAMETER: case H_P2: case H_P3: case H_P4: From bb8e4c7cb759b90a04f2e94056b50288ff46a0ed Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:36 -0500 Subject: [PATCH 084/181] powerpc/pseries: Return -EIO instead of -EINTR for H_ABORTED error Some commands for eg. "cat" might continue to retry on encountering EINTR. This is not expected for original error code H_ABORTED. Map H_ABORTED to more relevant Linux error code EIO. Fixes: 2454a7af0f2a ("powerpc/pseries: define driver for Platform KeyStore") Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-4-nayna@linux.ibm.com --- arch/powerpc/platforms/pseries/plpks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index 32ce4d780d8f..cbea447122ca 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -111,7 +111,7 @@ static int pseries_status_to_err(int rc) err = -EEXIST; break; case H_ABORTED: - err = -EINTR; + err = -EIO; break; default: err = -EINVAL; From 8888ea772972323362660e9a1339175294664a6c Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:37 -0500 Subject: [PATCH 085/181] powerpc/pseries: cleanup error logs in plpks driver Logging H_CALL return codes in PLPKS driver are easy to confuse with Linux error codes. Let the caller of the function log the converted linux error code. Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-5-nayna@linux.ibm.com --- arch/powerpc/platforms/pseries/plpks.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index cbea447122ca..72d9debf18c0 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -312,10 +312,6 @@ int plpks_write_var(struct plpks_var var) if (!rc) rc = plpks_confirm_object_flushed(label, auth); - if (rc) - pr_err("Failed to write variable %s for component %s with error %d\n", - var.name, var.component, rc); - rc = pseries_status_to_err(rc); kfree(label); out: @@ -350,10 +346,6 @@ int plpks_remove_var(char *component, u8 varos, struct plpks_var_name vname) if (!rc) rc = plpks_confirm_object_flushed(label, auth); - if (rc) - pr_err("Failed to remove variable %s for component %s with error %d\n", - vname.name, component, rc); - rc = pseries_status_to_err(rc); kfree(label); out: @@ -395,8 +387,6 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) maxobjsize); if (rc != H_SUCCESS) { - pr_err("Failed to read variable %s for component %s with error %d\n", - var->name, var->component, rc); rc = pseries_status_to_err(rc); goto out_free_output; } From 212dd5cfbee7815f3c665a51c501701edb881599 Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:38 -0500 Subject: [PATCH 086/181] powerpc/pseries: replace kmalloc with kzalloc in PLPKS driver Replace kmalloc with kzalloc in construct_auth() function to default initialize structure with zeroes. Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-6-nayna@linux.ibm.com --- arch/powerpc/platforms/pseries/plpks.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index 72d9debf18c0..e8c02735b702 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -162,19 +162,15 @@ static struct plpks_auth *construct_auth(u8 consumer) if (consumer > PKS_OS_OWNER) return ERR_PTR(-EINVAL); - auth = kmalloc(struct_size(auth, password, maxpwsize), GFP_KERNEL); + auth = kzalloc(struct_size(auth, password, maxpwsize), GFP_KERNEL); if (!auth) return ERR_PTR(-ENOMEM); auth->version = 1; auth->consumer = consumer; - auth->rsvd0 = 0; - auth->rsvd1 = 0; - if (consumer == PKS_FW_OWNER || consumer == PKS_BOOTLOADER_OWNER) { - auth->passwordlength = 0; + if (consumer == PKS_FW_OWNER || consumer == PKS_BOOTLOADER_OWNER) return auth; - } memcpy(auth->password, ospassword, ospasswordlength); From 1f622f3f80cbf8999ff5955a2fcfbd801a1f32e0 Mon Sep 17 00:00:00 2001 From: Nayna Jain <nayna@linux.ibm.com> Date: Sun, 6 Nov 2022 15:58:39 -0500 Subject: [PATCH 087/181] powerpc/pseries: fix plpks_read_var() code for different consumers Even though plpks_read_var() is currently called to read variables owned by different consumers, it internally supports only OS consumer. Fix plpks_read_var() to handle different consumers correctly. Fixes: 2454a7af0f2a ("powerpc/pseries: define driver for Platform KeyStore") Signed-off-by: Nayna Jain <nayna@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221106205839.600442-7-nayna@linux.ibm.com --- arch/powerpc/platforms/pseries/plpks.c | 28 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index e8c02735b702..4edd1585e245 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -354,22 +354,24 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) { unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 }; struct plpks_auth *auth; - struct label *label; + struct label *label = NULL; u8 *output; int rc; if (var->namelen > MAX_NAME_SIZE) return -EINVAL; - auth = construct_auth(PKS_OS_OWNER); + auth = construct_auth(consumer); if (IS_ERR(auth)) return PTR_ERR(auth); - label = construct_label(var->component, var->os, var->name, - var->namelen); - if (IS_ERR(label)) { - rc = PTR_ERR(label); - goto out_free_auth; + if (consumer == PKS_OS_OWNER) { + label = construct_label(var->component, var->os, var->name, + var->namelen); + if (IS_ERR(label)) { + rc = PTR_ERR(label); + goto out_free_auth; + } } output = kzalloc(maxobjsize, GFP_KERNEL); @@ -378,9 +380,15 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var) goto out_free_label; } - rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), - virt_to_phys(label), label->size, virt_to_phys(output), - maxobjsize); + if (consumer == PKS_OS_OWNER) + rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(label), label->size, virt_to_phys(output), + maxobjsize); + else + rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth), + virt_to_phys(var->name), var->namelen, virt_to_phys(output), + maxobjsize); + if (rc != H_SUCCESS) { rc = pseries_status_to_err(rc); From a9ffb8ee7b65a468474d6a2be7e9cca4b8f8ea5f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Fri, 18 Nov 2022 17:40:29 +0800 Subject: [PATCH 088/181] powerpc: Use "grep -E" instead of "egrep" The latest version of grep claims the egrep is now obsolete so the build now contains warnings that look like: egrep: warning: egrep is obsolescent; using grep -E fix this up by moving the related file to use "grep -E" instead. sed -i "s/egrep/grep -E/g" `grep egrep -rwl arch/powerpc` Here are the steps to install the latest grep: wget http://ftp.gnu.org/gnu/grep/grep-3.8.tar.gz tar xf grep-3.8.tar.gz cd grep-3.8 && ./configure && make sudo make install export PATH=/usr/local/bin:$PATH Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1668764429-11540-1-git-send-email-yangtiezhu@loongson.cn --- arch/powerpc/boot/wrapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index a8a87d7667f4..af04cea82b94 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -594,7 +594,7 @@ ps3) # reached, then enter the system reset vector of the partially decompressed # image. No warning is issued. rm -f "$odir"/{otheros,otheros-too-big}.bld - size=$(${CROSS}nm --no-sort --radix=d "$ofile" | egrep ' _end$' | cut -d' ' -f1) + size=$(${CROSS}nm --no-sort --radix=d "$ofile" | grep -E ' _end$' | cut -d' ' -f1) bld="otheros.bld" if [ $size -gt $((0x1000000)) ]; then bld="otheros-too-big.bld" From 6c645b01e536757a9e1a9f72c13767f9b3f8559f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:26 +1000 Subject: [PATCH 089/181] KVM: PPC: Book3E: Fix CONFIG_TRACE_IRQFLAGS support 32-bit does not trace_irqs_off() to match the trace_irqs_on() call in kvmppc_fix_ee_before_entry(). This can lead to irqs being enabled twice in the trace, and the irqs-off region between guest exit and the host enabling local irqs again is not properly traced. 64-bit code does call this, but from asm code where volatiles are live and so incorrectly get clobbered. Move the irq reconcile into C to fix both problems. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-2-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_ppc.h | 12 ++++++++++++ arch/powerpc/kvm/booke.c | 3 +++ arch/powerpc/kvm/bookehv_interrupts.S | 9 --------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bfacf12784dd..eae9619b6190 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -1014,6 +1014,18 @@ static inline void kvmppc_fix_ee_before_entry(void) #endif } +static inline void kvmppc_fix_ee_after_exit(void) +{ +#ifdef CONFIG_PPC64 + /* Only need to enable IRQs by hard enabling them after this */ + local_paca->irq_happened = PACA_IRQ_HARD_DIS; + irq_soft_mask_set(IRQS_ALL_DISABLED); +#endif + + trace_hardirqs_off(); +} + + static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) { ulong ea; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7b4920e9fd26..0dce93ccaadf 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1015,6 +1015,9 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr) u32 last_inst = KVM_INST_FETCH_FAILED; enum emulation_result emulated = EMULATE_DONE; + /* Fix irq state (pairs with kvmppc_fix_ee_before_entry()) */ + kvmppc_fix_ee_after_exit(); + /* update before a new last_exit_type is rewritten */ kvmppc_update_timing_stats(vcpu); diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S index 8262c14fc9e6..b5fe6fb53c66 100644 --- a/arch/powerpc/kvm/bookehv_interrupts.S +++ b/arch/powerpc/kvm/bookehv_interrupts.S @@ -424,15 +424,6 @@ _GLOBAL(kvmppc_resume_host) mtspr SPRN_EPCR, r3 isync -#ifdef CONFIG_64BIT - /* - * We enter with interrupts disabled in hardware, but - * we need to call RECONCILE_IRQ_STATE to ensure - * that the software state is kept in sync. - */ - RECONCILE_IRQ_STATE(r3,r5) -#endif - /* Switch to kernel stack and jump to handler. */ mr r3, r4 mr r5, r14 /* intno */ From dea681c91d3cd5326f87d0a3c93079573e22ce9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig <hch@lst.de> Date: Tue, 22 Nov 2022 08:22:25 +0100 Subject: [PATCH 090/181] powerpc/ps3: mark ps3_system_bus_type static ps3_system_bus_type is only used inside of system-bus.c, so remove the external declaration and the very outdated comment next to it. Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Geoff Levand <geoff@infradead.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221122072225.423432-1-hch@lst.de --- arch/powerpc/include/asm/ps3.h | 4 ---- arch/powerpc/platforms/ps3/system-bus.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h index 8a0d8fb35328..d503dbd7856c 100644 --- a/arch/powerpc/include/asm/ps3.h +++ b/arch/powerpc/include/asm/ps3.h @@ -425,10 +425,6 @@ static inline void *ps3_system_bus_get_drvdata( return dev_get_drvdata(&dev->core); } -/* These two need global scope for get_arch_dma_ops(). */ - -extern struct bus_type ps3_system_bus_type; - /* system manager */ struct ps3_sys_manager_ops { diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 2502e9b17df4..38a7e02295c8 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -466,7 +466,7 @@ static struct attribute *ps3_system_bus_dev_attrs[] = { }; ATTRIBUTE_GROUPS(ps3_system_bus_dev); -struct bus_type ps3_system_bus_type = { +static struct bus_type ps3_system_bus_type = { .name = "ps3_system_bus", .match = ps3_system_bus_match, .uevent = ps3_system_bus_uevent, From 71ae6305ad41cfd1ac5aa91d356e71c7a537df2e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Tue, 22 Nov 2022 12:10:52 +0530 Subject: [PATCH 091/181] selftests/powerpc: Move perror closer to its use Right now, if perf_event_open() fails for the systemwide tests, error report is printed too late, sometimes after subsequent system calls. Move use of perror() to the main function, just after the syscall. Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/372ac78c27899f1f612fbd6ac796604a4a9310aa.1669096083.git.naveen.n.rao@linux.vnet.ibm.com --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 21 +++++-------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index ecde2c199f3b..ea5e14ecbf30 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -146,6 +146,7 @@ static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len for (i = 0; i < nprocs; i++) { fd[i] = perf_cpu_event_open(i, type, addr, len); if (fd[i] < 0) { + perror("perf_systemwide_event_open"); close_fds(fd, i); return fd[i]; } @@ -543,15 +544,12 @@ static int test_syswide_multi_diff_addr(void) int ret; ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); - if (ret) { - perror("perf_systemwide_event_open"); + if (ret) exit(EXIT_FAILURE); - } ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&b, (__u64)sizeof(b)); if (ret) { close_fds(fd1, nprocs); - perror("perf_systemwide_event_open"); exit(EXIT_FAILURE); } @@ -590,15 +588,12 @@ static int test_syswide_multi_same_addr(void) int ret; ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); - if (ret) { - perror("perf_systemwide_event_open"); + if (ret) exit(EXIT_FAILURE); - } ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_RW, (__u64)&a, (__u64)sizeof(a)); if (ret) { close_fds(fd1, nprocs); - perror("perf_systemwide_event_open"); exit(EXIT_FAILURE); } @@ -637,15 +632,12 @@ static int test_syswide_multi_diff_addr_ro_wo(void) int ret; ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); - if (ret) { - perror("perf_systemwide_event_open"); + if (ret) exit(EXIT_FAILURE); - } ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&b, (__u64)sizeof(b)); if (ret) { close_fds(fd1, nprocs); - perror("perf_systemwide_event_open"); exit(EXIT_FAILURE); } @@ -684,15 +676,12 @@ static int test_syswide_multi_same_addr_ro_wo(void) int ret; ret = perf_systemwide_event_open(fd1, HW_BREAKPOINT_W, (__u64)&a, (__u64)sizeof(a)); - if (ret) { - perror("perf_systemwide_event_open"); + if (ret) exit(EXIT_FAILURE); - } ret = perf_systemwide_event_open(fd2, HW_BREAKPOINT_R, (__u64)&a, (__u64)sizeof(a)); if (ret) { close_fds(fd1, nprocs); - perror("perf_systemwide_event_open"); exit(EXIT_FAILURE); } From 616ad3f4aac287c48b66c92cb777395b4465ed4f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Tue, 22 Nov 2022 12:10:53 +0530 Subject: [PATCH 092/181] selftests/powerpc: Bump up rlimit for perf-hwbreak test The systemwide perf hardware breakpoint test tries to open a perf event on each cpu. On large systems, we run out of file descriptors and fail the test. Instead, have the test set the file descriptor limit to an arbitraty high value. Reported-by: Rohan Deshpande <rohan_d@linux.vnet.ibm.com> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/187fed5843cecc1e5066677b6296ee88337d7bef.1669096083.git.naveen.n.rao@linux.vnet.ibm.com --- .../testing/selftests/powerpc/ptrace/perf-hwbreak.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index ea5e14ecbf30..866e5be48ee3 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -26,6 +26,7 @@ #include <sys/ioctl.h> #include <sys/wait.h> #include <sys/ptrace.h> +#include <sys/resource.h> #include <sys/sysinfo.h> #include <asm/ptrace.h> #include <elf.h> @@ -140,8 +141,19 @@ static void disable_fds(int *fd, int n) static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len) { + struct rlimit rlim; int i = 0; + if (getrlimit(RLIMIT_NOFILE, &rlim)) { + perror("getrlimit"); + return -1; + } + rlim.rlim_cur = 65536; + if (setrlimit(RLIMIT_NOFILE, &rlim)) { + perror("setrlimit"); + return -1; + } + /* Assume online processors are 0 to nprocs for simplisity */ for (i = 0; i < nprocs; i++) { fd[i] = perf_cpu_event_open(i, type, addr, len); From 260095926d3956071c6699a28824c3f0fa7cd97a Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com> Date: Tue, 22 Nov 2022 12:10:54 +0530 Subject: [PATCH 093/181] selftests/powerpc: Account for offline cpus in perf-hwbreak test For systemwide tests, use online cpu mask to only open events on online cpus. This enables this test to work on systems in lower SMT modes. Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/15fd447dcefd19945a7d31f0a475349f548a3603.1669096083.git.naveen.n.rao@linux.vnet.ibm.com --- .../selftests/powerpc/ptrace/perf-hwbreak.c | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c index 866e5be48ee3..f75739bbad28 100644 --- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c @@ -17,8 +17,11 @@ * Copyright (C) 2018 Michael Neuling, IBM Corporation. */ +#define _GNU_SOURCE + #include <unistd.h> #include <assert.h> +#include <sched.h> #include <stdio.h> #include <stdlib.h> #include <signal.h> @@ -141,8 +144,10 @@ static void disable_fds(int *fd, int n) static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len) { + int i, ncpus, cpu, ret = 0; struct rlimit rlim; - int i = 0; + cpu_set_t *mask; + size_t size; if (getrlimit(RLIMIT_NOFILE, &rlim)) { perror("getrlimit"); @@ -154,16 +159,44 @@ static int perf_systemwide_event_open(int *fd, __u32 type, __u64 addr, __u64 len return -1; } - /* Assume online processors are 0 to nprocs for simplisity */ - for (i = 0; i < nprocs; i++) { - fd[i] = perf_cpu_event_open(i, type, addr, len); + ncpus = get_nprocs_conf(); + size = CPU_ALLOC_SIZE(ncpus); + mask = CPU_ALLOC(ncpus); + if (!mask) { + perror("malloc"); + return -1; + } + + CPU_ZERO_S(size, mask); + + if (sched_getaffinity(0, size, mask)) { + perror("sched_getaffinity"); + ret = -1; + goto done; + } + + for (i = 0, cpu = 0; i < nprocs && cpu < ncpus; cpu++) { + if (!CPU_ISSET_S(cpu, size, mask)) + continue; + fd[i] = perf_cpu_event_open(cpu, type, addr, len); if (fd[i] < 0) { perror("perf_systemwide_event_open"); close_fds(fd, i); - return fd[i]; + ret = fd[i]; + goto done; } + i++; } - return 0; + + if (i < nprocs) { + printf("Error: Number of online cpus reduced since start of test: %d < %d\n", i, nprocs); + close_fds(fd, i); + ret = -1; + } + +done: + CPU_FREE(mask); + return ret; } static inline bool breakpoint_test(int len) From d5090716be6791ada9ee142163a4934c1c147aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <linux@weissschuh.net> Date: Sat, 26 Nov 2022 06:10:00 +0100 Subject: [PATCH 094/181] powerpc/book3e: remove #include <generated/utsrelease.h> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7ad4bd887d27 ("powerpc/book3e: get rid of #include <generated/compile.h>") removed the usage of the define UTS_RELEASE but forgot to drop the include. utsrelease.h is potentially generated on each build. By removing the unused include we can get rid of some spurious recompilations. Fixes: 7ad4bd887d27 ("powerpc/book3e: get rid of #include <generated/compile.h>") Signed-off-by: Thomas Weißschuh <linux@weissschuh.net> Reviewed-by: Masahiro Yamada <masahiroy@kernel.org> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> [mpe: Fix typo in change log and add more explanation] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126051002.123199-2-linux@weissschuh.net --- arch/powerpc/mm/nohash/kaslr_booke.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index 0d04f9d5da8d..2fb3edafe9ab 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -19,7 +19,6 @@ #include <asm/cacheflush.h> #include <asm/kdump.h> #include <mm/mmu_decl.h> -#include <generated/utsrelease.h> struct regions { unsigned long pa_start; From 67bbb62f61e810734da0a1577a9802ddaed24140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org> Date: Fri, 30 Sep 2022 14:39:01 +0200 Subject: [PATCH 095/181] powerpc: dts: turris1x.dts: Add channel labels for temperature sensor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Channel 0 of SA56004ED chip refers to internal SA56004ED chip sensor (chip itself is located on the board) and channel 1 of SA56004ED chip refers to external sensor which is connected to temperature diode of the P2020 CPU. Fixes: 54c15ec3b738 ("powerpc: dts: Add DTS file for CZ.NIC Turris 1.x routers") Signed-off-by: Pali Rohár <pali@kernel.org> Reviewed-by: Marek Behún <kabel@kernel.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220930123901.10251-1-pali@kernel.org --- arch/powerpc/boot/dts/turris1x.dts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/boot/dts/turris1x.dts b/arch/powerpc/boot/dts/turris1x.dts index 045af668e928..e9cda34a140e 100644 --- a/arch/powerpc/boot/dts/turris1x.dts +++ b/arch/powerpc/boot/dts/turris1x.dts @@ -69,6 +69,20 @@ interrupt-parent = <&gpio>; interrupts = <12 IRQ_TYPE_LEVEL_LOW>, /* GPIO12 - ALERT pin */ <13 IRQ_TYPE_LEVEL_LOW>; /* GPIO13 - CRIT pin */ + #address-cells = <1>; + #size-cells = <0>; + + /* Local temperature sensor (SA56004ED internal) */ + channel@0 { + reg = <0>; + label = "board"; + }; + + /* Remote temperature sensor (D+/D- connected to P2020 CPU Temperature Diode) */ + channel@1 { + reg = <1>; + label = "cpu"; + }; }; /* DDR3 SPD/EEPROM */ From e082e99f6f87f5204b2531d5a3db7bbd929d23b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org> Date: Sat, 20 Aug 2022 14:33:27 +0200 Subject: [PATCH 096/181] powerpc/fsl-pci: Choose PCI host bridge with alias pci0 as the primary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If there's no PCI host bridge with ISA then check for PCI host bridge with alias "pci0" (first PCI host bridge) and if it exists then choose it as the primary PCI host bridge. This makes choice of primary PCI host bridge more stable across boots and updates as the last fallback candidate for primary PCI host bridge (if there is no choice) is selected arbitrary. Signed-off-by: Pali Rohár <pali@kernel.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20220820123327.20551-1-pali@kernel.org --- arch/powerpc/sysdev/fsl_pci.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 974d3db6faab..b7232c46b244 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -1138,6 +1138,19 @@ void __init fsl_pci_assign_primary(void) return; } + /* + * If there's no PCI host bridge with ISA then check for + * PCI host bridge with alias "pci0" (first PCI host bridge). + */ + np = of_find_node_by_path("pci0"); + if (np && of_match_node(pci_ids, np) && of_device_is_available(np)) { + fsl_pci_primary = np; + of_node_put(np); + return; + } + if (np) + of_node_put(np); + /* * If there's no PCI host bridge with ISA, arbitrarily * designate one as primary. This can go away once From 3671f4ebe3eb12e7222e4d7b0f94e85cfe34253a Mon Sep 17 00:00:00 2001 From: Jordan Niethe <jniethe5@gmail.com> Date: Wed, 9 Nov 2022 15:51:04 +1100 Subject: [PATCH 097/181] powerpc: Allow clearing and restoring registers independent of saved breakpoint state For the coming temporary mm used for instruction patching, the breakpoint registers need to be cleared to prevent them from accidentally being triggered. As soon as the patching is done, the breakpoints will be restored. The breakpoint state is stored in the per-cpu variable current_brk[]. Add a suspend_breakpoints() function which will clear the breakpoint registers without touching the state in current_brk[]. Add a pair function restore_breakpoints() which will move the state in current_brk[] back to the registers. Signed-off-by: Jordan Niethe <jniethe5@gmail.com> Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-2-bgray@linux.ibm.com --- arch/powerpc/include/asm/debug.h | 2 ++ arch/powerpc/kernel/process.c | 38 +++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index 86a14736c76c..51c744608f37 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -46,6 +46,8 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk); +void suspend_breakpoints(void); +void restore_breakpoints(void); bool ppc_breakpoint_available(void); #ifdef CONFIG_PPC_ADV_DEBUG_REGS extern void do_send_trap(struct pt_regs *regs, unsigned long address, diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e3e1feaa536a..5265da2d8034 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -862,10 +862,8 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk) return 0; } -void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) +static void set_hw_breakpoint(int nr, struct arch_hw_breakpoint *brk) { - memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); - if (dawr_enabled()) // Power8 or later set_dawr(nr, brk); @@ -879,6 +877,12 @@ void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) WARN_ON_ONCE(1); } +void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk) +{ + memcpy(this_cpu_ptr(¤t_brk[nr]), brk, sizeof(*brk)); + set_hw_breakpoint(nr, brk); +} + /* Check if we have DAWR or DABR hardware */ bool ppc_breakpoint_available(void) { @@ -891,6 +895,34 @@ bool ppc_breakpoint_available(void) } EXPORT_SYMBOL_GPL(ppc_breakpoint_available); +/* Disable the breakpoint in hardware without touching current_brk[] */ +void suspend_breakpoints(void) +{ + struct arch_hw_breakpoint brk = {0}; + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, &brk); +} + +/* + * Re-enable breakpoints suspended by suspend_breakpoints() in hardware + * from current_brk[] + */ +void restore_breakpoints(void) +{ + int i; + + if (!ppc_breakpoint_available()) + return; + + for (i = 0; i < nr_wp_slots(); i++) + set_hw_breakpoint(i, this_cpu_ptr(¤t_brk[i])); +} + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline bool tm_enabled(struct task_struct *tsk) From 071c95c1acbd96e76bab8b25b5cad0d71a011f37 Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:05 +1100 Subject: [PATCH 098/181] powerpc/code-patching: Use WARN_ON and fix check in poking_init BUG_ON() when failing to initialise the code patching window is unnecessary, and use of BUG_ON is discouraged. We don't set poking_init_done in this case, so failure to init the boot CPU will result in a strict RWX error when a following patch_instruction uses raw_patch_instruction. If it only fails for later CPUs, they won't be onlined in the first place. The return value of cpuhp_setup_state() is also >= 0 on success, so check for < 0. Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-3-bgray@linux.ibm.com --- arch/powerpc/lib/code-patching.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index ad0cf3108dd0..3055eef7dcdc 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -81,16 +81,17 @@ static int text_area_cpu_down(unsigned int cpu) static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done); -/* - * Although BUG_ON() is rude, in this case it should only happen if ENOMEM, and - * we judge it as being preferable to a kernel that will crash later when - * someone tries to use patch_instruction(). - */ void __init poking_init(void) { - BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "powerpc/text_poke:online", text_area_cpu_up, - text_area_cpu_down)); + int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", + text_area_cpu_up, + text_area_cpu_down); + + /* cpuhp_setup_state returns >= 0 on success */ + if (WARN_ON(ret < 0)) + return; + static_branch_enable(&poking_init_done); } From baf1ed24b27db475b38f534953885d0425e2232d Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:07 +1100 Subject: [PATCH 099/181] powerpc/mm: Remove empty hash__ functions The empty hash__* functions are unnecessary. The empty definitions were introduced when 64-bit Hash support was added, as the functions were still used in generic code. These empty definitions were prefixed with hash__ when Radix support was added, and new wrappers with the original names were added that selected the Radix or Hash version based on radix_enabled(). But the hash__ prefixed functions were not part of a public interface, so there is no need to include them for compatibility with anything. Generic code will use the non-prefixed wrappers, and Hash specific code will know that there is no point in calling them (or even worse, call them and expect them to do something). Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-5-bgray@linux.ibm.com --- .../include/asm/book3s/64/tlbflush-hash.h | 28 ------------------- arch/powerpc/include/asm/book3s/64/tlbflush.h | 27 ++++++------------ 2 files changed, 9 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 751921f6db46..a9ef40dc263e 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -65,13 +65,6 @@ extern void flush_hash_range(unsigned long number, int local); extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, pmd_t *pmdp, unsigned int psize, int ssize, unsigned long flags); -static inline void hash__local_flush_tlb_mm(struct mm_struct *mm) -{ -} - -static inline void hash__flush_tlb_mm(struct mm_struct *mm) -{ -} static inline void hash__local_flush_all_mm(struct mm_struct *mm) { @@ -95,27 +88,6 @@ static inline void hash__flush_all_mm(struct mm_struct *mm) WARN_ON_ONCE(1); } -static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ -} - -static inline void hash__flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ -} - -static inline void hash__flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void hash__flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ -} - - struct mmu_gather; extern void hash__tlb_flush(struct mmu_gather *tlb); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 67655cd60545..2254a40f0564 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -47,8 +47,7 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (radix_enabled()) - return radix__flush_pmd_tlb_range(vma, start, end); - return hash__flush_tlb_range(vma, start, end); + radix__flush_pmd_tlb_range(vma, start, end); } #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE @@ -57,39 +56,34 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long end) { if (radix_enabled()) - return radix__flush_hugetlb_tlb_range(vma, start, end); - return hash__flush_tlb_range(vma, start, end); + radix__flush_hugetlb_tlb_range(vma, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (radix_enabled()) - return radix__flush_tlb_range(vma, start, end); - return hash__flush_tlb_range(vma, start, end); + radix__flush_tlb_range(vma, start, end); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { if (radix_enabled()) - return radix__flush_tlb_kernel_range(start, end); - return hash__flush_tlb_kernel_range(start, end); + radix__flush_tlb_kernel_range(start, end); } static inline void local_flush_tlb_mm(struct mm_struct *mm) { if (radix_enabled()) - return radix__local_flush_tlb_mm(mm); - return hash__local_flush_tlb_mm(mm); + radix__local_flush_tlb_mm(mm); } static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { if (radix_enabled()) - return radix__local_flush_tlb_page(vma, vmaddr); - return hash__local_flush_tlb_page(vma, vmaddr); + radix__local_flush_tlb_page(vma, vmaddr); } static inline void local_flush_all_mm(struct mm_struct *mm) @@ -102,24 +96,21 @@ static inline void local_flush_all_mm(struct mm_struct *mm) static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) - return radix__tlb_flush(tlb); - return hash__tlb_flush(tlb); + radix__tlb_flush(tlb); } #ifdef CONFIG_SMP static inline void flush_tlb_mm(struct mm_struct *mm) { if (radix_enabled()) - return radix__flush_tlb_mm(mm); - return hash__flush_tlb_mm(mm); + radix__flush_tlb_mm(mm); } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { if (radix_enabled()) - return radix__flush_tlb_page(vma, vmaddr); - return hash__flush_tlb_page(vma, vmaddr); + radix__flush_tlb_page(vma, vmaddr); } static inline void flush_all_mm(struct mm_struct *mm) From 0f0a0a6091e678b1a75078ecd6b02176f3228dbb Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:08 +1100 Subject: [PATCH 100/181] cxl: Use radix__flush_all_mm instead of generic flush_all_mm The generic implementation of this function isn't really generic (Hash is not implemented). Unfortunately, the runtime warnings cannot be replaced with BUILD_BUG's, so it seems safer not to provide a stub in the first place. Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-6-bgray@linux.ibm.com --- arch/powerpc/include/asm/mmu_context.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index c1ea270bb848..57f5017111f4 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -151,8 +151,8 @@ static inline void mm_context_remove_copro(struct mm_struct *mm) * nMMU and/or PSL need to be cleaned up. * * Both the 'copros' and 'active_cpus' counts are looked at in - * flush_all_mm() to determine the scope (local/global) of the - * TLBIs, so we need to flush first before decrementing + * radix__flush_all_mm() to determine the scope (local/global) + * of the TLBIs, so we need to flush first before decrementing * 'copros'. If this API is used by several callers for the * same context, it can lead to over-flushing. It's hopefully * not common enough to be a problem. @@ -164,7 +164,7 @@ static inline void mm_context_remove_copro(struct mm_struct *mm) * in-between. */ if (radix_enabled()) { - flush_all_mm(mm); + radix__flush_all_mm(mm); c = atomic_dec_if_positive(&mm->context.copros); /* Detect imbalance between add and remove */ From d34471c9bd5d47ab148dd68817631a4238f755c4 Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:09 +1100 Subject: [PATCH 101/181] powerpc/mm: Remove flush_all_mm, local_flush_all_mm These functions were introduced for "cxl: Enable global TLBIs for cxl contexts" [1], which ended up using them for Radix only. They were never implemented on Hash (and creating an implementation appears to be difficult), so nothing can actually rely on them. They behave differently to the existing surrounding functions too, in that they actually need to do something on Hash. The other functions are primarily for use in generic code that expects their definitions, but Hash updates the TLB during PTE updates. After replacing the only usage with the Radix specific version, there are no more users of these functions, and given they are not implemented anyway it is safe to delete them. [1]: https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20170903181513.29635-1-fbarrat@linux.vnet.ibm.com/ Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-7-bgray@linux.ibm.com --- .../include/asm/book3s/64/tlbflush-hash.h | 22 ------------------- arch/powerpc/include/asm/book3s/64/tlbflush.h | 15 ------------- 2 files changed, 37 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index a9ef40dc263e..146287d9580f 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -66,28 +66,6 @@ extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, pmd_t *pmdp, unsigned int psize, int ssize, unsigned long flags); -static inline void hash__local_flush_all_mm(struct mm_struct *mm) -{ - /* - * There's no Page Walk Cache for hash, so what is needed is - * the same as flush_tlb_mm(), which doesn't really make sense - * with hash. So the only thing we could do is flush the - * entire LPID! Punt for now, as it's not being used. - */ - WARN_ON_ONCE(1); -} - -static inline void hash__flush_all_mm(struct mm_struct *mm) -{ - /* - * There's no Page Walk Cache for hash, so what is needed is - * the same as flush_tlb_mm(), which doesn't really make sense - * with hash. So the only thing we could do is flush the - * entire LPID! Punt for now, as it's not being used. - */ - WARN_ON_ONCE(1); -} - struct mmu_gather; extern void hash__tlb_flush(struct mmu_gather *tlb); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 2254a40f0564..c56a0aee8124 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -86,13 +86,6 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, radix__local_flush_tlb_page(vma, vmaddr); } -static inline void local_flush_all_mm(struct mm_struct *mm) -{ - if (radix_enabled()) - return radix__local_flush_all_mm(mm); - return hash__local_flush_all_mm(mm); -} - static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) @@ -112,17 +105,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, if (radix_enabled()) radix__flush_tlb_page(vma, vmaddr); } - -static inline void flush_all_mm(struct mm_struct *mm) -{ - if (radix_enabled()) - return radix__flush_all_mm(mm); - return hash__flush_all_mm(mm); -} #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) -#define flush_all_mm(mm) local_flush_all_mm(mm) #endif /* CONFIG_SMP */ #define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault From 274d842fa1efd9449e62222c8896e0be11621f1f Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:10 +1100 Subject: [PATCH 102/181] powerpc/tlb: Add local flush for page given mm_struct and psize Adds a local TLB flush operation that works given an mm_struct, VA to flush, and page size representation. Most implementations mirror the surrounding code. The book3s/32/tlbflush.h implementation is left as a BUILD_BUG because it is more complicated and not required for anything as yet. This removes the need to create a vm_area_struct, which the temporary patching mm work does not need. Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-8-bgray@linux.ibm.com --- arch/powerpc/include/asm/book3s/32/tlbflush.h | 9 +++++++++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 7 +++++++ arch/powerpc/include/asm/nohash/tlbflush.h | 7 +++++++ arch/powerpc/mm/nohash/tlb.c | 8 ++++++++ 4 files changed, 31 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/tlbflush.h b/arch/powerpc/include/asm/book3s/32/tlbflush.h index ba1743c52b56..4be572908124 100644 --- a/arch/powerpc/include/asm/book3s/32/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/32/tlbflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H #define _ASM_POWERPC_BOOK3S_32_TLBFLUSH_H +#include <linux/build_bug.h> + #define MMU_NO_CONTEXT (0) /* * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx @@ -74,6 +76,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, { flush_tlb_page(vma, vmaddr); } + +static inline void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + BUILD_BUG(); +} + static inline void local_flush_tlb_mm(struct mm_struct *mm) { flush_tlb_mm(mm); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index c56a0aee8124..dd39313242b4 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -86,6 +86,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, radix__local_flush_tlb_page(vma, vmaddr); } +static inline void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + if (radix_enabled()) + radix__local_flush_tlb_page_psize(mm, vmaddr, psize); +} + static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) diff --git a/arch/powerpc/include/asm/nohash/tlbflush.h b/arch/powerpc/include/asm/nohash/tlbflush.h index bdaf34ad41ea..9a2cf83ea4f1 100644 --- a/arch/powerpc/include/asm/nohash/tlbflush.h +++ b/arch/powerpc/include/asm/nohash/tlbflush.h @@ -45,6 +45,12 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned lon asm volatile ("tlbie %0; sync" : : "r" (vmaddr) : "memory"); } +static inline void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + asm volatile ("tlbie %0; sync" : : "r" (vmaddr) : "memory"); +} + static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { start &= PAGE_MASK; @@ -58,6 +64,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void local_flush_tlb_mm(struct mm_struct *mm); extern void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +void local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, int tsize, int ind); diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index 2c15c86c7015..a903b308acc5 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -184,6 +184,14 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(local_flush_tlb_page); + +void local_flush_tlb_page_psize(struct mm_struct *mm, + unsigned long vmaddr, int psize) +{ + __local_flush_tlb_page(mm, vmaddr, mmu_get_tsize(psize), 0); +} +EXPORT_SYMBOL(local_flush_tlb_page_psize); + #endif /* From 9f61521c7a284e799050cd2adacc9a611bd2b491 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 28 Nov 2022 13:11:13 +1000 Subject: [PATCH 103/181] powerpc/qspinlock: powerpc qspinlock implementation Add a powerpc specific implementation of queued spinlocks. This is the build framework with a very simple (non-queued) spinlock implementation to begin with. Later changes add queueing, and other features and optimisations one-at-a-time. It is done this way to more easily see how the queued spinlocks are built, and to make performance and correctness bisects more useful. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> [mpe: Drop paravirt.h & processor.h changes to fix 32-bit build] [mpe: Fix 32-bit build of qspinlock.o & disallow GENERIC_LOCKBREAK per Nick] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/CONLLQB6DCJU.2ZPOS7T6S5GRR@bobo --- arch/powerpc/Kconfig | 3 +- arch/powerpc/include/asm/qspinlock.h | 98 +++++++------------ arch/powerpc/include/asm/qspinlock_paravirt.h | 7 -- arch/powerpc/include/asm/qspinlock_types.h | 13 +++ arch/powerpc/include/asm/spinlock.h | 2 +- arch/powerpc/include/asm/spinlock_types.h | 2 +- arch/powerpc/lib/Makefile | 4 +- arch/powerpc/lib/qspinlock.c | 17 ++++ 8 files changed, 71 insertions(+), 75 deletions(-) delete mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h create mode 100644 arch/powerpc/include/asm/qspinlock_types.h create mode 100644 arch/powerpc/lib/qspinlock.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 699df27b0e2f..7fbdf22ce9a9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -96,7 +96,7 @@ config LOCKDEP_SUPPORT config GENERIC_LOCKBREAK bool default y - depends on SMP && PREEMPTION + depends on SMP && PREEMPTION && !PPC_QUEUED_SPINLOCKS config GENERIC_HWEIGHT bool @@ -154,7 +154,6 @@ config PPC select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS - select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index b676c4fb90fd..5e6257313557 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -2,83 +2,55 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <asm-generic/qspinlock_types.h> +#include <linux/atomic.h> +#include <linux/compiler.h> +#include <asm/qspinlock_types.h> #include <asm/paravirt.h> -#define _Q_PENDING_LOOPS (1 << 9) /* not tuned */ - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -extern void __pv_queued_spin_unlock(struct qspinlock *lock); - -static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - if (!is_shared_processor()) - native_queued_spin_lock_slowpath(lock, val); - else - __pv_queued_spin_lock_slowpath(lock, val); + return atomic_read(&lock->val); } -#define queued_spin_unlock queued_spin_unlock -static inline void queued_spin_unlock(struct qspinlock *lock) +static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { - if (!is_shared_processor()) - smp_store_release(&lock->locked, 0); - else - __pv_queued_spin_unlock(lock); + return !atomic_read(&lock.val); } -#else -extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); -#endif +static __always_inline int queued_spin_is_contended(struct qspinlock *lock) +{ + return 0; +} + +static __always_inline int queued_spin_trylock(struct qspinlock *lock) +{ + return atomic_cmpxchg_acquire(&lock->val, 0, 1) == 0; +} + +void queued_spin_lock_slowpath(struct qspinlock *lock); static __always_inline void queued_spin_lock(struct qspinlock *lock) { - u32 val = 0; - - if (likely(arch_atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL))) - return; - - queued_spin_lock_slowpath(lock, val); + if (!queued_spin_trylock(lock)) + queued_spin_lock_slowpath(lock); } -#define queued_spin_lock queued_spin_lock + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + atomic_set_release(&lock->val, 0); +} + +#define arch_spin_is_locked(l) queued_spin_is_locked(l) +#define arch_spin_is_contended(l) queued_spin_is_contended(l) +#define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l) +#define arch_spin_lock(l) queued_spin_lock(l) +#define arch_spin_trylock(l) queued_spin_trylock(l) +#define arch_spin_unlock(l) queued_spin_unlock(l) #ifdef CONFIG_PARAVIRT_SPINLOCKS -#define SPIN_THRESHOLD (1<<15) /* not tuned */ - -static __always_inline void pv_wait(u8 *ptr, u8 val) -{ - if (*ptr != val) - return; - yield_to_any(); - /* - * We could pass in a CPU here if waiting in the queue and yield to - * the previous CPU in the queue. - */ -} - -static __always_inline void pv_kick(int cpu) -{ - prod_cpu(cpu); -} - -extern void __pv_init_lock_hash(void); - -static inline void pv_spinlocks_init(void) -{ - __pv_init_lock_hash(); -} - +void pv_spinlocks_init(void); +#else +static inline void pv_spinlocks_init(void) { } #endif -/* - * Queued spinlocks rely heavily on smp_cond_load_relaxed() to busy-wait, - * which was found to have performance problems if implemented with - * the preferred spin_begin()/spin_end() SMT priority pattern. Use the - * generic version instead. - */ - -#include <asm-generic/qspinlock.h> - #endif /* _ASM_POWERPC_QSPINLOCK_H */ diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h deleted file mode 100644 index 6b60e7736a47..000000000000 --- a/arch/powerpc/include/asm/qspinlock_paravirt.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_QSPINLOCK_PARAVIRT_H -#define _ASM_POWERPC_QSPINLOCK_PARAVIRT_H - -EXPORT_SYMBOL(__pv_queued_spin_unlock); - -#endif /* _ASM_POWERPC_QSPINLOCK_PARAVIRT_H */ diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h new file mode 100644 index 000000000000..59606bc0c774 --- /dev/null +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_POWERPC_QSPINLOCK_TYPES_H +#define _ASM_POWERPC_QSPINLOCK_TYPES_H + +#include <linux/types.h> + +typedef struct qspinlock { + atomic_t val; +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) } + +#endif /* _ASM_POWERPC_QSPINLOCK_TYPES_H */ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index bd75872a6334..7dafca8e3f02 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -13,7 +13,7 @@ /* See include/linux/spinlock.h */ #define smp_mb__after_spinlock() smp_mb() -#ifndef CONFIG_PARAVIRT_SPINLOCKS +#ifndef CONFIG_PPC_QUEUED_SPINLOCKS static inline void pv_spinlocks_init(void) { } #endif diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index d5f8a74ed2e8..40b01446cf75 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -7,7 +7,7 @@ #endif #ifdef CONFIG_PPC_QUEUED_SPINLOCKS -#include <asm-generic/qspinlock_types.h> +#include <asm/qspinlock_types.h> #include <asm-generic/qrwlock_types.h> #else #include <asm/simple_spinlock_types.h> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 8560c912186d..4de71cbf6e8e 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -52,7 +52,9 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \ obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ memcpy_64.o copy_mc_64.o -ifndef CONFIG_PPC_QUEUED_SPINLOCKS +ifdef CONFIG_PPC_QUEUED_SPINLOCKS +obj-$(CONFIG_SMP) += qspinlock.o +else obj64-$(CONFIG_SMP) += locks.o endif diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c new file mode 100644 index 000000000000..1c669b5b4607 --- /dev/null +++ b/arch/powerpc/lib/qspinlock.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/export.h> +#include <linux/processor.h> +#include <asm/qspinlock.h> + +void queued_spin_lock_slowpath(struct qspinlock *lock) +{ + while (!queued_spin_trylock(lock)) + cpu_relax(); +} +EXPORT_SYMBOL(queued_spin_lock_slowpath); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void pv_spinlocks_init(void) +{ +} +#endif From 84990b169557428c318df87b7836cd15f65b62dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:16 +1000 Subject: [PATCH 104/181] powerpc/qspinlock: add mcs queueing for contended waiters This forms the basis of the qspinlock slow path. Like generic qspinlocks and unlike the vanilla MCS algorithm, the lock owner does not participate in the queue, only waiters. The first waiter spins on the lock word, then when the lock is released it takes ownership and unqueues the next waiter. This is how qspinlocks can be implemented with the spinlock API -- lock owners don't need a node, only waiters do. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-2-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 10 +- arch/powerpc/include/asm/qspinlock_types.h | 23 +++ arch/powerpc/lib/qspinlock.c | 187 ++++++++++++++++++++- 3 files changed, 214 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 5e6257313557..6946dba5d087 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -19,12 +19,12 @@ static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { - return 0; + return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); } static __always_inline int queued_spin_trylock(struct qspinlock *lock) { - return atomic_cmpxchg_acquire(&lock->val, 0, 1) == 0; + return atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0; } void queued_spin_lock_slowpath(struct qspinlock *lock); @@ -37,7 +37,11 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) static inline void queued_spin_unlock(struct qspinlock *lock) { - atomic_set_release(&lock->val, 0); + for (;;) { + int val = atomic_read(&lock->val); + if (atomic_cmpxchg_release(&lock->val, val, val & ~_Q_LOCKED_VAL) == val) + return; + } } #define arch_spin_is_locked(l) queued_spin_is_locked(l) diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index 59606bc0c774..20a36dfb14e2 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -10,4 +10,27 @@ typedef struct qspinlock { #define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) } +/* + * Bitfields in the lock word: + * + * 0: locked bit + * 1-16: unused bits + * 17-31: tail cpu (+1) + */ +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +/* 0x00000001 */ +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 1 +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) + +/* 0xfffe0000 */ +#define _Q_TAIL_CPU_OFFSET 17 +#define _Q_TAIL_CPU_BITS 15 +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#if CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS) +#error "qspinlock does not support such large CONFIG_NR_CPUS" +#endif + #endif /* _ASM_POWERPC_QSPINLOCK_TYPES_H */ diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 1c669b5b4607..86504628501e 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -1,12 +1,193 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/compiler.h> #include <linux/export.h> -#include <linux/processor.h> +#include <linux/percpu.h> +#include <linux/smp.h> #include <asm/qspinlock.h> +#define MAX_NODES 4 + +struct qnode { + struct qnode *next; + struct qspinlock *lock; + u8 locked; /* 1 if lock acquired */ +}; + +struct qnodes { + int count; + struct qnode nodes[MAX_NODES]; +}; + +static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); + +static inline int encode_tail_cpu(int cpu) +{ + return (cpu + 1) << _Q_TAIL_CPU_OFFSET; +} + +static inline int decode_tail_cpu(int val) +{ + return (val >> _Q_TAIL_CPU_OFFSET) - 1; +} + +/* + * Try to acquire the lock if it was not already locked. If the tail matches + * mytail then clear it, otherwise leave it unchnaged. Return previous value. + * + * This is used by the head of the queue to acquire the lock and clean up + * its tail if it was the last one queued. + */ +static __always_inline int set_locked_clean_tail(struct qspinlock *lock, int tail) +{ + int val = atomic_read(&lock->val); + + BUG_ON(val & _Q_LOCKED_VAL); + + /* If we're the last queued, must clean up the tail. */ + if ((val & _Q_TAIL_CPU_MASK) == tail) { + if (atomic_cmpxchg_acquire(&lock->val, val, _Q_LOCKED_VAL) == val) + return val; + /* Another waiter must have enqueued */ + val = atomic_read(&lock->val); + BUG_ON(val & _Q_LOCKED_VAL); + } + + /* We must be the owner, just set the lock bit and acquire */ + atomic_or(_Q_LOCKED_VAL, &lock->val); + __atomic_acquire_fence(); + + return val; +} + +/* + * Publish our tail, replacing previous tail. Return previous value. + * + * This provides a release barrier for publishing node, this pairs with the + * acquire barrier in get_tail_qnode() when the next CPU finds this tail + * value. + */ +static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) +{ + for (;;) { + int val = atomic_read(&lock->val); + int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; + int old; + + old = atomic_cmpxchg_release(&lock->val, val, newval); + if (old == val) + return old; + } +} + +static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) +{ + int cpu = decode_tail_cpu(val); + struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); + int idx; + + /* + * After publishing the new tail and finding a previous tail in the + * previous val (which is the control dependency), this barrier + * orders the release barrier in publish_tail_cpu performed by the + * last CPU, with subsequently looking at its qnode structures + * after the barrier. + */ + smp_acquire__after_ctrl_dep(); + + for (idx = 0; idx < MAX_NODES; idx++) { + struct qnode *qnode = &qnodesp->nodes[idx]; + if (qnode->lock == lock) + return qnode; + } + + BUG(); +} + +static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) +{ + struct qnodes *qnodesp; + struct qnode *next, *node; + int val, old, tail; + int idx; + + BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); + + qnodesp = this_cpu_ptr(&qnodes); + if (unlikely(qnodesp->count >= MAX_NODES)) { + while (!queued_spin_trylock(lock)) + cpu_relax(); + return; + } + + idx = qnodesp->count++; + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node = &qnodesp->nodes[idx]; + node->next = NULL; + node->lock = lock; + node->locked = 0; + + tail = encode_tail_cpu(smp_processor_id()); + + old = publish_tail_cpu(lock, tail); + + /* + * If there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_CPU_MASK) { + struct qnode *prev = get_tail_qnode(lock, old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + /* Wait for mcs node lock to be released */ + while (!node->locked) + cpu_relax(); + + smp_rmb(); /* acquire barrier for the mcs lock */ + } + + /* We're at the head of the waitqueue, wait for the lock. */ + for (;;) { + val = atomic_read(&lock->val); + if (!(val & _Q_LOCKED_VAL)) + break; + + cpu_relax(); + } + + /* If we're the last queued, must clean up the tail. */ + old = set_locked_clean_tail(lock, tail); + if ((old & _Q_TAIL_CPU_MASK) == tail) + goto release; /* Another waiter must have enqueued */ + + /* There is a next, must wait for node->next != NULL (MCS protocol) */ + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + + /* + * Unlock the next mcs waiter node. Release barrier is not required + * here because the acquirer is only accessing the lock word, and + * the acquire barrier we took the lock with orders that update vs + * this store to locked. The corresponding barrier is the smp_rmb() + * acquire barrier for mcs lock, above. + */ + WRITE_ONCE(next->locked, 1); + +release: + qnodesp->count--; /* release the node */ +} + void queued_spin_lock_slowpath(struct qspinlock *lock) { - while (!queued_spin_trylock(lock)) - cpu_relax(); + queued_spin_lock_mcs_queue(lock); } EXPORT_SYMBOL(queued_spin_lock_slowpath); From 4c93c2e4b9e8988511c06b9c042f23d4b8f593ad Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:17 +1000 Subject: [PATCH 105/181] powerpc/qspinlock: use a half-word store to unlock to avoid larx/stcx. The first 16 bits of the lock are only modified by the owner, and other modifications always use atomic operations on the entire 32 bits, so unlocks can use plain stores on the 16 bits. This is the same kind of optimisation done by core qspinlock code. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-3-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 6 +----- arch/powerpc/include/asm/qspinlock_types.h | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 6946dba5d087..713f6629f6fb 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -37,11 +37,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) static inline void queued_spin_unlock(struct qspinlock *lock) { - for (;;) { - int val = atomic_read(&lock->val); - if (atomic_cmpxchg_release(&lock->val, val, val & ~_Q_LOCKED_VAL) == val) - return; - } + smp_store_release(&lock->locked, 0); } #define arch_spin_is_locked(l) queued_spin_is_locked(l) diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index 20a36dfb14e2..fe87181c59e5 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -3,12 +3,27 @@ #define _ASM_POWERPC_QSPINLOCK_TYPES_H #include <linux/types.h> +#include <asm/byteorder.h> typedef struct qspinlock { - atomic_t val; + union { + atomic_t val; + +#ifdef __LITTLE_ENDIAN + struct { + u16 locked; + u8 reserved[2]; + }; +#else + struct { + u8 reserved[2]; + u16 locked; + }; +#endif + }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } /* * Bitfields in the lock word: From b3a73b7db2b6cb3b2e5bfda5518a0e92230ef673 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:18 +1000 Subject: [PATCH 106/181] powerpc/qspinlock: convert atomic operations to assembly This uses more optimal ll/sc style access patterns (rather than cmpxchg), and also sets the EH=1 lock hint on those operations which acquire ownership of the lock. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-4-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 24 +++++-- arch/powerpc/include/asm/qspinlock_types.h | 4 +- arch/powerpc/lib/qspinlock.c | 74 +++++++++++++--------- 3 files changed, 64 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 713f6629f6fb..c16e1f0674b5 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -2,29 +2,43 @@ #ifndef _ASM_POWERPC_QSPINLOCK_H #define _ASM_POWERPC_QSPINLOCK_H -#include <linux/atomic.h> #include <linux/compiler.h> #include <asm/qspinlock_types.h> #include <asm/paravirt.h> static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { - return atomic_read(&lock->val); + return READ_ONCE(lock->val); } static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { - return !atomic_read(&lock.val); + return !lock.val; } static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { - return !!(atomic_read(&lock->val) & _Q_TAIL_CPU_MASK); + return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } static __always_inline int queued_spin_trylock(struct qspinlock *lock) { - return atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0; + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" +" cmpwi 0,%0,0 \n" +" bne- 2f \n" +" stwcx. %2,0,%1 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_LOCKED_VAL), + "i" (IS_ENABLED(CONFIG_PPC64)) + : "cr0", "memory"); + + return likely(prev == 0); } void queued_spin_lock_slowpath(struct qspinlock *lock); diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index fe87181c59e5..b9a5a52fa670 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -7,7 +7,7 @@ typedef struct qspinlock { union { - atomic_t val; + u32 val; #ifdef __LITTLE_ENDIAN struct { @@ -23,7 +23,7 @@ typedef struct qspinlock { }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = 0 } } /* * Bitfields in the lock word: diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 86504628501e..645d9affacfd 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/atomic.h> #include <linux/bug.h> #include <linux/compiler.h> #include <linux/export.h> @@ -22,12 +21,12 @@ struct qnodes { static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); -static inline int encode_tail_cpu(int cpu) +static inline u32 encode_tail_cpu(int cpu) { return (cpu + 1) << _Q_TAIL_CPU_OFFSET; } -static inline int decode_tail_cpu(int val) +static inline int decode_tail_cpu(u32 val) { return (val >> _Q_TAIL_CPU_OFFSET) - 1; } @@ -39,26 +38,34 @@ static inline int decode_tail_cpu(int val) * This is used by the head of the queue to acquire the lock and clean up * its tail if it was the last one queued. */ -static __always_inline int set_locked_clean_tail(struct qspinlock *lock, int tail) +static __always_inline u32 set_locked_clean_tail(struct qspinlock *lock, u32 tail) { - int val = atomic_read(&lock->val); + u32 newval = _Q_LOCKED_VAL; + u32 prev, tmp; - BUG_ON(val & _Q_LOCKED_VAL); + asm volatile( +"1: lwarx %0,0,%2,%6 # set_locked_clean_tail \n" + /* Test whether the lock tail == tail */ +" and %1,%0,%5 \n" +" cmpw 0,%1,%3 \n" + /* Merge the new locked value */ +" or %1,%1,%4 \n" +" bne 2f \n" + /* If the lock tail matched, then clear it, otherwise leave it. */ +" andc %1,%1,%5 \n" +"2: stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"3: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r"(tail), "r" (newval), + "r" (_Q_TAIL_CPU_MASK), + "i" (IS_ENABLED(CONFIG_PPC64)) + : "cr0", "memory"); - /* If we're the last queued, must clean up the tail. */ - if ((val & _Q_TAIL_CPU_MASK) == tail) { - if (atomic_cmpxchg_acquire(&lock->val, val, _Q_LOCKED_VAL) == val) - return val; - /* Another waiter must have enqueued */ - val = atomic_read(&lock->val); - BUG_ON(val & _Q_LOCKED_VAL); - } + BUG_ON(prev & _Q_LOCKED_VAL); - /* We must be the owner, just set the lock bit and acquire */ - atomic_or(_Q_LOCKED_VAL, &lock->val); - __atomic_acquire_fence(); - - return val; + return prev; } /* @@ -68,20 +75,25 @@ static __always_inline int set_locked_clean_tail(struct qspinlock *lock, int tai * acquire barrier in get_tail_qnode() when the next CPU finds this tail * value. */ -static __always_inline int publish_tail_cpu(struct qspinlock *lock, int tail) +static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) { - for (;;) { - int val = atomic_read(&lock->val); - int newval = (val & ~_Q_TAIL_CPU_MASK) | tail; - int old; + u32 prev, tmp; - old = atomic_cmpxchg_release(&lock->val, val, newval); - if (old == val) - return old; - } + asm volatile( +"\t" PPC_RELEASE_BARRIER " \n" +"1: lwarx %0,0,%2 # publish_tail_cpu \n" +" andc %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" + : "=&r" (prev), "=&r"(tmp) + : "r" (&lock->val), "r" (tail), "r"(_Q_TAIL_CPU_MASK) + : "cr0", "memory"); + + return prev; } -static struct qnode *get_tail_qnode(struct qspinlock *lock, int val) +static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = decode_tail_cpu(val); struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); @@ -109,7 +121,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) { struct qnodes *qnodesp; struct qnode *next, *node; - int val, old, tail; + u32 val, old, tail; int idx; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -156,7 +168,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) /* We're at the head of the waitqueue, wait for the lock. */ for (;;) { - val = atomic_read(&lock->val); + val = READ_ONCE(lock->val); if (!(val & _Q_LOCKED_VAL)) break; From 6aa42f883c438ea132a28801bef3f86f3883d14c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:19 +1000 Subject: [PATCH 107/181] powerpc/qspinlock: allow new waiters to steal the lock before queueing Allow new waiters to "steal" the lock before queueing. That is, to acquire it while other CPUs have queued. This particularly helps paravirt performance when physical CPUs are oversubscribed, by keeping the lock from becoming a strict FIFO and vCPU preemption causing queue train wrecks. The new __queued_spin_trylock_steal() function is put in qspinlock.h to save having to move it, because it will be used there by a later change. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-5-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 23 ++++++ arch/powerpc/lib/qspinlock.c | 110 ++++++++++++++++++++++++--- 2 files changed, 124 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index c16e1f0674b5..cebd2c89c08d 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -41,6 +41,29 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock) return likely(prev == 0); } +static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) +{ + u32 prev, tmp; + + /* Trylock may get ahead of queued nodes if it finds unlocked */ + asm volatile( +"1: lwarx %0,0,%2,%5 # __queued_spin_trylock_steal \n" +" andc. %1,%0,%4 \n" +" bne- 2f \n" +" and %1,%0,%4 \n" +" or %1,%1,%3 \n" +" stwcx. %1,0,%2 \n" +" bne- 1b \n" +"\t" PPC_ACQUIRE_BARRIER " \n" +"2: \n" + : "=&r" (prev), "=&r" (tmp) + : "r" (&lock->val), "r" (_Q_LOCKED_VAL), "r" (_Q_TAIL_CPU_MASK), + "i" (IS_ENABLED(CONFIG_PPC64)) + : "cr0", "memory"); + + return likely(!(prev & ~_Q_TAIL_CPU_MASK)); +} + void queued_spin_lock_slowpath(struct qspinlock *lock); static __always_inline void queued_spin_lock(struct qspinlock *lock) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 645d9affacfd..6ffd3261064c 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -19,8 +19,17 @@ struct qnodes { struct qnode nodes[MAX_NODES]; }; +/* Tuning parameters */ +static int steal_spins __read_mostly = (1 << 5); +static bool maybe_stealers __read_mostly = true; + static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); +static __always_inline int get_steal_spins(void) +{ + return steal_spins; +} + static inline u32 encode_tail_cpu(int cpu) { return (cpu + 1) << _Q_TAIL_CPU_OFFSET; @@ -38,33 +47,35 @@ static inline int decode_tail_cpu(u32 val) * This is used by the head of the queue to acquire the lock and clean up * its tail if it was the last one queued. */ -static __always_inline u32 set_locked_clean_tail(struct qspinlock *lock, u32 tail) +static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail) { u32 newval = _Q_LOCKED_VAL; u32 prev, tmp; asm volatile( -"1: lwarx %0,0,%2,%6 # set_locked_clean_tail \n" - /* Test whether the lock tail == tail */ -" and %1,%0,%5 \n" +"1: lwarx %0,0,%2,%7 # trylock_clean_tail \n" + /* This test is necessary if there could be stealers */ +" andi. %1,%0,%5 \n" +" bne 3f \n" + /* Test whether the lock tail == mytail */ +" and %1,%0,%6 \n" " cmpw 0,%1,%3 \n" /* Merge the new locked value */ " or %1,%1,%4 \n" " bne 2f \n" /* If the lock tail matched, then clear it, otherwise leave it. */ -" andc %1,%1,%5 \n" +" andc %1,%1,%6 \n" "2: stwcx. %1,0,%2 \n" " bne- 1b \n" "\t" PPC_ACQUIRE_BARRIER " \n" "3: \n" : "=&r" (prev), "=&r" (tmp) : "r" (&lock->val), "r"(tail), "r" (newval), + "i" (_Q_LOCKED_VAL), "r" (_Q_TAIL_CPU_MASK), "i" (IS_ENABLED(CONFIG_PPC64)) : "cr0", "memory"); - BUG_ON(prev & _Q_LOCKED_VAL); - return prev; } @@ -117,6 +128,30 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) BUG(); } +static inline bool try_to_steal_lock(struct qspinlock *lock) +{ + int iters = 0; + + if (!steal_spins) + return false; + + /* Attempt to steal the lock */ + do { + u32 val = READ_ONCE(lock->val); + + if (unlikely(!(val & _Q_LOCKED_VAL))) { + if (__queued_spin_trylock_steal(lock)) + return true; + } else { + cpu_relax(); + } + + iters++; + } while (iters < get_steal_spins()); + + return false; +} + static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) { struct qnodes *qnodesp; @@ -166,6 +201,7 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) smp_rmb(); /* acquire barrier for the mcs lock */ } +again: /* We're at the head of the waitqueue, wait for the lock. */ for (;;) { val = READ_ONCE(lock->val); @@ -176,9 +212,14 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) } /* If we're the last queued, must clean up the tail. */ - old = set_locked_clean_tail(lock, tail); + old = trylock_clean_tail(lock, tail); + if (unlikely(old & _Q_LOCKED_VAL)) { + BUG_ON(!maybe_stealers); + goto again; /* Can only be true if maybe_stealers. */ + } + if ((old & _Q_TAIL_CPU_MASK) == tail) - goto release; /* Another waiter must have enqueued */ + goto release; /* We were the tail, no next. */ /* There is a next, must wait for node->next != NULL (MCS protocol) */ while (!(next = READ_ONCE(node->next))) @@ -199,6 +240,9 @@ release: void queued_spin_lock_slowpath(struct qspinlock *lock) { + if (try_to_steal_lock(lock)) + return; + queued_spin_lock_mcs_queue(lock); } EXPORT_SYMBOL(queued_spin_lock_slowpath); @@ -208,3 +252,51 @@ void pv_spinlocks_init(void) { } #endif + +#include <linux/debugfs.h> +static int steal_spins_set(void *data, u64 val) +{ + static DEFINE_MUTEX(lock); + + /* + * The lock slow path has a !maybe_stealers case that can assume + * the head of queue will not see concurrent waiters. That waiter + * is unsafe in the presence of stealers, so must keep them away + * from one another. + */ + + mutex_lock(&lock); + if (val && !steal_spins) { + maybe_stealers = true; + /* wait for queue head waiter to go away */ + synchronize_rcu(); + steal_spins = val; + } else if (!val && steal_spins) { + steal_spins = val; + /* wait for all possible stealers to go away */ + synchronize_rcu(); + maybe_stealers = false; + } else { + steal_spins = val; + } + mutex_unlock(&lock); + + return 0; +} + +static int steal_spins_get(void *data, u64 *val) +{ + *val = steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n"); + +static __init int spinlock_debugfs_init(void) +{ + debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); + + return 0; +} +device_initcall(spinlock_debugfs_init); From 0944534ef4d5cf39c8133575524be0be3337dd62 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:20 +1000 Subject: [PATCH 108/181] powerpc/qspinlock: theft prevention to control latency Give the queue head the ability to stop stealers. After a number of spins without successfully acquiring the lock, the queue head sets this, which halts stealing and will assure it is the next owner. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-6-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock_types.h | 8 +++- arch/powerpc/lib/qspinlock.c | 53 ++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index b9a5a52fa670..1911a8a16237 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -29,7 +29,8 @@ typedef struct qspinlock { * Bitfields in the lock word: * * 0: locked bit - * 1-16: unused bits + * 1-15: unused bits + * 16: must queue bit * 17-31: tail cpu (+1) */ #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ @@ -39,6 +40,11 @@ typedef struct qspinlock { #define _Q_LOCKED_BITS 1 #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +/* 0x00010000 */ +#define _Q_MUST_Q_OFFSET 16 +#define _Q_MUST_Q_BITS 1 +#define _Q_MUST_Q_VAL (1U << _Q_MUST_Q_OFFSET) + /* 0xfffe0000 */ #define _Q_TAIL_CPU_OFFSET 17 #define _Q_TAIL_CPU_BITS 15 diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 6ffd3261064c..9cd442d46b9f 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -22,6 +22,7 @@ struct qnodes { /* Tuning parameters */ static int steal_spins __read_mostly = (1 << 5); static bool maybe_stealers __read_mostly = true; +static int head_spins __read_mostly = (1 << 8); static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -30,6 +31,11 @@ static __always_inline int get_steal_spins(void) return steal_spins; } +static __always_inline int get_head_spins(void) +{ + return head_spins; +} + static inline u32 encode_tail_cpu(int cpu) { return (cpu + 1) << _Q_TAIL_CPU_OFFSET; @@ -104,6 +110,22 @@ static __always_inline u32 publish_tail_cpu(struct qspinlock *lock, u32 tail) return prev; } +static __always_inline u32 set_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # set_mustq \n" +" or %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = decode_tail_cpu(val); @@ -139,6 +161,9 @@ static inline bool try_to_steal_lock(struct qspinlock *lock) do { u32 val = READ_ONCE(lock->val); + if (val & _Q_MUST_Q_VAL) + break; + if (unlikely(!(val & _Q_LOCKED_VAL))) { if (__queued_spin_trylock_steal(lock)) return true; @@ -157,7 +182,9 @@ static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) struct qnodes *qnodesp; struct qnode *next, *node; u32 val, old, tail; + bool mustq = false; int idx; + int iters = 0; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -209,6 +236,15 @@ again: break; cpu_relax(); + if (!maybe_stealers) + continue; + iters++; + + if (!mustq && iters >= get_head_spins()) { + mustq = true; + set_mustq(lock); + val |= _Q_MUST_Q_VAL; + } } /* If we're the last queued, must clean up the tail. */ @@ -293,9 +329,26 @@ static int steal_spins_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n"); +static int head_spins_set(void *data, u64 val) +{ + head_spins = val; + + return 0; +} + +static int head_spins_get(void *data, u64 *val) +{ + *val = head_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n"); + static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); + debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); return 0; } From e1a31e7fd7130628cfd229253da2b4630e7a809c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:21 +1000 Subject: [PATCH 109/181] powerpc/qspinlock: store owner CPU in lock word Store the owner CPU number in the lock word so it may be yielded to, as powerpc's paravirtualised simple spinlocks do. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-7-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 12 ++++++++++-- arch/powerpc/include/asm/qspinlock_types.h | 12 +++++++++++- arch/powerpc/lib/qspinlock.c | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index cebd2c89c08d..9572a2ef974d 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -21,8 +21,15 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock) return !!(READ_ONCE(lock->val) & _Q_TAIL_CPU_MASK); } +static __always_inline u32 queued_spin_encode_locked_val(void) +{ + /* XXX: make this use lock value in paca like simple spinlocks? */ + return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET); +} + static __always_inline int queued_spin_trylock(struct qspinlock *lock) { + u32 new = queued_spin_encode_locked_val(); u32 prev; asm volatile( @@ -34,7 +41,7 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (prev) - : "r" (&lock->val), "r" (_Q_LOCKED_VAL), + : "r" (&lock->val), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64)) : "cr0", "memory"); @@ -43,6 +50,7 @@ static __always_inline int queued_spin_trylock(struct qspinlock *lock) static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) { + u32 new = queued_spin_encode_locked_val(); u32 prev, tmp; /* Trylock may get ahead of queued nodes if it finds unlocked */ @@ -57,7 +65,7 @@ static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (prev), "=&r" (tmp) - : "r" (&lock->val), "r" (_Q_LOCKED_VAL), "r" (_Q_TAIL_CPU_MASK), + : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK), "i" (IS_ENABLED(CONFIG_PPC64)) : "cr0", "memory"); diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index 1911a8a16237..adfeed4aa495 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -29,7 +29,8 @@ typedef struct qspinlock { * Bitfields in the lock word: * * 0: locked bit - * 1-15: unused bits + * 1-14: lock holder cpu + * 15: unused bit * 16: must queue bit * 17-31: tail cpu (+1) */ @@ -40,6 +41,15 @@ typedef struct qspinlock { #define _Q_LOCKED_BITS 1 #define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +/* 0x00007ffe */ +#define _Q_OWNER_CPU_OFFSET 1 +#define _Q_OWNER_CPU_BITS 14 +#define _Q_OWNER_CPU_MASK _Q_SET_MASK(OWNER_CPU) + +#if CONFIG_NR_CPUS > (1U << _Q_OWNER_CPU_BITS) +#error "qspinlock does not support such large CONFIG_NR_CPUS" +#endif + /* 0x00010000 */ #define _Q_MUST_Q_OFFSET 16 #define _Q_MUST_Q_BITS 1 diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 9cd442d46b9f..4d74db0e565f 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -55,7 +55,7 @@ static inline int decode_tail_cpu(u32 val) */ static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail) { - u32 newval = _Q_LOCKED_VAL; + u32 newval = queued_spin_encode_locked_val(); u32 prev, tmp; asm volatile( From 085f03311bcede99550e08a1f7cad41bf758b460 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:22 +1000 Subject: [PATCH 110/181] powerpc/qspinlock: paravirt yield to lock owner Waiters spinning on the lock word should yield to the lock owner if the vCPU is preempted. This improves performance when the hypervisor has oversubscribed physical CPUs. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-8-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 99 +++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 4d74db0e565f..18e21574e6c5 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -5,6 +5,7 @@ #include <linux/percpu.h> #include <linux/smp.h> #include <asm/qspinlock.h> +#include <asm/paravirt.h> #define MAX_NODES 4 @@ -24,14 +25,16 @@ static int steal_spins __read_mostly = (1 << 5); static bool maybe_stealers __read_mostly = true; static int head_spins __read_mostly = (1 << 8); +static bool pv_yield_owner __read_mostly = true; + static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); -static __always_inline int get_steal_spins(void) +static __always_inline int get_steal_spins(bool paravirt) { return steal_spins; } -static __always_inline int get_head_spins(void) +static __always_inline int get_head_spins(bool paravirt) { return head_spins; } @@ -46,6 +49,11 @@ static inline int decode_tail_cpu(u32 val) return (val >> _Q_TAIL_CPU_OFFSET) - 1; } +static inline int get_owner_cpu(u32 val) +{ + return (val & _Q_OWNER_CPU_MASK) >> _Q_OWNER_CPU_OFFSET; +} + /* * Try to acquire the lock if it was not already locked. If the tail matches * mytail then clear it, otherwise leave it unchnaged. Return previous value. @@ -150,7 +158,45 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) BUG(); } -static inline bool try_to_steal_lock(struct qspinlock *lock) +static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + int owner; + u32 yield_count; + + BUG_ON(!(val & _Q_LOCKED_VAL)); + + if (!paravirt) + goto relax; + + if (!pv_yield_owner) + goto relax; + + owner = get_owner_cpu(val); + yield_count = yield_count_of(owner); + + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + /* + * Read the lock word after sampling the yield count. On the other side + * there may a wmb because the yield count update is done by the + * hypervisor preemption and the value update by the OS, however this + * ordering might reduce the chance of out of order accesses and + * improve the heuristic. + */ + smp_rmb(); + + if (READ_ONCE(lock->val) == val) { + yield_to_preempted(owner, yield_count); + /* Don't relax if we yielded. Maybe we should? */ + return; + } +relax: + cpu_relax(); +} + + +static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) { int iters = 0; @@ -168,16 +214,16 @@ static inline bool try_to_steal_lock(struct qspinlock *lock) if (__queued_spin_trylock_steal(lock)) return true; } else { - cpu_relax(); + yield_to_locked_owner(lock, val, paravirt); } iters++; - } while (iters < get_steal_spins()); + } while (iters < get_steal_spins(paravirt)); return false; } -static inline void queued_spin_lock_mcs_queue(struct qspinlock *lock) +static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, bool paravirt) { struct qnodes *qnodesp; struct qnode *next, *node; @@ -235,12 +281,12 @@ again: if (!(val & _Q_LOCKED_VAL)) break; - cpu_relax(); + yield_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; iters++; - if (!mustq && iters >= get_head_spins()) { + if (!mustq && iters >= get_head_spins(paravirt)) { mustq = true; set_mustq(lock); val |= _Q_MUST_Q_VAL; @@ -276,10 +322,20 @@ release: void queued_spin_lock_slowpath(struct qspinlock *lock) { - if (try_to_steal_lock(lock)) - return; - - queued_spin_lock_mcs_queue(lock); + /* + * This looks funny, but it induces the compiler to inline both + * sides of the branch rather than share code as when the condition + * is passed as the paravirt argument to the functions. + */ + if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { + if (try_to_steal_lock(lock, true)) + return; + queued_spin_lock_mcs_queue(lock, true); + } else { + if (try_to_steal_lock(lock, false)) + return; + queued_spin_lock_mcs_queue(lock, false); + } } EXPORT_SYMBOL(queued_spin_lock_slowpath); @@ -345,10 +401,29 @@ static int head_spins_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_head_spins, head_spins_get, head_spins_set, "%llu\n"); +static int pv_yield_owner_set(void *data, u64 val) +{ + pv_yield_owner = !!val; + + return 0; +} + +static int pv_yield_owner_get(void *data, u64 *val) +{ + *val = pv_yield_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n"); + static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); + if (is_shared_processor()) { + debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); + } return 0; } From bd48287b2cf4cd6e95576db3a94fd2a7cdf9832d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:23 +1000 Subject: [PATCH 111/181] powerpc/qspinlock: implement option to yield to previous node Queued waiters which are not at the head of the queue don't spin on the lock word but their qnode lock word, waiting for the previous queued CPU to release them. Add an option which allows these waiters to yield to the previous CPU if its vCPU is preempted. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-9-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 46 +++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 18e21574e6c5..41afd8e68918 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -26,6 +26,7 @@ static bool maybe_stealers __read_mostly = true; static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; +static bool pv_yield_prev __read_mostly = true; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -195,6 +196,32 @@ relax: cpu_relax(); } +static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) +{ + int prev_cpu = decode_tail_cpu(val); + u32 yield_count; + + if (!paravirt) + goto relax; + + if (!pv_yield_prev) + goto relax; + + yield_count = yield_count_of(prev_cpu); + if ((yield_count & 1) == 0) + goto relax; /* owner vcpu is running */ + + smp_rmb(); /* See yield_to_locked_owner comment */ + + if (!node->locked) { + yield_to_preempted(prev_cpu, yield_count); + return; + } + +relax: + cpu_relax(); +} + static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) { @@ -269,7 +296,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b /* Wait for mcs node lock to be released */ while (!node->locked) - cpu_relax(); + yield_to_prev(lock, node, old, paravirt); smp_rmb(); /* acquire barrier for the mcs lock */ } @@ -417,12 +444,29 @@ static int pv_yield_owner_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n"); +static int pv_yield_prev_set(void *data, u64 val) +{ + pv_yield_prev = !!val; + + return 0; +} + +static int pv_yield_prev_get(void *data, u64 *val) +{ + *val = pv_yield_prev; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n"); + static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); if (is_shared_processor()) { debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); + debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); } return 0; From b4c3cdc1a698a2f6168768d0bed4bf062723722e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:24 +1000 Subject: [PATCH 112/181] powerpc/qspinlock: allow stealing when head of queue yields If the head of queue is preventing stealing but it finds the owner vCPU is preempted, it will yield its cycles to the owner which could cause it to become preempted. Add an option to re-allow stealers before yielding, and disallow them again after returning from the yield. Disable this option by default for now, i.e., no logical change. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-10-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 59 ++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 41afd8e68918..c1f3b699b63f 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -26,6 +26,7 @@ static bool maybe_stealers __read_mostly = true; static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; +static bool pv_yield_allow_steal __read_mostly = false; static bool pv_yield_prev __read_mostly = true; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -135,6 +136,22 @@ static __always_inline u32 set_mustq(struct qspinlock *lock) return prev; } +static __always_inline u32 clear_mustq(struct qspinlock *lock) +{ + u32 prev; + + asm volatile( +"1: lwarx %0,0,%1 # clear_mustq \n" +" andc %0,%0,%2 \n" +" stwcx. %0,0,%1 \n" +" bne- 1b \n" + : "=&r" (prev) + : "r" (&lock->val), "r" (_Q_MUST_Q_VAL) + : "cr0", "memory"); + + return prev; +} + static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = decode_tail_cpu(val); @@ -159,7 +176,7 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) BUG(); } -static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) { int owner; u32 yield_count; @@ -188,7 +205,11 @@ static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 va smp_rmb(); if (READ_ONCE(lock->val) == val) { + if (mustq) + clear_mustq(lock); yield_to_preempted(owner, yield_count); + if (mustq) + set_mustq(lock); /* Don't relax if we yielded. Maybe we should? */ return; } @@ -196,6 +217,21 @@ relax: cpu_relax(); } +static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + __yield_to_locked_owner(lock, val, paravirt, false); +} + +static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +{ + bool mustq = false; + + if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal) + mustq = true; + + __yield_to_locked_owner(lock, val, paravirt, mustq); +} + static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) { int prev_cpu = decode_tail_cpu(val); @@ -211,7 +247,7 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode * if ((yield_count & 1) == 0) goto relax; /* owner vcpu is running */ - smp_rmb(); /* See yield_to_locked_owner comment */ + smp_rmb(); /* See __yield_to_locked_owner comment */ if (!node->locked) { yield_to_preempted(prev_cpu, yield_count); @@ -308,7 +344,7 @@ again: if (!(val & _Q_LOCKED_VAL)) break; - yield_to_locked_owner(lock, val, paravirt); + yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; iters++; @@ -444,6 +480,22 @@ static int pv_yield_owner_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_owner, pv_yield_owner_get, pv_yield_owner_set, "%llu\n"); +static int pv_yield_allow_steal_set(void *data, u64 val) +{ + pv_yield_allow_steal = !!val; + + return 0; +} + +static int pv_yield_allow_steal_get(void *data, u64 *val) +{ + *val = pv_yield_allow_steal; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n"); + static int pv_yield_prev_set(void *data, u64 val) { pv_yield_prev = !!val; @@ -466,6 +518,7 @@ static __init int spinlock_debugfs_init(void) debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); if (is_shared_processor()) { debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); + debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); } From 28db61e207ea3890d286cff3141c1ce67346074d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:25 +1000 Subject: [PATCH 113/181] powerpc/qspinlock: allow propagation of yield CPU down the queue Having all CPUs poll the lock word for the owner CPU that should be yielded to defeats most of the purpose of using MCS queueing for scalability. Yet it may be desirable for queued waiters to yield to a preempted owner. With this change, queue waiters never sample the owner CPU directly from the lock word. The queue head (which is spinning on the lock) propagates the owner CPU back to the next waiter if it finds the owner has been preempted. That waiter then propagates the owner CPU back to the next waiter, and so on. s390 addresses this problem differenty, by having queued waiters sample the lock word to find the owner at a low frequency. That has the advantage of being simpler, the advantage of propagation is that the lock word never has to be accesed by queued waiters, and the transfer of cache lines to transmit the owner data is only required when lock holder vCPU preemption occurs. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-11-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 79 ++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index c1f3b699b63f..c45f30c9a19e 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -12,6 +12,7 @@ struct qnode { struct qnode *next; struct qspinlock *lock; + int yield_cpu; u8 locked; /* 1 if lock acquired */ }; @@ -28,6 +29,7 @@ static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; static bool pv_yield_allow_steal __read_mostly = false; static bool pv_yield_prev __read_mostly = true; +static bool pv_yield_propagate_owner __read_mostly = true; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -232,14 +234,67 @@ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u __yield_to_locked_owner(lock, val, paravirt, mustq); } +static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt) +{ + struct qnode *next; + int owner; + + if (!paravirt) + return; + if (!pv_yield_propagate_owner) + return; + + owner = get_owner_cpu(val); + if (*set_yield_cpu == owner) + return; + + next = READ_ONCE(node->next); + if (!next) + return; + + if (vcpu_is_preempted(owner)) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } else if (*set_yield_cpu != -1) { + next->yield_cpu = owner; + *set_yield_cpu = owner; + } +} + static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) { int prev_cpu = decode_tail_cpu(val); u32 yield_count; + int yield_cpu; if (!paravirt) goto relax; + if (!pv_yield_propagate_owner) + goto yield_prev; + + yield_cpu = READ_ONCE(node->yield_cpu); + if (yield_cpu == -1) { + /* Propagate back the -1 CPU */ + if (node->next && node->next->yield_cpu != -1) + node->next->yield_cpu = yield_cpu; + goto yield_prev; + } + + yield_count = yield_count_of(yield_cpu); + if ((yield_count & 1) == 0) + goto yield_prev; /* owner vcpu is running */ + + smp_rmb(); + + if (yield_cpu == node->yield_cpu) { + if (node->next && node->next->yield_cpu != yield_cpu) + node->next->yield_cpu = yield_cpu; + yield_to_preempted(yield_cpu, yield_count); + return; + } + +yield_prev: if (!pv_yield_prev) goto relax; @@ -293,6 +348,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b u32 val, old, tail; bool mustq = false; int idx; + int set_yield_cpu = -1; int iters = 0; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -314,6 +370,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b node = &qnodesp->nodes[idx]; node->next = NULL; node->lock = lock; + node->yield_cpu = -1; node->locked = 0; tail = encode_tail_cpu(smp_processor_id()); @@ -334,6 +391,10 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b while (!node->locked) yield_to_prev(lock, node, old, paravirt); + /* Clear out stale propagated yield_cpu */ + if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1) + node->yield_cpu = -1; + smp_rmb(); /* acquire barrier for the mcs lock */ } @@ -344,6 +405,7 @@ again: if (!(val & _Q_LOCKED_VAL)) break; + propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; @@ -512,6 +574,22 @@ static int pv_yield_prev_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n"); +static int pv_yield_propagate_owner_set(void *data, u64 val) +{ + pv_yield_propagate_owner = !!val; + + return 0; +} + +static int pv_yield_propagate_owner_get(void *data, u64 *val) +{ + *val = pv_yield_propagate_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n"); + static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); @@ -520,6 +598,7 @@ static __init int spinlock_debugfs_init(void) debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); + debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); } return 0; From be742c573fdafcfa1752642ca1c7aaf08c258128 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:26 +1000 Subject: [PATCH 114/181] powerpc/qspinlock: add ability to prod new queue head CPU After the head of the queue acquires the lock, it releases the next waiter in the queue to become the new head. Add an option to prod the new head if its vCPU was preempted. This may only have an effect if queue waiters are yielding. Disable this option by default for now, i.e., no logical change. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-12-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index c45f30c9a19e..2f6c0bed25ea 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -12,6 +12,7 @@ struct qnode { struct qnode *next; struct qspinlock *lock; + int cpu; int yield_cpu; u8 locked; /* 1 if lock acquired */ }; @@ -30,6 +31,7 @@ static bool pv_yield_owner __read_mostly = true; static bool pv_yield_allow_steal __read_mostly = false; static bool pv_yield_prev __read_mostly = true; static bool pv_yield_propagate_owner __read_mostly = true; +static bool pv_prod_head __read_mostly = false; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -370,10 +372,11 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b node = &qnodesp->nodes[idx]; node->next = NULL; node->lock = lock; + node->cpu = smp_processor_id(); node->yield_cpu = -1; node->locked = 0; - tail = encode_tail_cpu(smp_processor_id()); + tail = encode_tail_cpu(node->cpu); old = publish_tail_cpu(lock, tail); @@ -439,7 +442,14 @@ again: * this store to locked. The corresponding barrier is the smp_rmb() * acquire barrier for mcs lock, above. */ - WRITE_ONCE(next->locked, 1); + if (paravirt && pv_prod_head) { + int next_cpu = next->cpu; + WRITE_ONCE(next->locked, 1); + if (vcpu_is_preempted(next_cpu)) + prod_cpu(next_cpu); + } else { + WRITE_ONCE(next->locked, 1); + } release: qnodesp->count--; /* release the node */ @@ -590,6 +600,22 @@ static int pv_yield_propagate_owner_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n"); +static int pv_prod_head_set(void *data, u64 val) +{ + pv_prod_head = !!val; + + return 0; +} + +static int pv_prod_head_get(void *data, u64 *val) +{ + *val = pv_prod_head; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, "%llu\n"); + static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); @@ -599,6 +625,7 @@ static __init int spinlock_debugfs_init(void) debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); + debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head); } return 0; From f61ab43cc1a6146d6eef7e0713a452c3677ad13e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:27 +1000 Subject: [PATCH 115/181] powerpc/qspinlock: allow lock stealing in trylock and lock fastpath This change allows trylock to steal the lock. It also allows the initial lock attempt to steal the lock rather than bailing out and going to the slow path. This gives trylock more strength: without this a continually-contended lock will never permit a trylock to succeed. With this change, the trylock has a small but non-zero chance. It also gives the lock fastpath most of the benefit of passing the reservation back through to the steal loop in the slow path without the complexity. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-13-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 22 ++++++++++++++++++++-- arch/powerpc/lib/qspinlock.c | 9 +++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 9572a2ef974d..93b1c976db8a 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -6,6 +6,15 @@ #include <asm/qspinlock_types.h> #include <asm/paravirt.h> +/* + * The trylock itself may steal. This makes trylocks slightly stronger, and + * might make spin locks slightly more efficient when stealing. + * + * This is compile-time, so if true then there may always be stealers, so the + * nosteal paths become unused. + */ +#define _Q_SPIN_TRY_LOCK_STEAL 1 + static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { return READ_ONCE(lock->val); @@ -27,13 +36,14 @@ static __always_inline u32 queued_spin_encode_locked_val(void) return _Q_LOCKED_VAL | (smp_processor_id() << _Q_OWNER_CPU_OFFSET); } -static __always_inline int queued_spin_trylock(struct qspinlock *lock) +static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock) { u32 new = queued_spin_encode_locked_val(); u32 prev; + /* Trylock succeeds only when unlocked and no queued nodes */ asm volatile( -"1: lwarx %0,0,%1,%3 # queued_spin_trylock \n" +"1: lwarx %0,0,%1,%3 # __queued_spin_trylock_nosteal \n" " cmpwi 0,%0,0 \n" " bne- 2f \n" " stwcx. %2,0,%1 \n" @@ -72,6 +82,14 @@ static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) return likely(!(prev & ~_Q_TAIL_CPU_MASK)); } +static __always_inline int queued_spin_trylock(struct qspinlock *lock) +{ + if (!_Q_SPIN_TRY_LOCK_STEAL) + return __queued_spin_trylock_nosteal(lock); + else + return __queued_spin_trylock_steal(lock); +} + void queued_spin_lock_slowpath(struct qspinlock *lock); static __always_inline void queued_spin_lock(struct qspinlock *lock) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 2f6c0bed25ea..8e5b8bc3f094 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -24,7 +24,11 @@ struct qnodes { /* Tuning parameters */ static int steal_spins __read_mostly = (1 << 5); +#if _Q_SPIN_TRY_LOCK_STEAL == 1 +static const bool maybe_stealers = true; +#else static bool maybe_stealers __read_mostly = true; +#endif static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; @@ -483,6 +487,10 @@ void pv_spinlocks_init(void) #include <linux/debugfs.h> static int steal_spins_set(void *data, u64 val) { +#if _Q_SPIN_TRY_LOCK_STEAL == 1 + /* MAYBE_STEAL remains true */ + steal_spins = val; +#else static DEFINE_MUTEX(lock); /* @@ -507,6 +515,7 @@ static int steal_spins_set(void *data, u64 val) steal_spins = val; } mutex_unlock(&lock); +#endif return 0; } From 71c235027ce7940434acd3f553602ad8b5d36469 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:28 +1000 Subject: [PATCH 116/181] powerpc/qspinlock: use spin_begin/end API Use the spin_begin/spin_cpu_relax/spin_end APIs in qspinlock, which helps to prevent threads issuing a lot of expensive priority nops which may not have much effect due to immediately executing low then medium priority. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-14-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 39 ++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 8e5b8bc3f094..36aff7defda8 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -184,6 +184,7 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) BUG(); } +/* Called inside spin_begin() */ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) { int owner; @@ -203,6 +204,8 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 if ((yield_count & 1) == 0) goto relax; /* owner vcpu is running */ + spin_end(); + /* * Read the lock word after sampling the yield count. On the other side * there may a wmb because the yield count update is done by the @@ -218,18 +221,22 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 yield_to_preempted(owner, yield_count); if (mustq) set_mustq(lock); + spin_begin(); /* Don't relax if we yielded. Maybe we should? */ return; } + spin_begin(); relax: - cpu_relax(); + spin_cpu_relax(); } +/* Called inside spin_begin() */ static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) { __yield_to_locked_owner(lock, val, paravirt, false); } +/* Called inside spin_begin() */ static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) { bool mustq = false; @@ -267,6 +274,7 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int } } +/* Called inside spin_begin() */ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) { int prev_cpu = decode_tail_cpu(val); @@ -291,14 +299,18 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode * if ((yield_count & 1) == 0) goto yield_prev; /* owner vcpu is running */ + spin_end(); + smp_rmb(); if (yield_cpu == node->yield_cpu) { if (node->next && node->next->yield_cpu != yield_cpu) node->next->yield_cpu = yield_cpu; yield_to_preempted(yield_cpu, yield_count); + spin_begin(); return; } + spin_begin(); yield_prev: if (!pv_yield_prev) @@ -308,15 +320,19 @@ yield_prev: if ((yield_count & 1) == 0) goto relax; /* owner vcpu is running */ + spin_end(); + smp_rmb(); /* See __yield_to_locked_owner comment */ if (!node->locked) { yield_to_preempted(prev_cpu, yield_count); + spin_begin(); return; } + spin_begin(); relax: - cpu_relax(); + spin_cpu_relax(); } @@ -328,6 +344,8 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav return false; /* Attempt to steal the lock */ + spin_begin(); + do { u32 val = READ_ONCE(lock->val); @@ -335,8 +353,10 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav break; if (unlikely(!(val & _Q_LOCKED_VAL))) { + spin_end(); if (__queued_spin_trylock_steal(lock)) return true; + spin_begin(); } else { yield_to_locked_owner(lock, val, paravirt); } @@ -344,6 +364,8 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav iters++; } while (iters < get_steal_spins(paravirt)); + spin_end(); + return false; } @@ -395,8 +417,10 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b WRITE_ONCE(prev->next, node); /* Wait for mcs node lock to be released */ + spin_begin(); while (!node->locked) yield_to_prev(lock, node, old, paravirt); + spin_end(); /* Clear out stale propagated yield_cpu */ if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1) @@ -407,6 +431,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b again: /* We're at the head of the waitqueue, wait for the lock. */ + spin_begin(); for (;;) { val = READ_ONCE(lock->val); if (!(val & _Q_LOCKED_VAL)) @@ -424,6 +449,7 @@ again: val |= _Q_MUST_Q_VAL; } } + spin_end(); /* If we're the last queued, must clean up the tail. */ old = trylock_clean_tail(lock, tail); @@ -436,8 +462,13 @@ again: goto release; /* We were the tail, no next. */ /* There is a next, must wait for node->next != NULL (MCS protocol) */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + next = READ_ONCE(node->next); + if (!next) { + spin_begin(); + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + spin_end(); + } /* * Unlock the next mcs waiter node. Release barrier is not required From cc79701114154efe79663ba47d9e51aad2ed3c78 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:29 +1000 Subject: [PATCH 117/181] powerpc/qspinlock: reduce remote node steal spins Allow for a reduction in the number of times a CPU from a different node than the owner can attempt to steal the lock before queueing. This could bias the transfer behaviour of the lock across the machine and reduce NUMA crossings. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-15-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 43 +++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 36aff7defda8..8c6b5ef87118 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -4,6 +4,7 @@ #include <linux/export.h> #include <linux/percpu.h> #include <linux/smp.h> +#include <linux/topology.h> #include <asm/qspinlock.h> #include <asm/paravirt.h> @@ -24,6 +25,7 @@ struct qnodes { /* Tuning parameters */ static int steal_spins __read_mostly = (1 << 5); +static int remote_steal_spins __read_mostly = (1 << 2); #if _Q_SPIN_TRY_LOCK_STEAL == 1 static const bool maybe_stealers = true; #else @@ -44,6 +46,11 @@ static __always_inline int get_steal_spins(bool paravirt) return steal_spins; } +static __always_inline int get_remote_steal_spins(bool paravirt) +{ + return remote_steal_spins; +} + static __always_inline int get_head_spins(bool paravirt) { return head_spins; @@ -335,10 +342,24 @@ relax: spin_cpu_relax(); } +static __always_inline bool steal_break(u32 val, int iters, bool paravirt) +{ + if (iters >= get_steal_spins(paravirt)) + return true; + + if (IS_ENABLED(CONFIG_NUMA) && + (iters >= get_remote_steal_spins(paravirt))) { + int cpu = get_owner_cpu(val); + if (numa_node_id() != cpu_to_node(cpu)) + return true; + } + return false; +} static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) { int iters = 0; + u32 val; if (!steal_spins) return false; @@ -347,8 +368,7 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav spin_begin(); do { - u32 val = READ_ONCE(lock->val); - + val = READ_ONCE(lock->val); if (val & _Q_MUST_Q_VAL) break; @@ -362,7 +382,7 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav } iters++; - } while (iters < get_steal_spins(paravirt)); + } while (!steal_break(val, iters, paravirt)); spin_end(); @@ -560,6 +580,22 @@ static int steal_spins_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_steal_spins, steal_spins_get, steal_spins_set, "%llu\n"); +static int remote_steal_spins_set(void *data, u64 val) +{ + remote_steal_spins = val; + + return 0; +} + +static int remote_steal_spins_get(void *data, u64 *val) +{ + *val = remote_steal_spins; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_remote_steal_spins, remote_steal_spins_get, remote_steal_spins_set, "%llu\n"); + static int head_spins_set(void *data, u64 val) { head_spins = val; @@ -659,6 +695,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_pv_prod_head, pv_prod_head_get, pv_prod_head_set, " static __init int spinlock_debugfs_init(void) { debugfs_create_file("qspl_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_steal_spins); + debugfs_create_file("qspl_remote_steal_spins", 0600, arch_debugfs_dir, NULL, &fops_remote_steal_spins); debugfs_create_file("qspl_head_spins", 0600, arch_debugfs_dir, NULL, &fops_head_spins); if (is_shared_processor()) { debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); From 39dfc73596b48bb50cf7e4f3f54e38427dda5b4e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:30 +1000 Subject: [PATCH 118/181] powerpc/qspinlock: allow indefinite spinning on a preempted owner Provide an option that holds off queueing indefinitely while the lock owner is preempted. This could reduce queueing latencies for very overcommitted vcpu situations. This is disabled by default. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-16-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 77 +++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 8c6b5ef87118..eeaaecfd5b77 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -35,6 +35,7 @@ static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; static bool pv_yield_allow_steal __read_mostly = false; +static bool pv_spin_on_preempted_owner __read_mostly = false; static bool pv_yield_prev __read_mostly = true; static bool pv_yield_propagate_owner __read_mostly = true; static bool pv_prod_head __read_mostly = false; @@ -191,11 +192,12 @@ static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) BUG(); } -/* Called inside spin_begin() */ -static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt, bool mustq) { int owner; u32 yield_count; + bool preempted = false; BUG_ON(!(val & _Q_LOCKED_VAL)); @@ -213,6 +215,8 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 spin_end(); + preempted = true; + /* * Read the lock word after sampling the yield count. On the other side * there may a wmb because the yield count update is done by the @@ -229,29 +233,32 @@ static __always_inline void __yield_to_locked_owner(struct qspinlock *lock, u32 if (mustq) set_mustq(lock); spin_begin(); + /* Don't relax if we yielded. Maybe we should? */ - return; + return preempted; } spin_begin(); relax: spin_cpu_relax(); + + return preempted; } -/* Called inside spin_begin() */ -static __always_inline void yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) { - __yield_to_locked_owner(lock, val, paravirt, false); + return __yield_to_locked_owner(lock, val, paravirt, false); } -/* Called inside spin_begin() */ -static __always_inline void yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) +/* Called inside spin_begin(). Returns whether or not the vCPU was preempted. */ +static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u32 val, bool paravirt) { bool mustq = false; if ((val & _Q_MUST_Q_VAL) && pv_yield_allow_steal) mustq = true; - __yield_to_locked_owner(lock, val, paravirt, mustq); + return __yield_to_locked_owner(lock, val, paravirt, mustq); } static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt) @@ -361,13 +368,16 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav int iters = 0; u32 val; - if (!steal_spins) + if (!steal_spins) { + /* XXX: should spin_on_preempted_owner do anything here? */ return false; + } /* Attempt to steal the lock */ spin_begin(); - do { + bool preempted = false; + val = READ_ONCE(lock->val); if (val & _Q_MUST_Q_VAL) break; @@ -378,10 +388,23 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav return true; spin_begin(); } else { - yield_to_locked_owner(lock, val, paravirt); + preempted = yield_to_locked_owner(lock, val, paravirt); } - iters++; + if (preempted) { + if (!pv_spin_on_preempted_owner) + iters++; + /* + * pv_spin_on_preempted_owner don't increase iters + * while the owner is preempted -- we won't interfere + * with it by definition. This could introduce some + * latency issue if we continually observe preempted + * owners, but hopefully that's a rare corner case of + * a badly oversubscribed system. + */ + } else { + iters++; + } } while (!steal_break(val, iters, paravirt)); spin_end(); @@ -453,15 +476,22 @@ again: /* We're at the head of the waitqueue, wait for the lock. */ spin_begin(); for (;;) { + bool preempted; + val = READ_ONCE(lock->val); if (!(val & _Q_LOCKED_VAL)) break; propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); - yield_head_to_locked_owner(lock, val, paravirt); + preempted = yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; - iters++; + if (preempted) { + if (!pv_spin_on_preempted_owner) + iters++; + } else { + iters++; + } if (!mustq && iters >= get_head_spins(paravirt)) { mustq = true; @@ -644,6 +674,22 @@ static int pv_yield_allow_steal_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_allow_steal, pv_yield_allow_steal_get, pv_yield_allow_steal_set, "%llu\n"); +static int pv_spin_on_preempted_owner_set(void *data, u64 val) +{ + pv_spin_on_preempted_owner = !!val; + + return 0; +} + +static int pv_spin_on_preempted_owner_get(void *data, u64 *val) +{ + *val = pv_spin_on_preempted_owner; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n"); + static int pv_yield_prev_set(void *data, u64 val) { pv_yield_prev = !!val; @@ -700,6 +746,7 @@ static __init int spinlock_debugfs_init(void) if (is_shared_processor()) { debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); + debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner); debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head); From 12b459a5ebf3308e718bc1dd48acb7c4cf7f1a75 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:31 +1000 Subject: [PATCH 119/181] powerpc/qspinlock: provide accounting and options for sleepy locks Finding the owner or a queued waiter on a lock with a preempted vcpu is indicative of an oversubscribed guest causing the lock to get into trouble. Provide some options to detect this situation and have new CPUs avoid queueing for a longer time (more steal iterations) to minimise the problems caused by vcpu preemption on the queue. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-17-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock_types.h | 7 +- arch/powerpc/lib/qspinlock.c | 242 +++++++++++++++++++-- 2 files changed, 230 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h index adfeed4aa495..4766a7aa03cb 100644 --- a/arch/powerpc/include/asm/qspinlock_types.h +++ b/arch/powerpc/include/asm/qspinlock_types.h @@ -30,7 +30,7 @@ typedef struct qspinlock { * * 0: locked bit * 1-14: lock holder cpu - * 15: unused bit + * 15: lock owner or queuer vcpus observed to be preempted bit * 16: must queue bit * 17-31: tail cpu (+1) */ @@ -50,6 +50,11 @@ typedef struct qspinlock { #error "qspinlock does not support such large CONFIG_NR_CPUS" #endif +/* 0x00008000 */ +#define _Q_SLEEPY_OFFSET 15 +#define _Q_SLEEPY_BITS 1 +#define _Q_SLEEPY_VAL (1U << _Q_SLEEPY_OFFSET) + /* 0x00010000 */ #define _Q_MUST_Q_OFFSET 16 #define _Q_MUST_Q_BITS 1 diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index eeaaecfd5b77..0f33a07c1d19 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -5,6 +5,7 @@ #include <linux/percpu.h> #include <linux/smp.h> #include <linux/topology.h> +#include <linux/sched/clock.h> #include <asm/qspinlock.h> #include <asm/paravirt.h> @@ -36,25 +37,56 @@ static int head_spins __read_mostly = (1 << 8); static bool pv_yield_owner __read_mostly = true; static bool pv_yield_allow_steal __read_mostly = false; static bool pv_spin_on_preempted_owner __read_mostly = false; +static bool pv_sleepy_lock __read_mostly = true; +static bool pv_sleepy_lock_sticky __read_mostly = false; +static u64 pv_sleepy_lock_interval_ns __read_mostly = 0; +static int pv_sleepy_lock_factor __read_mostly = 256; static bool pv_yield_prev __read_mostly = true; static bool pv_yield_propagate_owner __read_mostly = true; static bool pv_prod_head __read_mostly = false; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); +static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock); -static __always_inline int get_steal_spins(bool paravirt) +static __always_inline bool recently_sleepy(void) { - return steal_spins; + /* pv_sleepy_lock is true when this is called */ + if (pv_sleepy_lock_interval_ns) { + u64 seen = this_cpu_read(sleepy_lock_seen_clock); + + if (seen) { + u64 delta = sched_clock() - seen; + if (delta < pv_sleepy_lock_interval_ns) + return true; + this_cpu_write(sleepy_lock_seen_clock, 0); + } + } + + return false; } -static __always_inline int get_remote_steal_spins(bool paravirt) +static __always_inline int get_steal_spins(bool paravirt, bool sleepy) { - return remote_steal_spins; + if (paravirt && sleepy) + return steal_spins * pv_sleepy_lock_factor; + else + return steal_spins; } -static __always_inline int get_head_spins(bool paravirt) +static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy) { - return head_spins; + if (paravirt && sleepy) + return remote_steal_spins * pv_sleepy_lock_factor; + else + return remote_steal_spins; +} + +static __always_inline int get_head_spins(bool paravirt, bool sleepy) +{ + if (paravirt && sleepy) + return head_spins * pv_sleepy_lock_factor; + else + return head_spins; } static inline u32 encode_tail_cpu(int cpu) @@ -168,6 +200,56 @@ static __always_inline u32 clear_mustq(struct qspinlock *lock) return prev; } +static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old) +{ + u32 prev; + u32 new = old | _Q_SLEEPY_VAL; + + BUG_ON(!(old & _Q_LOCKED_VAL)); + BUG_ON(old & _Q_SLEEPY_VAL); + + asm volatile( +"1: lwarx %0,0,%1 # try_set_sleepy \n" +" cmpw 0,%0,%2 \n" +" bne- 2f \n" +" stwcx. %3,0,%1 \n" +" bne- 1b \n" +"2: \n" + : "=&r" (prev) + : "r" (&lock->val), "r"(old), "r" (new) + : "cr0", "memory"); + + return likely(prev == old); +} + +static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } +} + +static __always_inline void seen_sleepy_lock(void) +{ + if (pv_sleepy_lock && pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); +} + +static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val) +{ + if (pv_sleepy_lock) { + if (pv_sleepy_lock_interval_ns) + this_cpu_write(sleepy_lock_seen_clock, sched_clock()); + if (val & _Q_LOCKED_VAL) { + if (!(val & _Q_SLEEPY_VAL)) + try_set_sleepy(lock, val); + } + } +} + static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) { int cpu = decode_tail_cpu(val); @@ -215,6 +297,7 @@ static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32 spin_end(); + seen_sleepy_owner(lock, val); preempted = true; /* @@ -289,11 +372,12 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int } /* Called inside spin_begin() */ -static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) +static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) { int prev_cpu = decode_tail_cpu(val); u32 yield_count; int yield_cpu; + bool preempted = false; if (!paravirt) goto relax; @@ -315,6 +399,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode * spin_end(); + preempted = true; + seen_sleepy_node(lock, val); + smp_rmb(); if (yield_cpu == node->yield_cpu) { @@ -322,7 +409,7 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode * node->next->yield_cpu = yield_cpu; yield_to_preempted(yield_cpu, yield_count); spin_begin(); - return; + return preempted; } spin_begin(); @@ -336,26 +423,31 @@ yield_prev: spin_end(); + preempted = true; + seen_sleepy_node(lock, val); + smp_rmb(); /* See __yield_to_locked_owner comment */ if (!node->locked) { yield_to_preempted(prev_cpu, yield_count); spin_begin(); - return; + return preempted; } spin_begin(); relax: spin_cpu_relax(); + + return preempted; } -static __always_inline bool steal_break(u32 val, int iters, bool paravirt) +static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy) { - if (iters >= get_steal_spins(paravirt)) + if (iters >= get_steal_spins(paravirt, sleepy)) return true; if (IS_ENABLED(CONFIG_NUMA) && - (iters >= get_remote_steal_spins(paravirt))) { + (iters >= get_remote_steal_spins(paravirt, sleepy))) { int cpu = get_owner_cpu(val); if (numa_node_id() != cpu_to_node(cpu)) return true; @@ -365,6 +457,8 @@ static __always_inline bool steal_break(u32 val, int iters, bool paravirt) static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt) { + bool seen_preempted = false; + bool sleepy = false; int iters = 0; u32 val; @@ -391,7 +485,25 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav preempted = yield_to_locked_owner(lock, val, paravirt); } + if (paravirt && pv_sleepy_lock) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + if (preempted) { + seen_preempted = true; + sleepy = true; if (!pv_spin_on_preempted_owner) iters++; /* @@ -405,7 +517,7 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav } else { iters++; } - } while (!steal_break(val, iters, paravirt)); + } while (!steal_break(val, iters, paravirt, sleepy)); spin_end(); @@ -417,6 +529,8 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b struct qnodes *qnodesp; struct qnode *next, *node; u32 val, old, tail; + bool seen_preempted = false; + bool sleepy = false; bool mustq = false; int idx; int set_yield_cpu = -1; @@ -461,8 +575,10 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b /* Wait for mcs node lock to be released */ spin_begin(); - while (!node->locked) - yield_to_prev(lock, node, old, paravirt); + while (!node->locked) { + if (yield_to_prev(lock, node, old, paravirt)) + seen_preempted = true; + } spin_end(); /* Clear out stale propagated yield_cpu */ @@ -472,8 +588,8 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b smp_rmb(); /* acquire barrier for the mcs lock */ } -again: /* We're at the head of the waitqueue, wait for the lock. */ +again: spin_begin(); for (;;) { bool preempted; @@ -482,18 +598,40 @@ again: if (!(val & _Q_LOCKED_VAL)) break; + if (paravirt && pv_sleepy_lock && maybe_stealers) { + if (!sleepy) { + if (val & _Q_SLEEPY_VAL) { + seen_sleepy_lock(); + sleepy = true; + } else if (recently_sleepy()) { + sleepy = true; + } + } + if (pv_sleepy_lock_sticky && seen_preempted && + !(val & _Q_SLEEPY_VAL)) { + if (try_set_sleepy(lock, val)) + val |= _Q_SLEEPY_VAL; + } + } + propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); preempted = yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; - if (preempted) { + + if (preempted) + seen_preempted = true; + + if (paravirt && preempted) { + sleepy = true; + if (!pv_spin_on_preempted_owner) iters++; } else { iters++; } - if (!mustq && iters >= get_head_spins(paravirt)) { + if (!mustq && iters >= get_head_spins(paravirt, sleepy)) { mustq = true; set_mustq(lock); val |= _Q_MUST_Q_VAL; @@ -690,6 +828,70 @@ static int pv_spin_on_preempted_owner_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n"); +static int pv_sleepy_lock_set(void *data, u64 val) +{ + pv_sleepy_lock = !!val; + + return 0; +} + +static int pv_sleepy_lock_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n"); + +static int pv_sleepy_lock_sticky_set(void *data, u64 val) +{ + pv_sleepy_lock_sticky = !!val; + + return 0; +} + +static int pv_sleepy_lock_sticky_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_sticky; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n"); + +static int pv_sleepy_lock_interval_ns_set(void *data, u64 val) +{ + pv_sleepy_lock_interval_ns = val; + + return 0; +} + +static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_interval_ns; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n"); + +static int pv_sleepy_lock_factor_set(void *data, u64 val) +{ + pv_sleepy_lock_factor = val; + + return 0; +} + +static int pv_sleepy_lock_factor_get(void *data, u64 *val) +{ + *val = pv_sleepy_lock_factor; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n"); + static int pv_yield_prev_set(void *data, u64 val) { pv_yield_prev = !!val; @@ -747,6 +949,10 @@ static __init int spinlock_debugfs_init(void) debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner); debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal); debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner); + debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock); + debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky); + debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns); + debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor); debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev); debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner); debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head); From 0b2199841a7952d01a717b465df028b40b2cf3e9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sat, 26 Nov 2022 19:59:32 +1000 Subject: [PATCH 120/181] powerpc/qspinlock: add compile-time tuning adjustments This adds compile-time options that allow the EH lock hint bit to be enabled or disabled, and adds some new options that may or may not help matters. To help with experimentation and tuning. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221126095932.1234527-18-npiggin@gmail.com --- arch/powerpc/include/asm/qspinlock.h | 61 ++++++++++++++++++++++++++-- arch/powerpc/lib/qspinlock.c | 39 ++++++++++++++++-- 2 files changed, 94 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h index 93b1c976db8a..28a53fb69b38 100644 --- a/arch/powerpc/include/asm/qspinlock.h +++ b/arch/powerpc/include/asm/qspinlock.h @@ -6,15 +6,68 @@ #include <asm/qspinlock_types.h> #include <asm/paravirt.h> +#ifdef CONFIG_PPC64 +/* + * Use the EH=1 hint for accesses that result in the lock being acquired. + * The hardware is supposed to optimise this pattern by holding the lock + * cacheline longer, and releasing when a store to the same memory (the + * unlock) is performed. + */ +#define _Q_SPIN_EH_HINT 1 +#else +#define _Q_SPIN_EH_HINT 0 +#endif + /* * The trylock itself may steal. This makes trylocks slightly stronger, and - * might make spin locks slightly more efficient when stealing. + * makes locks slightly more efficient when stealing. * * This is compile-time, so if true then there may always be stealers, so the * nosteal paths become unused. */ #define _Q_SPIN_TRY_LOCK_STEAL 1 +/* + * Put a speculation barrier after testing the lock/node and finding it + * busy. Try to prevent pointless speculation in slow paths. + * + * Slows down the lockstorm microbenchmark with no stealing, where locking + * is purely FIFO through the queue. May have more benefit in real workload + * where speculating into the wrong place could have a greater cost. + */ +#define _Q_SPIN_SPEC_BARRIER 0 + +#ifdef CONFIG_PPC64 +/* + * Execute a miso instruction after passing the MCS lock ownership to the + * queue head. Miso is intended to make stores visible to other CPUs sooner. + * + * This seems to make the lockstorm microbenchmark nospin test go slightly + * faster on POWER10, but disable for now. + */ +#define _Q_SPIN_MISO 0 +#else +#define _Q_SPIN_MISO 0 +#endif + +#ifdef CONFIG_PPC64 +/* + * This executes miso after an unlock of the lock word, having ownership + * pass to the next CPU sooner. This will slow the uncontended path to some + * degree. Not evidence it helps yet. + */ +#define _Q_SPIN_MISO_UNLOCK 0 +#else +#define _Q_SPIN_MISO_UNLOCK 0 +#endif + +/* + * Seems to slow down lockstorm microbenchmark, suspect queue node just + * has to become shared again right afterwards when its waiter spins on + * the lock field. + */ +#define _Q_SPIN_PREFETCH_NEXT 0 + static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { return READ_ONCE(lock->val); @@ -52,7 +105,7 @@ static __always_inline int __queued_spin_trylock_nosteal(struct qspinlock *lock) "2: \n" : "=&r" (prev) : "r" (&lock->val), "r" (new), - "i" (IS_ENABLED(CONFIG_PPC64)) + "i" (_Q_SPIN_EH_HINT) : "cr0", "memory"); return likely(prev == 0); @@ -76,7 +129,7 @@ static __always_inline int __queued_spin_trylock_steal(struct qspinlock *lock) "2: \n" : "=&r" (prev), "=&r" (tmp) : "r" (&lock->val), "r" (new), "r" (_Q_TAIL_CPU_MASK), - "i" (IS_ENABLED(CONFIG_PPC64)) + "i" (_Q_SPIN_EH_HINT) : "cr0", "memory"); return likely(!(prev & ~_Q_TAIL_CPU_MASK)); @@ -101,6 +154,8 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock) static inline void queued_spin_unlock(struct qspinlock *lock) { smp_store_release(&lock->locked, 0); + if (_Q_SPIN_MISO_UNLOCK) + asm volatile("miso" ::: "memory"); } #define arch_spin_is_locked(l) queued_spin_is_locked(l) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 0f33a07c1d19..1cf5d3e75250 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -48,6 +48,12 @@ static bool pv_prod_head __read_mostly = false; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock); +#if _Q_SPIN_SPEC_BARRIER == 1 +#define spec_barrier() do { asm volatile("ori 31,31,0" ::: "memory"); } while (0) +#else +#define spec_barrier() do { } while (0) +#endif + static __always_inline bool recently_sleepy(void) { /* pv_sleepy_lock is true when this is called */ @@ -137,7 +143,7 @@ static __always_inline u32 trylock_clean_tail(struct qspinlock *lock, u32 tail) : "r" (&lock->val), "r"(tail), "r" (newval), "i" (_Q_LOCKED_VAL), "r" (_Q_TAIL_CPU_MASK), - "i" (IS_ENABLED(CONFIG_PPC64)) + "i" (_Q_SPIN_EH_HINT) : "cr0", "memory"); return prev; @@ -475,6 +481,7 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav val = READ_ONCE(lock->val); if (val & _Q_MUST_Q_VAL) break; + spec_barrier(); if (unlikely(!(val & _Q_LOCKED_VAL))) { spin_end(); @@ -540,6 +547,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b qnodesp = this_cpu_ptr(&qnodes); if (unlikely(qnodesp->count >= MAX_NODES)) { + spec_barrier(); while (!queued_spin_trylock(lock)) cpu_relax(); return; @@ -576,9 +584,12 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b /* Wait for mcs node lock to be released */ spin_begin(); while (!node->locked) { + spec_barrier(); + if (yield_to_prev(lock, node, old, paravirt)) seen_preempted = true; } + spec_barrier(); spin_end(); /* Clear out stale propagated yield_cpu */ @@ -586,6 +597,17 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b node->yield_cpu = -1; smp_rmb(); /* acquire barrier for the mcs lock */ + + /* + * Generic qspinlocks have this prefetch here, but it seems + * like it could cause additional line transitions because + * the waiter will keep loading from it. + */ + if (_Q_SPIN_PREFETCH_NEXT) { + next = READ_ONCE(node->next); + if (next) + prefetchw(next); + } } /* We're at the head of the waitqueue, wait for the lock. */ @@ -597,6 +619,7 @@ again: val = READ_ONCE(lock->val); if (!(val & _Q_LOCKED_VAL)) break; + spec_barrier(); if (paravirt && pv_sleepy_lock && maybe_stealers) { if (!sleepy) { @@ -637,6 +660,7 @@ again: val |= _Q_MUST_Q_VAL; } } + spec_barrier(); spin_end(); /* If we're the last queued, must clean up the tail. */ @@ -657,6 +681,7 @@ again: cpu_relax(); spin_end(); } + spec_barrier(); /* * Unlock the next mcs waiter node. Release barrier is not required @@ -668,10 +693,14 @@ again: if (paravirt && pv_prod_head) { int next_cpu = next->cpu; WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); if (vcpu_is_preempted(next_cpu)) prod_cpu(next_cpu); } else { WRITE_ONCE(next->locked, 1); + if (_Q_SPIN_MISO) + asm volatile("miso" ::: "memory"); } release: @@ -686,12 +715,16 @@ void queued_spin_lock_slowpath(struct qspinlock *lock) * is passed as the paravirt argument to the functions. */ if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && is_shared_processor()) { - if (try_to_steal_lock(lock, true)) + if (try_to_steal_lock(lock, true)) { + spec_barrier(); return; + } queued_spin_lock_mcs_queue(lock, true); } else { - if (try_to_steal_lock(lock, false)) + if (try_to_steal_lock(lock, false)) { + spec_barrier(); return; + } queued_spin_lock_mcs_queue(lock, false); } } From c28c15b6d28a776538482101522cbcd9f906b15c Mon Sep 17 00:00:00 2001 From: "Christopher M. Riedl" <cmr@bluescreens.de> Date: Wed, 9 Nov 2022 15:51:11 +1100 Subject: [PATCH 121/181] powerpc/code-patching: Use temporary mm for Radix MMU x86 supports the notion of a temporary mm which restricts access to temporary PTEs to a single CPU. A temporary mm is useful for situations where a CPU needs to perform sensitive operations (such as patching a STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing said mappings to other CPUs. Another benefit is that other CPU TLBs do not need to be flushed when the temporary mm is torn down. Mappings in the temporary mm can be set in the userspace portion of the address-space. Interrupts must be disabled while the temporary mm is in use. HW breakpoints, which may have been set by userspace as watchpoints on addresses now within the temporary mm, are saved and disabled when loading the temporary mm. The HW breakpoints are restored when unloading the temporary mm. All HW breakpoints are indiscriminately disabled while the temporary mm is in use - this may include breakpoints set by perf. Use the `poking_init` init hook to prepare a temporary mm and patching address. Initialize the temporary mm using mm_alloc(). Choose a randomized patching address inside the temporary mm userspace address space. The patching address is randomized between PAGE_SIZE and DEFAULT_MAP_WINDOW-PAGE_SIZE. Bits of entropy with 64K page size on BOOK3S_64: bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE) PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB bits of entropy = log2(128TB / 64K) bits of entropy = 31 The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU operates - by default the space above DEFAULT_MAP_WINDOW is not available. Currently the Hash MMU does not use a temporary mm so technically this upper limit isn't necessary; however, a larger randomization range does not further "harden" this overall approach and future work may introduce patching with a temporary mm on Hash as well. Randomization occurs only once during initialization for each CPU as it comes online. The patching page is mapped with PAGE_KERNEL to set EAA[0] for the PTE which ignores the AMR (so no need to unlock/lock KUAP) according to PowerISA v3.0b Figure 35 on Radix. Based on x86 implementation: commit 4fc19708b165 ("x86/alternatives: Initialize temporary mm for patching") and: commit b3fd8e83ada0 ("x86/alternatives: Use temporary mm for text poking") From: Benjamin Gray <bgray@linux.ibm.com> Synchronisation is done according to ISA 3.1B Book 3 Chapter 13 "Synchronization Requirements for Context Alterations". Switching the mm is a change to the PID, which requires a CSI before and after the change, and a hwsync between the last instruction that performs address translation for an associated storage access. Instruction fetch is an associated storage access, but the instruction address mappings are not being changed, so it should not matter which context they use. We must still perform a hwsync to guard arbitrary prior code that may have accessed a userspace address. TLB invalidation is local and VA specific. Local because only this core used the patching mm, and VA specific because we only care that the writable mapping is purged. Leaving the other mappings intact is more efficient, especially when performing many code patches in a row (e.g., as ftrace would). Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de> Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> [mpe: Use mm_alloc() per 107b6828a7cd ("x86/mm: Use mm_alloc() in poking_init()")] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-9-bgray@linux.ibm.com --- arch/powerpc/lib/code-patching.c | 177 ++++++++++++++++++++++++++++++- 1 file changed, 172 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 3055eef7dcdc..a1902241ff5d 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -4,12 +4,17 @@ */ #include <linux/kprobes.h> +#include <linux/mmu_context.h> +#include <linux/random.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/cpuhotplug.h> #include <linux/uaccess.h> #include <linux/jump_label.h> +#include <asm/debug.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/page.h> #include <asm/code-patching.h> @@ -42,11 +47,54 @@ int raw_patch_instruction(u32 *addr, ppc_inst_t instr) } #ifdef CONFIG_STRICT_KERNEL_RWX + static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); +static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm); +static DEFINE_PER_CPU(unsigned long, cpu_patching_addr); +static DEFINE_PER_CPU(pte_t *, cpu_patching_pte); static int map_patch_area(void *addr, unsigned long text_poke_addr); static void unmap_patch_area(unsigned long addr); +static bool mm_patch_enabled(void) +{ + return IS_ENABLED(CONFIG_SMP) && radix_enabled(); +} + +/* + * The following applies for Radix MMU. Hash MMU has different requirements, + * and so is not supported. + * + * Changing mm requires context synchronising instructions on both sides of + * the context switch, as well as a hwsync between the last instruction for + * which the address of an associated storage access was translated using + * the current context. + * + * switch_mm_irqs_off() performs an isync after the context switch. It is + * the responsibility of the caller to perform the CSI and hwsync before + * starting/stopping the temp mm. + */ +static struct mm_struct *start_using_temp_mm(struct mm_struct *temp_mm) +{ + struct mm_struct *orig_mm = current->active_mm; + + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(orig_mm, temp_mm, current); + + WARN_ON(!mm_is_thread_local(temp_mm)); + + suspend_breakpoints(); + return orig_mm; +} + +static void stop_using_temp_mm(struct mm_struct *temp_mm, + struct mm_struct *orig_mm) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(temp_mm, orig_mm, current); + restore_breakpoints(); +} + static int text_area_cpu_up(unsigned int cpu) { struct vm_struct *area; @@ -79,14 +127,86 @@ static int text_area_cpu_down(unsigned int cpu) return 0; } +static void put_patching_mm(struct mm_struct *mm, unsigned long patching_addr) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm); + free_pgd_range(&tlb, patching_addr, patching_addr + PAGE_SIZE, 0, 0); + mmput(mm); +} + +static int text_area_cpu_up_mm(unsigned int cpu) +{ + struct mm_struct *mm; + unsigned long addr; + pte_t *pte; + spinlock_t *ptl; + + mm = mm_alloc(); + if (WARN_ON(!mm)) + goto fail_no_mm; + + /* + * Choose a random page-aligned address from the interval + * [PAGE_SIZE .. DEFAULT_MAP_WINDOW - PAGE_SIZE]. + * The lower address bound is PAGE_SIZE to avoid the zero-page. + */ + addr = (1 + (get_random_long() % (DEFAULT_MAP_WINDOW / PAGE_SIZE - 2))) << PAGE_SHIFT; + + /* + * PTE allocation uses GFP_KERNEL which means we need to + * pre-allocate the PTE here because we cannot do the + * allocation during patching when IRQs are disabled. + * + * Using get_locked_pte() to avoid open coding, the lock + * is unnecessary. + */ + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto fail_no_pte; + pte_unmap_unlock(pte, ptl); + + this_cpu_write(cpu_patching_mm, mm); + this_cpu_write(cpu_patching_addr, addr); + this_cpu_write(cpu_patching_pte, pte); + + return 0; + +fail_no_pte: + put_patching_mm(mm, addr); +fail_no_mm: + return -ENOMEM; +} + +static int text_area_cpu_down_mm(unsigned int cpu) +{ + put_patching_mm(this_cpu_read(cpu_patching_mm), + this_cpu_read(cpu_patching_addr)); + + this_cpu_write(cpu_patching_mm, NULL); + this_cpu_write(cpu_patching_addr, 0); + this_cpu_write(cpu_patching_pte, NULL); + + return 0; +} + static __ro_after_init DEFINE_STATIC_KEY_FALSE(poking_init_done); void __init poking_init(void) { - int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "powerpc/text_poke:online", - text_area_cpu_up, - text_area_cpu_down); + int ret; + + if (mm_patch_enabled()) + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke_mm:online", + text_area_cpu_up_mm, + text_area_cpu_down_mm); + else + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "powerpc/text_poke:online", + text_area_cpu_up, + text_area_cpu_down); /* cpuhp_setup_state returns >= 0 on success */ if (WARN_ON(ret < 0)) @@ -148,6 +268,50 @@ static void unmap_patch_area(unsigned long addr) flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } +static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) +{ + int err; + u32 *patch_addr; + unsigned long text_poke_addr; + pte_t *pte; + unsigned long pfn = get_patch_pfn(addr); + struct mm_struct *patching_mm; + struct mm_struct *orig_mm; + + patching_mm = __this_cpu_read(cpu_patching_mm); + pte = __this_cpu_read(cpu_patching_pte); + text_poke_addr = __this_cpu_read(cpu_patching_addr); + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + + /* order PTE update before use, also serves as the hwsync */ + asm volatile("ptesync": : :"memory"); + + /* order context switch after arbitrary prior code */ + isync(); + + orig_mm = start_using_temp_mm(patching_mm); + + err = __patch_instruction(addr, instr, patch_addr); + + /* hwsync performed by __patch_instruction (sync) if successful */ + if (err) + mb(); /* sync */ + + /* context synchronisation performed by __patch_instruction (isync or exception) */ + stop_using_temp_mm(patching_mm, orig_mm); + + pte_clear(patching_mm, text_poke_addr, pte); + /* + * ptesync to order PTE update before TLB invalidation done + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) + */ + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + + return err; +} + static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) { int err; @@ -187,7 +351,10 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) return raw_patch_instruction(addr, instr); local_irq_save(flags); - err = __do_patch_instruction(addr, instr); + if (mm_patch_enabled()) + err = __do_patch_instruction_mm(addr, instr); + else + err = __do_patch_instruction(addr, instr); local_irq_restore(flags); return err; From 2f228ee1ade5d8d1f26cf94863a36c5693023c58 Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Wed, 9 Nov 2022 15:51:12 +1100 Subject: [PATCH 122/181] powerpc/code-patching: Consolidate and cache per-cpu patching context With the temp mm context support, there are CPU local variables to hold the patch address and pte. Use these in the non-temp mm path as well instead of adding a level of indirection through the text_poke_area vm_struct and pointer chasing the pte. As both paths use these fields now, there is no need to let unreferenced variables be dropped by the compiler, so it is cleaner to merge them into a single context struct. This has the additional benefit of removing a redundant CPU local pointer, as only one of cpu_patching_mm / text_poke_area is ever used, while remaining well-typed. It also groups each CPU's data into a single cacheline. Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> [mpe: Shorten name to 'area' as suggested by Christophe] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221109045112.187069-10-bgray@linux.ibm.com --- arch/powerpc/lib/code-patching.c | 49 +++++++++++++++++++------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index a1902241ff5d..5b8f87db1217 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -48,10 +48,16 @@ int raw_patch_instruction(u32 *addr, ppc_inst_t instr) #ifdef CONFIG_STRICT_KERNEL_RWX -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area); -static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm); -static DEFINE_PER_CPU(unsigned long, cpu_patching_addr); -static DEFINE_PER_CPU(pte_t *, cpu_patching_pte); +struct patch_context { + union { + struct vm_struct *area; + struct mm_struct *mm; + }; + unsigned long addr; + pte_t *pte; +}; + +static DEFINE_PER_CPU(struct patch_context, cpu_patching_context); static int map_patch_area(void *addr, unsigned long text_poke_addr); static void unmap_patch_area(unsigned long addr); @@ -116,14 +122,19 @@ static int text_area_cpu_up(unsigned int cpu) unmap_patch_area(addr); - this_cpu_write(text_poke_area, area); + this_cpu_write(cpu_patching_context.area, area); + this_cpu_write(cpu_patching_context.addr, addr); + this_cpu_write(cpu_patching_context.pte, virt_to_kpte(addr)); return 0; } static int text_area_cpu_down(unsigned int cpu) { - free_vm_area(this_cpu_read(text_poke_area)); + free_vm_area(this_cpu_read(cpu_patching_context.area)); + this_cpu_write(cpu_patching_context.area, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + this_cpu_write(cpu_patching_context.pte, NULL); return 0; } @@ -167,9 +178,9 @@ static int text_area_cpu_up_mm(unsigned int cpu) goto fail_no_pte; pte_unmap_unlock(pte, ptl); - this_cpu_write(cpu_patching_mm, mm); - this_cpu_write(cpu_patching_addr, addr); - this_cpu_write(cpu_patching_pte, pte); + this_cpu_write(cpu_patching_context.mm, mm); + this_cpu_write(cpu_patching_context.addr, addr); + this_cpu_write(cpu_patching_context.pte, pte); return 0; @@ -181,12 +192,12 @@ fail_no_mm: static int text_area_cpu_down_mm(unsigned int cpu) { - put_patching_mm(this_cpu_read(cpu_patching_mm), - this_cpu_read(cpu_patching_addr)); + put_patching_mm(this_cpu_read(cpu_patching_context.mm), + this_cpu_read(cpu_patching_context.addr)); - this_cpu_write(cpu_patching_mm, NULL); - this_cpu_write(cpu_patching_addr, 0); - this_cpu_write(cpu_patching_pte, NULL); + this_cpu_write(cpu_patching_context.mm, NULL); + this_cpu_write(cpu_patching_context.addr, 0); + this_cpu_write(cpu_patching_context.pte, NULL); return 0; } @@ -278,9 +289,9 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) struct mm_struct *patching_mm; struct mm_struct *orig_mm; - patching_mm = __this_cpu_read(cpu_patching_mm); - pte = __this_cpu_read(cpu_patching_pte); - text_poke_addr = __this_cpu_read(cpu_patching_addr); + patching_mm = __this_cpu_read(cpu_patching_context.mm); + pte = __this_cpu_read(cpu_patching_context.pte); + text_poke_addr = __this_cpu_read(cpu_patching_context.addr); patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); @@ -320,10 +331,10 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) pte_t *pte; unsigned long pfn = get_patch_pfn(addr); - text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr & PAGE_MASK; + text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); - pte = virt_to_kpte(text_poke_addr); + pte = __this_cpu_read(cpu_patching_context.pte); __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); /* See ptesync comment in radix__set_pte_at() */ if (radix_enabled()) From f9231a996e229c13d23f907352c2cea84bd1c30a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 28 Nov 2022 14:15:36 +1000 Subject: [PATCH 123/181] module: add module_elf_check_arch for module-specific checks The elf_check_arch() function is also used to test compatibility of usermode binaries. Kernel modules may have more specific requirements, for example powerpc would like to test for ABI version compatibility. Add a weak module_elf_check_arch() that defaults to true, and call it from elf_validity_check(). Signed-off-by: Jessica Yu <jeyu@kernel.org> [np: added changelog, adjust name, rebase] Acked-by: Luis Chamberlain <mcgrof@kernel.org> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041539.1742489-2-npiggin@gmail.com --- include/linux/moduleloader.h | 3 +++ kernel/module/main.c | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 9e09d11ffe5b..7b4587a19189 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -13,6 +13,9 @@ * must be implemented by each architecture. */ +/* arch may override to do additional checking of ELF header architecture */ +bool module_elf_check_arch(Elf_Ehdr *hdr); + /* Adjust arch-specific sections. Return 0 on success. */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/kernel/module/main.c b/kernel/module/main.c index d02d39c7174e..7b3f6fb0d428 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1674,6 +1674,11 @@ static int elf_validity_check(struct load_info *info) info->hdr->e_machine); goto no_exec; } + if (!module_elf_check_arch(info->hdr)) { + pr_err("Invalid module architecture in ELF header: %u\n", + info->hdr->e_machine); + goto no_exec; + } if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) { pr_err("Invalid ELF section header size\n"); goto no_exec; @@ -2247,6 +2252,11 @@ static void flush_module_icache(const struct module *mod) (unsigned long)mod->core_layout.base + mod->core_layout.size); } +bool __weak module_elf_check_arch(Elf_Ehdr *hdr) +{ + return true; +} + int __weak module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, From de3d098dd1fc635535e3689c5d4aa0684242adde Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 28 Nov 2022 14:15:37 +1000 Subject: [PATCH 124/181] powerpc/64: Add module check for ELF ABI version Override the generic module ELF check to provide a check for the ELF ABI version. This becomes important if we allow big-endian ELF ABI V2 builds but it doesn't hurt to check now. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041539.1742489-3-npiggin@gmail.com --- arch/powerpc/kernel/module_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 7e45dc98df8a..ff045644f13f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -31,6 +31,16 @@ this, and makes other things simpler. Anton? --RR. */ +bool module_elf_check_arch(Elf_Ehdr *hdr) +{ + unsigned long abi_level = hdr->e_flags & 0x3; + + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + return abi_level == 2; + else + return abi_level < 2; +} + #ifdef CONFIG_PPC64_ELF_ABI_V2 static func_desc_t func_desc(unsigned long addr) From 505ea33089dcfc3ee3201b0fcb94751165805413 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 28 Nov 2022 14:15:38 +1000 Subject: [PATCH 125/181] powerpc/64: Add big-endian ELFv2 flavour to crypto VMX asm generation This allows asm generation for big-endian ELFv2 builds. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041539.1742489-4-npiggin@gmail.com --- drivers/crypto/vmx/Makefile | 12 +++++++++++- drivers/crypto/vmx/ppc-xlate.pl | 10 ++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/crypto/vmx/Makefile b/drivers/crypto/vmx/Makefile index 2560cfea1dec..e33c7238e7f8 100644 --- a/drivers/crypto/vmx/Makefile +++ b/drivers/crypto/vmx/Makefile @@ -2,8 +2,18 @@ obj-$(CONFIG_CRYPTO_DEV_VMX_ENCRYPT) += vmx-crypto.o vmx-crypto-objs := vmx.o aesp8-ppc.o ghashp8-ppc.o aes.o aes_cbc.o aes_ctr.o aes_xts.o ghash.o +ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) +override flavour := linux-ppc64le +else +ifdef CONFIG_PPC64_ELF_ABI_V2 +override flavour := linux-ppc64-elfv2 +else +override flavour := linux-ppc64 +endif +endif + quiet_cmd_perl = PERL $@ - cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@ + cmd_perl = $(PERL) $< $(flavour) > $@ targets += aesp8-ppc.S ghashp8-ppc.S diff --git a/drivers/crypto/vmx/ppc-xlate.pl b/drivers/crypto/vmx/ppc-xlate.pl index 36db2ef09e5b..b583898c11ae 100644 --- a/drivers/crypto/vmx/ppc-xlate.pl +++ b/drivers/crypto/vmx/ppc-xlate.pl @@ -9,6 +9,8 @@ open STDOUT,">$output" || die "can't open $output: $!"; my %GLOBALS; my $dotinlocallabels=($flavour=~/linux/)?1:0; +my $elfv2abi=(($flavour =~ /linux-ppc64le/) or ($flavour =~ /linux-ppc64-elfv2/))?1:0; +my $dotfunctions=($elfv2abi=~1)?0:1; ################################################################ # directives which need special treatment on different platforms @@ -40,7 +42,7 @@ my $globl = sub { }; my $text = sub { my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text"; - $ret = ".abiversion 2\n".$ret if ($flavour =~ /linux.*64le/); + $ret = ".abiversion 2\n".$ret if ($elfv2abi); $ret; }; my $machine = sub { @@ -56,8 +58,8 @@ my $size = sub { if ($flavour =~ /linux/) { shift; my $name = shift; $name =~ s|^[\.\_]||; - my $ret = ".size $name,.-".($flavour=~/64$/?".":"").$name; - $ret .= "\n.size .$name,.-.$name" if ($flavour=~/64$/); + my $ret = ".size $name,.-".($dotfunctions?".":"").$name; + $ret .= "\n.size .$name,.-.$name" if ($dotfunctions); $ret; } else @@ -142,7 +144,7 @@ my $vmr = sub { # Some ABIs specify vrsave, special-purpose register #256, as reserved # for system use. -my $no_vrsave = ($flavour =~ /linux-ppc64le/); +my $no_vrsave = ($elfv2abi); my $mtspr = sub { my ($f,$idx,$ra) = @_; if ($idx == 256 && $no_vrsave) { From 5017b45946722bdd20ac255c9ae7273b78d1f12e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 28 Nov 2022 14:15:39 +1000 Subject: [PATCH 126/181] powerpc/64: Option to build big-endian with ELFv2 ABI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide an option to build big-endian kernels using the ELFv2 ABI. This works on GCC only for now. Clang is rumored to support this, but core build files need updating first, at least. This gives big-endian kernels useful advantages of the ELFv2 ABI, e.g., less stack usage, -mprofile-kernel support, better compatibility with eBPF tools. BE+ELFv2 is not officially supported by the GNU toolchain, but it works fine in testing and has been used by some userspace for some time (e.g., Void Linux). Tested-by: Michal Suchánek <msuchanek@suse.de> Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org> Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Joel Stanley <joel@jms.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041539.1742489-5-npiggin@gmail.com --- arch/powerpc/Kconfig | 21 +++++++++++++++++++++ arch/powerpc/platforms/Kconfig.cputype | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index abaf1ef1795c..1a134c9769f8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1,6 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 source "arch/powerpc/platforms/Kconfig.cputype" +config CC_HAS_ELFV2 + def_bool PPC64 && $(cc-option, -mabi=elfv2) + config 32BIT bool default y if PPC32 @@ -586,6 +589,24 @@ config KEXEC_FILE config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE +config PPC64_BIG_ENDIAN_ELF_ABI_V2 + bool "Build big-endian kernel using ELF ABI V2 (EXPERIMENTAL)" + depends on PPC64 && CPU_BIG_ENDIAN + depends on CC_HAS_ELFV2 + depends on LD_IS_BFD && LD_VERSION >= 22400 + default n + help + This builds the kernel image using the "Power Architecture 64-Bit ELF + V2 ABI Specification", which has a reduced stack overhead and faster + function calls. This internal kernel ABI option does not affect + userspace compatibility. + + The V2 ABI is standard for 64-bit little-endian, but for big-endian + it is less well tested by kernel and toolchain. However some distros + build userspace this way, and it can produce a functioning kernel. + + This requires GCC and binutils 2.24 or newer. + config RELOCATABLE bool "Build a relocatable kernel" depends on PPC64 || (FLATMEM && (44x || PPC_85xx)) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7bac213b4125..9563336e3348 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -580,10 +580,10 @@ config CPU_LITTLE_ENDIAN endchoice config PPC64_ELF_ABI_V1 - def_bool PPC64 && CPU_BIG_ENDIAN + def_bool PPC64 && (CPU_BIG_ENDIAN && !PPC64_BIG_ENDIAN_ELF_ABI_V2) config PPC64_ELF_ABI_V2 - def_bool PPC64 && CPU_LITTLE_ENDIAN + def_bool PPC64 && !PPC64_ELF_ABI_V1 config PPC64_BOOT_WRAPPER def_bool n From d6aee468e4ecbfec46a3eafae4d31d6efc0d4da4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:27 +1000 Subject: [PATCH 127/181] powerpc/64: Remove asm interrupt tracing call helpers These are now unused. Remove. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-3-npiggin@gmail.com --- arch/powerpc/include/asm/irqflags.h | 58 ----------------------------- 1 file changed, 58 deletions(-) diff --git a/arch/powerpc/include/asm/irqflags.h b/arch/powerpc/include/asm/irqflags.h index 1a6c1ce17735..47d46712928a 100644 --- a/arch/powerpc/include/asm/irqflags.h +++ b/arch/powerpc/include/asm/irqflags.h @@ -11,64 +11,6 @@ */ #include <asm/hw_irq.h> -#else -#ifdef CONFIG_TRACE_IRQFLAGS -#ifdef CONFIG_IRQSOFF_TRACER -/* - * Since the ftrace irqsoff latency trace checks CALLER_ADDR1, - * which is the stack frame here, we need to force a stack frame - * in case we came from user space. - */ -#define TRACE_WITH_FRAME_BUFFER(func) \ - mflr r0; \ - stdu r1, -STACK_FRAME_OVERHEAD(r1); \ - std r0, 16(r1); \ - stdu r1, -STACK_FRAME_OVERHEAD(r1); \ - bl func; \ - ld r1, 0(r1); \ - ld r1, 0(r1); -#else -#define TRACE_WITH_FRAME_BUFFER(func) \ - bl func; -#endif - -/* - * These are calls to C code, so the caller must be prepared for volatiles to - * be clobbered. - */ -#define TRACE_ENABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_on) -#define TRACE_DISABLE_INTS TRACE_WITH_FRAME_BUFFER(trace_hardirqs_off) - -/* - * This is used by assembly code to soft-disable interrupts first and - * reconcile irq state. - * - * NB: This may call C code, so the caller must be prepared for volatiles to - * be clobbered. - */ -#define RECONCILE_IRQ_STATE(__rA, __rB) \ - lbz __rA,PACAIRQSOFTMASK(r13); \ - lbz __rB,PACAIRQHAPPENED(r13); \ - andi. __rA,__rA,IRQS_DISABLED; \ - li __rA,IRQS_DISABLED; \ - ori __rB,__rB,PACA_IRQ_HARD_DIS; \ - stb __rB,PACAIRQHAPPENED(r13); \ - bne 44f; \ - stb __rA,PACAIRQSOFTMASK(r13); \ - TRACE_DISABLE_INTS; \ -44: - -#else -#define TRACE_ENABLE_INTS -#define TRACE_DISABLE_INTS - -#define RECONCILE_IRQ_STATE(__rA, __rB) \ - lbz __rA,PACAIRQHAPPENED(r13); \ - li __rB,IRQS_DISABLED; \ - ori __rA,__rA,PACA_IRQ_HARD_DIS; \ - stb __rB,PACAIRQSOFTMASK(r13); \ - stb __rA,PACAIRQHAPPENED(r13) -#endif #endif #endif From 32c5209214bd8d4f8c4e9d9b630ef4c671f58e79 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:28 +1000 Subject: [PATCH 128/181] powerpc/perf: callchain validate kernel stack pointer bounds The interrupt frame detection and loads from the hypothetical pt_regs are not bounds-checked. The next-frame validation only bounds-checks STACK_FRAME_OVERHEAD, which does not include the pt_regs. Add another test for this. The user could set r1 to be equal to the address matching the first interrupt frame - STACK_INT_FRAME_SIZE, which is in the previous page due to the kernel redzone, and induce the kernel to load the marker from there. Possibly this could cause a crash at least. If the user could induce the previous page to contain a valid marker, then it might be able to direct perf to read specific memory addresses in a way that could be transmitted back to the user in the perf data. Fixes: 20002ded4d93 ("perf_counter: powerpc: Add callchain support") Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-4-npiggin@gmail.com --- arch/powerpc/perf/callchain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 082f6d0308a4..8718289c051d 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -61,6 +61,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re next_sp = fp[0]; if (next_sp == sp + STACK_INT_FRAME_SIZE && + validate_sp(sp, current, STACK_INT_FRAME_SIZE) && fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { /* * This looks like an interrupt frame for an From bc0677363d0ffaec0c56685291e97b080116976c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:29 +1000 Subject: [PATCH 129/181] powerpc: Rearrange copy_thread child stack creation This makes it a bit clearer where the stack frame is created, and will allow easier use of some of the stack offset constants in a later change. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-5-npiggin@gmail.com --- arch/powerpc/kernel/process.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5265da2d8034..f93703ea4a12 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1755,13 +1755,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) klp_init_thread_info(p); + /* Create initial stack frame. */ + sp -= (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD); + ((unsigned long *)sp)[0] = 0; + /* Copy registers */ - sp -= sizeof(struct pt_regs); - childregs = (struct pt_regs *) sp; + childregs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gpr[1] = sp + sizeof(struct pt_regs); + childregs->gpr[1] = sp + (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD); /* function */ if (args->fn) childregs->gpr[14] = ppc_function_entry((void *)args->fn); @@ -1796,7 +1799,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) f = ret_from_fork; } childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX); - sp -= STACK_FRAME_OVERHEAD; /* * The way this works is that at some point in the future @@ -1806,7 +1808,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ - ((unsigned long *)sp)[0] = 0; sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *) sp; sp -= STACK_FRAME_OVERHEAD; From baa49d81a94bb4170e7f2f4d97016772117d0f60 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:30 +1000 Subject: [PATCH 130/181] powerpc/pseries: hvcall stack frame overhead This call may use the min size stack frame. The scratch space used is in the caller's parameter area frame, not this function's frame. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-6-npiggin@gmail.com --- arch/powerpc/platforms/pseries/hvCall.S | 38 +++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S index 762eb15d3bd4..783c16ad648b 100644 --- a/arch/powerpc/platforms/pseries/hvCall.S +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -27,7 +27,9 @@ hcall_tracepoint_refcount: /* * precall must preserve all registers. use unused STK_PARAM() - * areas to save snapshots and opcode. + * areas to save snapshots and opcode. STK_PARAM() in the caller's + * frame will be available even on ELFv2 because these are all + * variadic functions. */ #define HCALL_INST_PRECALL(FIRST_REG) \ mflr r0; \ @@ -41,29 +43,29 @@ hcall_tracepoint_refcount: std r10,STK_PARAM(R10)(r1); \ std r0,16(r1); \ addi r4,r1,STK_PARAM(FIRST_REG); \ - stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + stdu r1,-STACK_FRAME_MIN_SIZE(r1); \ bl __trace_hcall_entry; \ - ld r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ - ld r4,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1); \ - ld r5,STACK_FRAME_OVERHEAD+STK_PARAM(R5)(r1); \ - ld r6,STACK_FRAME_OVERHEAD+STK_PARAM(R6)(r1); \ - ld r7,STACK_FRAME_OVERHEAD+STK_PARAM(R7)(r1); \ - ld r8,STACK_FRAME_OVERHEAD+STK_PARAM(R8)(r1); \ - ld r9,STACK_FRAME_OVERHEAD+STK_PARAM(R9)(r1); \ - ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R10)(r1) + ld r3,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ + ld r4,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1); \ + ld r5,STACK_FRAME_MIN_SIZE+STK_PARAM(R5)(r1); \ + ld r6,STACK_FRAME_MIN_SIZE+STK_PARAM(R6)(r1); \ + ld r7,STACK_FRAME_MIN_SIZE+STK_PARAM(R7)(r1); \ + ld r8,STACK_FRAME_MIN_SIZE+STK_PARAM(R8)(r1); \ + ld r9,STACK_FRAME_MIN_SIZE+STK_PARAM(R9)(r1); \ + ld r10,STACK_FRAME_MIN_SIZE+STK_PARAM(R10)(r1) /* * postcall is performed immediately before function return which * allows liberal use of volatile registers. */ #define __HCALL_INST_POSTCALL \ - ld r0,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ - std r3,STACK_FRAME_OVERHEAD+STK_PARAM(R3)(r1); \ + ld r0,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ + std r3,STACK_FRAME_MIN_SIZE+STK_PARAM(R3)(r1); \ mr r4,r3; \ mr r3,r0; \ bl __trace_hcall_exit; \ - ld r0,STACK_FRAME_OVERHEAD+16(r1); \ - addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,STACK_FRAME_MIN_SIZE+16(r1); \ + addi r1,r1,STACK_FRAME_MIN_SIZE; \ ld r3,STK_PARAM(R3)(r1); \ mtlr r0 @@ -303,14 +305,14 @@ plpar_hcall9_trace: mr r7,r8 mr r8,r9 mr r9,r10 - ld r10,STACK_FRAME_OVERHEAD+STK_PARAM(R11)(r1) - ld r11,STACK_FRAME_OVERHEAD+STK_PARAM(R12)(r1) - ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R13)(r1) + ld r10,STACK_FRAME_MIN_SIZE+STK_PARAM(R11)(r1) + ld r11,STACK_FRAME_MIN_SIZE+STK_PARAM(R12)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R13)(r1) HVSC mr r0,r12 - ld r12,STACK_FRAME_OVERHEAD+STK_PARAM(R4)(r1) + ld r12,STACK_FRAME_MIN_SIZE+STK_PARAM(R4)(r1) std r4,0(r12) std r5,8(r12) std r6,16(r12) From 37195b820d32c23bdefce3f460ed7de48a57e5e4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:31 +1000 Subject: [PATCH 131/181] powerpc: simplify ppc_save_regs Adjust the pt_regs pointer so the interrupt frame offsets can be used to save registers. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-7-npiggin@gmail.com --- arch/powerpc/kernel/ppc_save_regs.S | 57 ++++++++--------------------- 1 file changed, 15 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 2d4d21bb46a9..6e86f3bf4673 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -21,60 +21,33 @@ * different ABIs, though). */ _GLOBAL(ppc_save_regs) - PPC_STL r0,0*SZL(r3) + /* This allows stack frame accessor macros and offsets to be used */ + subi r3,r3,STACK_FRAME_OVERHEAD + PPC_STL r0,GPR0(r3) #ifdef CONFIG_PPC32 - stmw r2, 2*SZL(r3) + stmw r2,GPR2(r3) #else - PPC_STL r2,2*SZL(r3) - PPC_STL r3,3*SZL(r3) - PPC_STL r4,4*SZL(r3) - PPC_STL r5,5*SZL(r3) - PPC_STL r6,6*SZL(r3) - PPC_STL r7,7*SZL(r3) - PPC_STL r8,8*SZL(r3) - PPC_STL r9,9*SZL(r3) - PPC_STL r10,10*SZL(r3) - PPC_STL r11,11*SZL(r3) - PPC_STL r12,12*SZL(r3) - PPC_STL r13,13*SZL(r3) - PPC_STL r14,14*SZL(r3) - PPC_STL r15,15*SZL(r3) - PPC_STL r16,16*SZL(r3) - PPC_STL r17,17*SZL(r3) - PPC_STL r18,18*SZL(r3) - PPC_STL r19,19*SZL(r3) - PPC_STL r20,20*SZL(r3) - PPC_STL r21,21*SZL(r3) - PPC_STL r22,22*SZL(r3) - PPC_STL r23,23*SZL(r3) - PPC_STL r24,24*SZL(r3) - PPC_STL r25,25*SZL(r3) - PPC_STL r26,26*SZL(r3) - PPC_STL r27,27*SZL(r3) - PPC_STL r28,28*SZL(r3) - PPC_STL r29,29*SZL(r3) - PPC_STL r30,30*SZL(r3) - PPC_STL r31,31*SZL(r3) + SAVE_GPRS(2, 31, r3) lbz r0,PACAIRQSOFTMASK(r13) - PPC_STL r0,SOFTE-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,SOFTE(r3) #endif /* go up one stack frame for SP */ PPC_LL r4,0(r1) - PPC_STL r4,1*SZL(r3) + PPC_STL r4,GPR1(r3) /* get caller's LR */ PPC_LL r0,LRSAVE(r4) - PPC_STL r0,_LINK-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_LINK(r3) mflr r0 - PPC_STL r0,_NIP-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_NIP(r3) mfmsr r0 - PPC_STL r0,_MSR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_MSR(r3) mfctr r0 - PPC_STL r0,_CTR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CTR(r3) mfxer r0 - PPC_STL r0,_XER-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_XER(r3) mfcr r0 - PPC_STL r0,_CCR-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_CCR(r3) li r0,0 - PPC_STL r0,_TRAP-STACK_FRAME_OVERHEAD(r3) - PPC_STL r0,ORIG_GPR3-STACK_FRAME_OVERHEAD(r3) + PPC_STL r0,_TRAP(r3) + PPC_STL r0,ORIG_GPR3(r3) blr From c03be0a3f3cc656eab5c427b78959b8f1b169a11 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:32 +1000 Subject: [PATCH 132/181] powerpc: add definition for pt_regs offset within an interrupt frame This is a common offset that currently uses the overloaded STACK_FRAME_OVERHEAD constant. It's easier to read and more flexible to use a specific regs offset for this. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-8-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 2 + arch/powerpc/kernel/asm-offsets.c | 7 +- arch/powerpc/kernel/entry_32.S | 6 +- arch/powerpc/kernel/exceptions-64e.S | 42 +++++----- arch/powerpc/kernel/exceptions-64s.S | 80 +++++++++---------- arch/powerpc/kernel/head_32.h | 2 +- arch/powerpc/kernel/head_85xx.S | 4 +- arch/powerpc/kernel/head_booke.h | 2 +- arch/powerpc/kernel/interrupt_64.S | 22 ++--- arch/powerpc/kernel/kgdb.c | 2 +- arch/powerpc/kernel/optprobes_head.S | 4 +- arch/powerpc/kernel/ppc_save_regs.S | 2 +- arch/powerpc/kernel/process.c | 4 +- arch/powerpc/kernel/tm.S | 8 +- arch/powerpc/kernel/trace/ftrace_mprofile.S | 2 +- .../lib/test_emulate_step_exec_instr.S | 2 +- arch/powerpc/perf/callchain.c | 2 +- arch/powerpc/xmon/xmon.c | 7 +- 18 files changed, 100 insertions(+), 100 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 2efec6d87049..a4ae67aa9b76 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -124,6 +124,7 @@ struct pt_regs #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + \ STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) +#define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_FRAME_MARKER 12 #ifdef CONFIG_PPC64_ELF_ABI_V2 @@ -143,6 +144,7 @@ struct pt_regs #define STACK_FRAME_OVERHEAD 16 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 1 /* Location of LR in stack frame */ #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) +#define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b4b661f631f5..68905c9f7c21 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -72,7 +72,7 @@ #endif #define STACK_PT_REGS_OFFSET(sym, val) \ - DEFINE(sym, STACK_FRAME_OVERHEAD + offsetof(struct pt_regs, val)) + DEFINE(sym, STACK_INT_FRAME_REGS + offsetof(struct pt_regs, val)) int main(void) { @@ -167,9 +167,8 @@ int main(void) OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); - /* Local pt_regs on stack for Transactional Memory funcs. */ - DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + - sizeof(struct pt_regs) + 16); + /* Local pt_regs on stack in int frame form, plus 16 bytes for TM */ + DEFINE(TM_FRAME_SIZE, STACK_INT_FRAME_SIZE + 16); #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3fc7c9886bb7..24c8d84a56c9 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -123,12 +123,12 @@ transfer_to_syscall: kuep_lock /* Calling convention has r3 = regs, r4 = orig r0 */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS mr r4,r0 bl system_call_exception ret_from_syscall: - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS li r5,0 bl syscall_exit_prepare #ifdef CONFIG_PPC_47x @@ -293,7 +293,7 @@ _ASM_NOKPROBE_SYMBOL(fast_exception_return) .globl interrupt_return interrupt_return: lwz r4,_MSR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS andi. r0,r4,MSR_PR beq .Lkernel_interrupt_return bl interrupt_exit_user_prepare diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 2f68fb2ee4fc..62033d022e0a 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -455,7 +455,7 @@ exc_##n##_bad_stack: \ EXCEPTION_COMMON(trapnum) \ ack(r8); \ CHECK_NAPPING(); \ - addi r3,r1,STACK_FRAME_OVERHEAD; \ + addi r3,r1,STACK_INT_FRAME_REGS; \ bl hdlr; \ b interrupt_return @@ -504,7 +504,7 @@ __end_interrupts: EXCEPTION_COMMON_CRIT(0x100) bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_nmi_exception b ret_from_crit_except @@ -515,7 +515,7 @@ __end_interrupts: EXCEPTION_COMMON_MC(0x000) bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl machine_check_exception b ret_from_mc_except @@ -570,7 +570,7 @@ __end_interrupts: std r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl program_check_exception REST_NVGPRS(r1) b interrupt_return @@ -586,7 +586,7 @@ __end_interrupts: beq- 1f bl load_up_fpu b fast_interrupt_return -1: addi r3,r1,STACK_FRAME_OVERHEAD +1: addi r3,r1,STACK_INT_FRAME_REGS bl kernel_fp_unavailable_exception b interrupt_return @@ -606,7 +606,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl altivec_unavailable_exception b interrupt_return @@ -616,7 +616,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) BOOKE_INTERRUPT_ALTIVEC_ASSIST, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x220) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_ALTIVEC BEGIN_FTR_SECTION bl altivec_assist_exception @@ -643,7 +643,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) EXCEPTION_COMMON_CRIT(0x9f0) bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_BOOKE_WDT bl WatchdogException #else @@ -664,7 +664,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) NORMAL_EXCEPTION_PROLOG(0xf20, BOOKE_INTERRUPT_AP_UNAVAIL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0xf20) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return @@ -731,7 +731,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) ld r14,PACA_EXCRIT+EX_R14(r13) ld r15,PACA_EXCRIT+EX_R15(r13) EXCEPTION_COMMON_CRIT(0xd00) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl DebugException REST_NVGPRS(r1) b interrupt_return @@ -802,7 +802,7 @@ kernel_dbg_exc: ld r14,PACA_EXDBG+EX_R14(r13) ld r15,PACA_EXDBG+EX_R15(r13) EXCEPTION_COMMON_DBG(0xd08) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl DebugException REST_NVGPRS(r1) b interrupt_return @@ -812,7 +812,7 @@ kernel_dbg_exc: PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x260) CHECK_NAPPING() - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS /* * XXX: Returning from performance_monitor_exception taken as a * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return @@ -834,7 +834,7 @@ kernel_dbg_exc: EXCEPTION_COMMON_CRIT(0x2a0) bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_nmi_exception b ret_from_crit_except @@ -846,7 +846,7 @@ kernel_dbg_exc: GDBELL_EXCEPTION_PROLOG(0x2c0, BOOKE_INTERRUPT_GUEST_DBELL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x2c0) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return @@ -857,7 +857,7 @@ kernel_dbg_exc: EXCEPTION_COMMON_CRIT(0x2e0) bl special_reg_save CHECK_NAPPING(); - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_nmi_exception b ret_from_crit_except @@ -866,7 +866,7 @@ kernel_dbg_exc: NORMAL_EXCEPTION_PROLOG(0x310, BOOKE_INTERRUPT_HV_SYSCALL, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x310) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return @@ -875,7 +875,7 @@ kernel_dbg_exc: NORMAL_EXCEPTION_PROLOG(0x320, BOOKE_INTERRUPT_HV_PRIV, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x320) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return @@ -884,7 +884,7 @@ kernel_dbg_exc: NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR, PROLOG_ADDITION_NONE) EXCEPTION_COMMON(0x340) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return @@ -979,7 +979,7 @@ masked_interrupt_book3e_0x2c0: * original values stashed away in the PACA */ storage_fault_common: - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_page_fault b interrupt_return @@ -988,7 +988,7 @@ storage_fault_common: * continues here. */ alignment_more: - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl alignment_exception REST_NVGPRS(r1) b interrupt_return @@ -1069,7 +1069,7 @@ bad_stack_book3e: ZEROIZE_GPR(12) std r12,0(r11) LOAD_PACA_TOC() -1: addi r3,r1,STACK_FRAME_OVERHEAD +1: addi r3,r1,STACK_INT_FRAME_REGS bl kernel_bad_stack b 1b diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 651c36b056bd..29b78536ca59 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1061,7 +1061,7 @@ EXC_COMMON_BEGIN(system_reset_common) subi r1,r1,INT_FRAME_SIZE __GEN_COMMON_BODY system_reset - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl system_reset_exception /* Clear MSR_RI before setting SRR0 and SRR1. */ @@ -1208,7 +1208,7 @@ EXC_COMMON_BEGIN(machine_check_early_common) BEGIN_FTR_SECTION bl enable_machine_check END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS BEGIN_FTR_SECTION bl machine_check_early_boot END_FTR_SECTION(0, 1) // nop out after boot @@ -1298,7 +1298,7 @@ EXC_COMMON_BEGIN(machine_check_common) * save area: PACA_EXMC instead of PACA_EXGEN. */ GEN_COMMON machine_check - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl machine_check_exception_async b interrupt_return_srr @@ -1364,14 +1364,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) * This is the NMI version of the handler because we are called from * the early handler which is a true NMI. */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl machine_check_exception /* * We will not reach here. Even if we did, there is no way out. * Call unrecoverable_exception and die. */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unrecoverable_exception b . @@ -1422,7 +1422,7 @@ EXC_VIRT_END(data_access, 0x4300, 0x80) EXC_COMMON_BEGIN(data_access_common) GEN_COMMON data_access ld r4,_DSISR(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS andis. r0,r4,DSISR_DABRMATCH@h bne- 1f #ifdef CONFIG_PPC_64S_HASH_MMU @@ -1479,7 +1479,7 @@ EXC_COMMON_BEGIN(data_access_slb_common) #ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_slb_fault cmpdi r3,0 bne- 1f @@ -1493,7 +1493,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) li r3,-EFAULT #endif std r3,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_bad_segment_interrupt b interrupt_return_srr @@ -1525,7 +1525,7 @@ EXC_VIRT_BEGIN(instruction_access, 0x4400, 0x80) EXC_VIRT_END(instruction_access, 0x4400, 0x80) EXC_COMMON_BEGIN(instruction_access_common) GEN_COMMON instruction_access - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION bl do_hash_fault @@ -1567,7 +1567,7 @@ EXC_COMMON_BEGIN(instruction_access_slb_common) #ifdef CONFIG_PPC_64S_HASH_MMU BEGIN_MMU_FTR_SECTION /* HPT case, do SLB fault */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_slb_fault cmpdi r3,0 bne- 1f @@ -1581,7 +1581,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) li r3,-EFAULT #endif std r3,RESULT(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_bad_segment_interrupt b interrupt_return_srr @@ -1635,7 +1635,7 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) EXC_COMMON_BEGIN(hardware_interrupt_common) GEN_COMMON hardware_interrupt - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_IRQ BEGIN_FTR_SECTION b interrupt_return_hsrr @@ -1665,7 +1665,7 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) EXC_VIRT_END(alignment, 0x4600, 0x100) EXC_COMMON_BEGIN(alignment_common) GEN_COMMON alignment - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl alignment_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -1731,7 +1731,7 @@ EXC_COMMON_BEGIN(program_check_common) __GEN_COMMON_BODY program_check .Ldo_program_check: - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl program_check_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -1762,7 +1762,7 @@ EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) EXC_COMMON_BEGIN(fp_unavailable_common) GEN_COMMON fp_unavailable bne 1f /* if from user, just load it up */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl kernel_fp_unavailable_exception 0: trap EMIT_BUG_ENTRY 0b, __FILE__, __LINE__, 0 @@ -1780,7 +1780,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) b fast_interrupt_return_srr #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl fp_unavailable_tm b interrupt_return_srr #endif @@ -1824,7 +1824,7 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80) EXC_VIRT_END(decrementer, 0x4900, 0x80) EXC_COMMON_BEGIN(decrementer_common) GEN_COMMON decrementer - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl timer_interrupt b interrupt_return_srr @@ -1909,7 +1909,7 @@ EXC_VIRT_BEGIN(doorbell_super, 0x4a00, 0x100) EXC_VIRT_END(doorbell_super, 0x4a00, 0x100) EXC_COMMON_BEGIN(doorbell_super_common) GEN_COMMON doorbell_super - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_PPC_DOORBELL bl doorbell_exception #else @@ -2076,7 +2076,7 @@ EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) EXC_VIRT_END(single_step, 0x4d00, 0x100) EXC_COMMON_BEGIN(single_step_common) GEN_COMMON single_step - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl single_step_exception b interrupt_return_srr @@ -2110,7 +2110,7 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) EXC_COMMON_BEGIN(h_data_storage_common) GEN_COMMON h_data_storage - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS BEGIN_MMU_FTR_SECTION bl do_bad_page_fault_segv MMU_FTR_SECTION_ELSE @@ -2139,7 +2139,7 @@ EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) EXC_COMMON_BEGIN(h_instr_storage_common) GEN_COMMON h_instr_storage - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return_hsrr @@ -2162,7 +2162,7 @@ EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) EXC_COMMON_BEGIN(emulation_assist_common) GEN_COMMON emulation_assist - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl emulation_assist_interrupt REST_NVGPRS(r1) /* instruction emulation may change GPRs */ b interrupt_return_hsrr @@ -2222,7 +2222,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) __GEN_COMMON_BODY hmi_exception_early - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl hmi_exception_realmode cmpdi cr0,r3,0 bne 1f @@ -2240,7 +2240,7 @@ EXC_COMMON_BEGIN(hmi_exception_early_common) EXC_COMMON_BEGIN(hmi_exception_common) GEN_COMMON hmi_exception - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl handle_hmi_exception b interrupt_return_hsrr @@ -2274,7 +2274,7 @@ EXC_VIRT_BEGIN(h_doorbell, 0x4e80, 0x20) EXC_VIRT_END(h_doorbell, 0x4e80, 0x20) EXC_COMMON_BEGIN(h_doorbell_common) GEN_COMMON h_doorbell - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_PPC_DOORBELL bl doorbell_exception #else @@ -2310,7 +2310,7 @@ EXC_VIRT_BEGIN(h_virt_irq, 0x4ea0, 0x20) EXC_VIRT_END(h_virt_irq, 0x4ea0, 0x20) EXC_COMMON_BEGIN(h_virt_irq_common) GEN_COMMON h_virt_irq - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl do_IRQ b interrupt_return_hsrr @@ -2356,7 +2356,7 @@ EXC_VIRT_BEGIN(performance_monitor, 0x4f00, 0x20) EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS lbz r4,PACAIRQSOFTMASK(r13) cmpdi r4,IRQS_ENABLED bne 1f @@ -2410,14 +2410,14 @@ BEGIN_FTR_SECTION b fast_interrupt_return_srr #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl altivec_unavailable_tm b interrupt_return_srr #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl altivec_unavailable_exception b interrupt_return_srr @@ -2458,14 +2458,14 @@ BEGIN_FTR_SECTION b load_up_vsx #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl vsx_unavailable_tm b interrupt_return_srr #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl vsx_unavailable_exception b interrupt_return_srr @@ -2492,7 +2492,7 @@ EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) EXC_COMMON_BEGIN(facility_unavailable_common) GEN_COMMON facility_unavailable - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl facility_unavailable_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -2520,7 +2520,7 @@ EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) EXC_COMMON_BEGIN(h_facility_unavailable_common) GEN_COMMON h_facility_unavailable - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl facility_unavailable_exception REST_NVGPRS(r1) /* XXX Shouldn't be necessary in practice */ b interrupt_return_hsrr @@ -2550,7 +2550,7 @@ EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) EXC_COMMON_BEGIN(cbe_system_error_common) GEN_COMMON cbe_system_error - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl cbe_system_error_exception b interrupt_return_hsrr @@ -2581,7 +2581,7 @@ EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) EXC_COMMON_BEGIN(instruction_breakpoint_common) GEN_COMMON instruction_breakpoint - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl instruction_breakpoint_exception b interrupt_return_srr @@ -2703,7 +2703,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) EXC_COMMON_BEGIN(denorm_exception_common) GEN_COMMON denorm_exception - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unknown_exception b interrupt_return_hsrr @@ -2720,7 +2720,7 @@ EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) EXC_COMMON_BEGIN(cbe_maintenance_common) GEN_COMMON cbe_maintenance - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl cbe_maintenance_exception b interrupt_return_hsrr @@ -2745,7 +2745,7 @@ EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) EXC_VIRT_END(altivec_assist, 0x5700, 0x100) EXC_COMMON_BEGIN(altivec_assist_common) GEN_COMMON altivec_assist - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_ALTIVEC bl altivec_assist_exception REST_NVGPRS(r1) /* instruction emulation may change GPRs */ @@ -2767,7 +2767,7 @@ EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) EXC_COMMON_BEGIN(cbe_thermal_common) GEN_COMMON cbe_thermal - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl cbe_thermal_exception b interrupt_return_hsrr @@ -2800,7 +2800,7 @@ EXC_COMMON_BEGIN(soft_nmi_common) subi r1,r1,INT_FRAME_SIZE __GEN_COMMON_BODY soft_nmi - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl soft_nmi_interrupt /* Clear MSR_RI before setting SRR0 and SRR1. */ diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index c3286260a7d1..117d25330e13 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -127,7 +127,7 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) mfspr r10,SPRN_XER addi r2, r2, -THREAD stw r10,_XER(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS .endm .macro prepare_transfer_to_handler diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 52c0ab416326..24f39abf81df 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -972,10 +972,10 @@ _GLOBAL(__giveup_spe) li r4,THREAD_ACC evstddx evr6, r4, r3 /* save off accumulator */ beq 1f - lwz r4,_MSR-STACK_FRAME_OVERHEAD(r5) + lwz r4,_MSR-STACK_INT_FRAME_REGS(r5) lis r3,MSR_SPE@h andc r4,r4,r3 /* disable SPE for previous task */ - stw r4,_MSR-STACK_FRAME_OVERHEAD(r5) + stw r4,_MSR-STACK_INT_FRAME_REGS(r5) 1: blr #endif /* CONFIG_SPE */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 1cb9d0f7cbf2..3149ac20b18e 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -99,7 +99,7 @@ END_BTB_FLUSH_SECTION mfspr r10,SPRN_XER addi r2, r2, -THREAD stw r10,_XER(r1) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS .endm .macro prepare_transfer_to_handler diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index a019ed6fc839..49d585eae7c8 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -78,7 +78,7 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) std r12,_CCR(r1) std r3,ORIG_GPR3(r1) /* Calling convention has r3 = regs, r4 = orig r0 */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS mr r4,r0 LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r11,-16(r3) /* "regshere" marker */ @@ -99,7 +99,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) bl system_call_exception .Lsyscall_vectored_\name\()_exit: - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS li r5,1 /* scv */ bl syscall_exit_prepare std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ @@ -176,7 +176,7 @@ _ASM_NOKPROBE_SYMBOL(syscall_vectored_\name\()_restart) ld r1,PACA_EXIT_SAVE_R1(r13) LOAD_PACA_TOC() ld r3,RESULT(r1) - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) bl syscall_exit_restart @@ -251,7 +251,7 @@ END_BTB_FLUSH_SECTION std r12,_CCR(r1) std r3,ORIG_GPR3(r1) /* Calling convention has r3 = regs, r4 = orig r0 */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS mr r4,r0 LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r11,-16(r3) /* "regshere" marker */ @@ -278,7 +278,7 @@ END_BTB_FLUSH_SECTION bl system_call_exception .Lsyscall_exit: - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS li r5,0 /* !scv */ bl syscall_exit_prepare std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ @@ -357,7 +357,7 @@ _ASM_NOKPROBE_SYMBOL(syscall_restart) ld r1,PACA_EXIT_SAVE_R1(r13) LOAD_PACA_TOC() ld r3,RESULT(r1) - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) bl syscall_exit_restart @@ -388,7 +388,7 @@ _ASM_NOKPROBE_SYMBOL(fast_interrupt_return_srr) andi. r0,r5,MSR_RI li r3,0 /* 0 return value, no EMULATE_STACK_STORE */ bne+ .Lfast_kernel_interrupt_return_srr - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl unrecoverable_exception b . /* should not get here */ #else @@ -406,7 +406,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()) beq interrupt_return_\srr\()_kernel interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl interrupt_exit_user_prepare cmpdi r3,0 bne- .Lrestore_nvgprs_\srr @@ -503,7 +503,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) LOAD_PACA_TOC() - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) bl interrupt_exit_user_restart @@ -518,7 +518,7 @@ RESTART_TABLE(.Linterrupt_return_\srr\()_user_rst_start, .Linterrupt_return_\srr .balign IFETCH_ALIGN_BYTES interrupt_return_\srr\()_kernel: _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS bl interrupt_exit_kernel_prepare std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ @@ -684,7 +684,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel_restart) GET_PACA(r13) ld r1,PACA_EXIT_SAVE_R1(r13) LOAD_PACA_TOC() - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS li r11,IRQS_ALL_DISABLED stb r11,PACAIRQSOFTMASK(r13) bl interrupt_exit_kernel_restart diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 1a1e9995dae3..ebe4d1645ca1 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -191,7 +191,7 @@ static int kgdb_break_match(struct pt_regs *regs) void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + - STACK_FRAME_OVERHEAD); + STACK_INT_FRAME_REGS); unsigned long *ptr = gdb_regs; int reg; diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index cd4e7bc32609..35932f45fb4e 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -85,7 +85,7 @@ optprobe_template_op_address: TEMPLATE_FOR_IMM_LOAD_INSNS /* 2. pt_regs pointer in r4 */ - addi r4,r1,STACK_FRAME_OVERHEAD + addi r4,r1,STACK_INT_FRAME_REGS .global optprobe_template_call_handler optprobe_template_call_handler: @@ -96,7 +96,7 @@ optprobe_template_call_handler: * Parameters for instruction emulation: * 1. Pass SP in register r3. */ - addi r3,r1,STACK_FRAME_OVERHEAD + addi r3,r1,STACK_INT_FRAME_REGS .global optprobe_template_insn optprobe_template_insn: diff --git a/arch/powerpc/kernel/ppc_save_regs.S b/arch/powerpc/kernel/ppc_save_regs.S index 6e86f3bf4673..49813f982468 100644 --- a/arch/powerpc/kernel/ppc_save_regs.S +++ b/arch/powerpc/kernel/ppc_save_regs.S @@ -22,7 +22,7 @@ */ _GLOBAL(ppc_save_regs) /* This allows stack frame accessor macros and offsets to be used */ - subi r3,r3,STACK_FRAME_OVERHEAD + subi r3,r3,STACK_INT_FRAME_REGS PPC_STL r0,GPR0(r3) #ifdef CONFIG_PPC32 stmw r2,GPR2(r3) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f93703ea4a12..d7a581997d92 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2260,12 +2260,12 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, /* * See if this is an exception frame. - * We look for the "regshere" marker in the current frame. + * We look for the "regs" marker in the current frame. */ if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS) && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) - (sp + STACK_FRAME_OVERHEAD); + (sp + STACK_INT_FRAME_REGS); lr = regs->link; printk("%s--- interrupt: %lx at %pS\n", diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 5a0f023a26e9..9feab5e0485b 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -117,7 +117,7 @@ _GLOBAL(tm_reclaim) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) - /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ + /* We've a struct pt_regs at [r1+STACK_INT_FRAME_REGS]. */ std r3, STK_PARAM(R3)(r1) SAVE_NVGPRS(r1) @@ -222,7 +222,7 @@ _GLOBAL(tm_reclaim) * Make r7 look like an exception frame so that we can use the neat * GPRx(n) macros. r7 is NOT a pt_regs ptr! */ - subi r7, r7, STACK_FRAME_OVERHEAD + subi r7, r7, STACK_INT_FRAME_REGS /* Sync the userland GPRs 2-12, 14-31 to thread->regs: */ SAVE_GPR(0, r7) /* user r0 */ @@ -359,7 +359,7 @@ _GLOBAL(__tm_recheckpoint) stdu r1, -TM_FRAME_SIZE(r1) /* - * We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. + * We've a struct pt_regs at [r1+STACK_INT_FRAME_REGS]. * This is used for backing up the NVGPRs: */ SAVE_NVGPRS(r1) @@ -379,7 +379,7 @@ _GLOBAL(__tm_recheckpoint) * Make r7 look like an exception frame so that we can use the neat * GPRx(n) macros. r7 is now NOT a pt_regs ptr! */ - subi r7, r7, STACK_FRAME_OVERHEAD + subi r7, r7, STACK_INT_FRAME_REGS /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 diff --git a/arch/powerpc/kernel/trace/ftrace_mprofile.S b/arch/powerpc/kernel/trace/ftrace_mprofile.S index d031093bc436..ffb1db386849 100644 --- a/arch/powerpc/kernel/trace/ftrace_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_mprofile.S @@ -110,7 +110,7 @@ .endif /* Load &pt_regs in r6 for call below */ - addi r6, r1, STACK_FRAME_OVERHEAD + addi r6, r1, STACK_INT_FRAME_REGS .endm .macro ftrace_regs_exit allregs diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S b/arch/powerpc/lib/test_emulate_step_exec_instr.S index 5473f9d03df3..e2b646a4f7fa 100644 --- a/arch/powerpc/lib/test_emulate_step_exec_instr.S +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S @@ -16,7 +16,7 @@ _GLOBAL(exec_instr) /* * Stack frame layout (INT_FRAME_SIZE bytes) - * In-memory pt_regs (SP + STACK_FRAME_OVERHEAD) + * In-memory pt_regs (SP + STACK_INT_FRAME_REGS) * Scratch space (SP + 8) * Back chain (SP + 0) */ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 8718289c051d..9e254aed1f61 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -67,7 +67,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re * This looks like an interrupt frame for an * interrupt that occurred in the kernel */ - regs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); + regs = (struct pt_regs *)(sp + STACK_INT_FRAME_REGS); next_ip = regs->nip; lr = regs->link; level = 0; diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e34d7809f6c9..a14eb4d815c2 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1782,14 +1782,13 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr, xmon_print_symbol(ip, " ", "\n"); } - /* Look for "regshere" marker to see if this is + /* Look for "regs" marker to see if this is an exception frame. */ if (mread(sp + MARKER_OFFSET, &marker, sizeof(unsigned long)) && marker == STACK_FRAME_REGS_MARKER) { - if (mread(sp + STACK_FRAME_OVERHEAD, ®s, sizeof(regs)) - != sizeof(regs)) { + if (mread(sp + STACK_INT_FRAME_REGS, ®s, sizeof(regs)) != sizeof(regs)) { printf("Couldn't read registers at %lx\n", - sp + STACK_FRAME_OVERHEAD); + sp + STACK_INT_FRAME_REGS); break; } printf("--- Exception: %lx %s at ", regs.trap, From d2e8ff9f1492f44c5a6d93f759eea27574d753de Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:33 +1000 Subject: [PATCH 133/181] powerpc: add a definition for the marker offset within the interrupt frame Define a constant rather than open-code the offset for the "regs" marker. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-9-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 2 ++ arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/exceptions-64e.S | 2 +- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kernel/head_32.h | 2 +- arch/powerpc/kernel/head_booke.h | 2 +- arch/powerpc/kernel/interrupt_64.S | 10 +++++----- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- 8 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index a4ae67aa9b76..8a9f4cf8c4c5 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -125,6 +125,7 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + \ STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD +#define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) #define STACK_FRAME_MARKER 12 #ifdef CONFIG_PPC64_ELF_ABI_V2 @@ -145,6 +146,7 @@ struct pt_regs #define STACK_FRAME_LR_SAVE 1 /* Location of LR in stack frame */ #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD +#define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 8) #define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 24c8d84a56c9..2f61b7d3677c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -114,7 +114,7 @@ transfer_to_syscall: addi r12,r12,STACK_FRAME_REGS_MARKER@l stw r9,_MSR(r1) li r2, INTERRUPT_SYSCALL - stw r12,8(r1) + stw r12,STACK_INT_FRAME_MARKER(r1) stw r2,_TRAP(r1) SAVE_GPR(0, r1) SAVE_GPRS(3, 8, r1) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 62033d022e0a..b9cec22df9f9 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -391,7 +391,7 @@ exc_##n##_common: \ std r10,_CCR(r1); /* store orig CR in stackframe */ \ std r9,GPR1(r1); /* store stack frame back link */ \ std r11,SOFTE(r1); /* and save it to stackframe */ \ - std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + std r12,STACK_INT_FRAME_MARKER(r1); /* mark the frame */ \ std r3,_TRAP(r1); /* set trap number */ \ std r0,RESULT(r1); /* clear regs->result */ \ SAVE_NVGPRS(r1); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 29b78536ca59..ac3b0580224e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -591,7 +591,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) li r10,0 LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) std r10,RESULT(r1) /* clear regs->result */ - std r11,STACK_FRAME_OVERHEAD-16(r1) /* mark the frame */ + std r11,STACK_INT_FRAME_MARKER(r1) /* mark the frame */ .endm /* diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 117d25330e13..f8e2911478a7 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -112,7 +112,7 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) stw r0,GPR0(r1) lis r10,STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ addi r10,r10,STACK_FRAME_REGS_MARKER@l - stw r10,8(r1) + stw r10,STACK_INT_FRAME_MARKER(r1) li r10, \trapno stw r10,_TRAP(r1) SAVE_GPRS(3, 8, r1) diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 3149ac20b18e..37d43c172676 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -84,7 +84,7 @@ END_BTB_FLUSH_SECTION stw r0,GPR0(r1) lis r10, STACK_FRAME_REGS_MARKER@ha /* exception frame marker */ addi r10, r10, STACK_FRAME_REGS_MARKER@l - stw r10, 8(r1) + stw r10, STACK_INT_FRAME_MARKER(r1) li r10, \trapno stw r10,_TRAP(r1) SAVE_GPRS(3, 8, r1) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 49d585eae7c8..321992c1c9f9 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -77,11 +77,11 @@ _ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) std r11,_TRAP(r1) std r12,_CCR(r1) std r3,ORIG_GPR3(r1) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) + std r11,STACK_INT_FRAME_MARKER(r1) /* "regs" marker */ /* Calling convention has r3 = regs, r4 = orig r0 */ addi r3,r1,STACK_INT_FRAME_REGS mr r4,r0 - LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) - std r11,-16(r3) /* "regshere" marker */ BEGIN_FTR_SECTION HMT_MEDIUM @@ -250,11 +250,11 @@ END_BTB_FLUSH_SECTION std r11,_TRAP(r1) std r12,_CCR(r1) std r3,ORIG_GPR3(r1) + LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) + std r11,STACK_INT_FRAME_MARKER(r1) /* "regs" marker */ /* Calling convention has r3 = regs, r4 = orig r0 */ addi r3,r1,STACK_INT_FRAME_REGS mr r4,r0 - LOAD_REG_IMMEDIATE(r11, STACK_FRAME_REGS_MARKER) - std r11,-16(r3) /* "regshere" marker */ #ifdef CONFIG_PPC_BOOK3S li r11,1 @@ -637,7 +637,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) * Leaving a stale STACK_FRAME_REGS_MARKER on the stack can confuse * the reliable stack unwinder later on. Clear it. */ - std r0,STACK_FRAME_OVERHEAD-16(r1) + std r0,STACK_INT_FRAME_MARKER(r1) REST_GPRS(2, 5, r1) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 37f50861dd98..a9e162a1deec 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2729,7 +2729,7 @@ kvmppc_bad_host_intr: std r6, SOFTE(r1) LOAD_PACA_TOC() LOAD_REG_IMMEDIATE(3, STACK_FRAME_REGS_MARKER) - std r3, STACK_FRAME_OVERHEAD-16(r1) + std r3, STACK_INT_FRAME_MARKER(r1) /* * XXX On POWER7 and POWER8, we just spin here since we don't From e856e336924b0ecd0b7058e65e6b3e7266ee0b95 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:34 +1000 Subject: [PATCH 134/181] powerpc: Rename STACK_FRAME_MARKER and derive it from frame offset This is a count of longs from the stack pointer to the regs marker. Rename it to make it more distinct from the other byte offsets. It can be derived from the byte offset definitions just added. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-10-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 4 ++-- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/perf/callchain.c | 2 +- arch/powerpc/xmon/xmon.c | 3 +-- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 8a9f4cf8c4c5..fdd50648df56 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -126,7 +126,6 @@ struct pt_regs STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) -#define STACK_FRAME_MARKER 12 #ifdef CONFIG_PPC64_ELF_ABI_V2 #define STACK_FRAME_MIN_SIZE 32 @@ -147,7 +146,6 @@ struct pt_regs #define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 8) -#define STACK_FRAME_MARKER 2 #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD /* Size of stack frame allocated when calling signal handler. */ @@ -155,6 +153,8 @@ struct pt_regs #endif /* __powerpc64__ */ +#define STACK_INT_FRAME_MARKER_LONGS (STACK_INT_FRAME_MARKER/sizeof(long)) + #ifndef __ASSEMBLY__ #include <asm/paca.h> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index d7a581997d92..6c0a3c664266 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2263,7 +2263,7 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, * We look for the "regs" marker in the current frame. */ if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS) - && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + && stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) (sp + STACK_INT_FRAME_REGS); diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index a2443d61728e..7efa0ec9dd77 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -136,7 +136,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum /* Mark stacktraces with exception frames as unreliable. */ if (sp <= stack_end - STACK_INT_FRAME_SIZE && - stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { return -EINVAL; } diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index 9e254aed1f61..b01497ed5173 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -62,7 +62,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re if (next_sp == sp + STACK_INT_FRAME_SIZE && validate_sp(sp, current, STACK_INT_FRAME_SIZE) && - fp[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) { + fp[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { /* * This looks like an interrupt frame for an * interrupt that occurred in the kernel diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a14eb4d815c2..0da66bc4823d 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1721,7 +1721,6 @@ static void get_function_bounds(unsigned long pc, unsigned long *startp, } #define LRSAVE_OFFSET (STACK_FRAME_LR_SAVE * sizeof(unsigned long)) -#define MARKER_OFFSET (STACK_FRAME_MARKER * sizeof(unsigned long)) static void xmon_show_stack(unsigned long sp, unsigned long lr, unsigned long pc) @@ -1784,7 +1783,7 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr, /* Look for "regs" marker to see if this is an exception frame. */ - if (mread(sp + MARKER_OFFSET, &marker, sizeof(unsigned long)) + if (mread(sp + STACK_INT_FRAME_MARKER, &marker, sizeof(unsigned long)) && marker == STACK_FRAME_REGS_MARKER) { if (mread(sp + STACK_INT_FRAME_REGS, ®s, sizeof(regs)) != sizeof(regs)) { printf("Couldn't read registers at %lx\n", From 1223e5a20f7fb3c31c91a328d1a04ed26d5e889b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:35 +1000 Subject: [PATCH 135/181] powerpc: add a define for the user interrupt frame size The user interrupt frame is a different size from the kernel frame, so give it its own name. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-11-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 6 +++--- arch/powerpc/kernel/process.c | 6 +++--- arch/powerpc/kernel/stacktrace.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index fdd50648df56..705ce26ae887 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -122,8 +122,7 @@ struct pt_regs #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ -#define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + \ - STACK_FRAME_OVERHEAD + KERNEL_REDZONE_SIZE) +#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) @@ -143,7 +142,7 @@ struct pt_regs #define KERNEL_REDZONE_SIZE 0 #define STACK_FRAME_OVERHEAD 16 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 1 /* Location of LR in stack frame */ -#define STACK_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) +#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 8) #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD @@ -153,6 +152,7 @@ struct pt_regs #endif /* __powerpc64__ */ +#define STACK_INT_FRAME_SIZE (KERNEL_REDZONE_SIZE + STACK_USER_INT_FRAME_SIZE) #define STACK_INT_FRAME_MARKER_LONGS (STACK_INT_FRAME_MARKER/sizeof(long)) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 6c0a3c664266..010a5ee746ae 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1756,15 +1756,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) klp_init_thread_info(p); /* Create initial stack frame. */ - sp -= (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD); + sp -= STACK_USER_INT_FRAME_SIZE; ((unsigned long *)sp)[0] = 0; /* Copy registers */ - childregs = (struct pt_regs *)(sp + STACK_FRAME_OVERHEAD); + childregs = (struct pt_regs *)(sp + STACK_INT_FRAME_REGS); if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->gpr[1] = sp + (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD); + childregs->gpr[1] = sp + STACK_USER_INT_FRAME_SIZE; /* function */ if (args->fn) childregs->gpr[14] = ppc_function_entry((void *)args->fn); diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 7efa0ec9dd77..453ac317a6cf 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -77,7 +77,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum /* * For user tasks, this is the SP value loaded on * kernel entry, see "PACAKSAVE(r13)" in _switch() and - * system_call_common()/EXCEPTION_PROLOG_COMMON(). + * system_call_common(). * * Likewise for non-swapper kernel threads, * this also happens to be the top of the stack @@ -88,7 +88,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum * an unreliable stack trace until it's been * _switch()'ed to for the first time. */ - stack_end -= STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + stack_end -= STACK_USER_INT_FRAME_SIZE; } else { /* * idle tasks have a custom stack layout, From 6f291a03819e4051ebc870471d26915ef2e6ba31 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:36 +1000 Subject: [PATCH 136/181] powerpc: add a define for the switch frame size and regs offset This is open-coded in process.c, ppc32 uses a different define with the same value, and the C definition is name differently which makes it an extra indirection to grep for. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-12-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 6 ++++-- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_32.S | 6 +++--- arch/powerpc/kernel/process.c | 12 ++++++++---- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 705ce26ae887..412ef0749775 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -97,8 +97,6 @@ struct pt_regs #endif -#define STACK_FRAME_WITH_PT_REGS (STACK_FRAME_OVERHEAD + sizeof(struct pt_regs)) - // Always displays as "REGS" in memory dumps #ifdef CONFIG_CPU_BIG_ENDIAN #define STACK_FRAME_REGS_MARKER ASM_CONST(0x52454753) @@ -125,6 +123,8 @@ struct pt_regs #define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) +#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) +#define STACK_SWITCH_FRAME_REGS STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC64_ELF_ABI_V2 #define STACK_FRAME_MIN_SIZE 32 @@ -146,6 +146,8 @@ struct pt_regs #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 8) #define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD +#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) +#define STACK_SWITCH_FRAME_REGS STACK_FRAME_OVERHEAD /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 68905c9f7c21..d24a59a98c0c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -260,7 +260,7 @@ int main(void) /* Interrupt register frame */ DEFINE(INT_FRAME_SIZE, STACK_INT_FRAME_SIZE); - DEFINE(SWITCH_FRAME_SIZE, STACK_FRAME_WITH_PT_REGS); + DEFINE(SWITCH_FRAME_SIZE, STACK_SWITCH_FRAME_SIZE); STACK_PT_REGS_OFFSET(GPR0, gpr[0]); STACK_PT_REGS_OFFSET(GPR1, gpr[1]); STACK_PT_REGS_OFFSET(GPR2, gpr[2]); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 2f61b7d3677c..6e99ec10be89 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -215,9 +215,9 @@ ret_from_kernel_thread: * in arch/ppc/kernel/process.c */ _GLOBAL(_switch) - stwu r1,-INT_FRAME_SIZE(r1) + stwu r1,-SWITCH_FRAME_SIZE(r1) mflr r0 - stw r0,INT_FRAME_SIZE+4(r1) + stw r0,SWITCH_FRAME_SIZE+4(r1) /* r3-r12 are caller saved -- Cort */ SAVE_NVGPRS(r1) stw r0,_NIP(r1) /* Return to switch caller */ @@ -248,7 +248,7 @@ _GLOBAL(_switch) lwz r4,_NIP(r1) /* Return to _switch caller in new task */ mtlr r4 - addi r1,r1,INT_FRAME_SIZE + addi r1,r1,SWITCH_FRAME_SIZE blr .globl fast_exception_return diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 010a5ee746ae..0cb5296c6c41 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1808,10 +1808,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ - sp -= sizeof(struct pt_regs); - kregs = (struct pt_regs *) sp; - sp -= STACK_FRAME_OVERHEAD; + sp -= STACK_SWITCH_FRAME_SIZE; + kregs = (struct pt_regs *)(sp + STACK_SWITCH_FRAME_REGS); p->thread.ksp = sp; + #ifdef CONFIG_HAVE_HW_BREAKPOINT for (i = 0; i < nr_wp_slots(); i++) p->thread.ptrace_bps[i] = NULL; @@ -2261,8 +2261,12 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, /* * See if this is an exception frame. * We look for the "regs" marker in the current frame. + * + * STACK_SWITCH_FRAME_SIZE being the smallest frame that + * could hold a pt_regs, if that does not fit then it can't + * have regs. */ - if (validate_sp(sp, tsk, STACK_FRAME_WITH_PT_REGS) + if (validate_sp(sp, tsk, STACK_SWITCH_FRAME_SIZE) && stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) (sp + STACK_INT_FRAME_REGS); From 6895dfc0474170c492191c126fcfc420f7771a09 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:37 +1000 Subject: [PATCH 137/181] powerpc: copy_thread fill in interrupt frame marker and back chain Backtraces will not recognise the fork system call interrupt without the regs marker. And regular interrupt entry from userspace creates the back chain to the user stack, so do this for the initial fork frame too, to be consistent. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-13-npiggin@gmail.com --- arch/powerpc/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 0cb5296c6c41..6b1d80bd370e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1757,12 +1757,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Create initial stack frame. */ sp -= STACK_USER_INT_FRAME_SIZE; - ((unsigned long *)sp)[0] = 0; + *(unsigned long *)(sp + STACK_INT_FRAME_MARKER) = STACK_FRAME_REGS_MARKER; /* Copy registers */ childregs = (struct pt_regs *)(sp + STACK_INT_FRAME_REGS); if (unlikely(args->fn)) { /* kernel thread */ + ((unsigned long *)sp)[0] = 0; memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + STACK_USER_INT_FRAME_SIZE; /* function */ @@ -1782,6 +1783,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) *childregs = *regs; if (usp) childregs->gpr[1] = usp; + ((unsigned long *)sp)[0] = childregs->gpr[1]; p->thread.regs = childregs; /* 64s sets this in ret_from_fork */ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) From edbd0387f3249cc7e102f86d4852a9a9f3bb1305 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:38 +1000 Subject: [PATCH 138/181] powerpc: copy_thread add a back chain to the switch stack frame Stack unwinders need LR and the back chain as a minimum. The switch stack uses regs->nip for its return pointer rather than lrsave, so that was not set in the fork frame, and neither was the back chain. This change sets those fields in the stack. With this and the previous change, a stack trace in the switch or interrupt stack goes from looking like this: Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 3 PID: 90 Comm: systemd Not tainted NIP: c000000000011060 LR: c000000000010f68 CTR: 0000000000007fff [ ... regs ... ] NIP [c000000000011060] _switch+0x160/0x17c LR [c000000000010f68] _switch+0x68/0x17c Call Trace: To this: Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries CPU: 0 PID: 93 Comm: systemd Not tainted NIP: c000000000011060 LR: c000000000010f68 CTR: 0000000000007fff [ ... regs ... ] NIP [c000000000011060] _switch+0x160/0x17c LR [c000000000010f68] _switch+0x68/0x17c Call Trace: [c000000005a93e10] [c00000000000cdbc] ret_from_fork_scv+0x0/0x54 --- interrupt: 3000 at 0x7fffa72f56d8 NIP: 00007fffa72f56d8 LR: 0000000000000000 CTR: 0000000000000000 [ ... regs ... ] NIP [00007fffa72f56d8] 0x7fffa72f56d8 LR [0000000000000000] 0x0 --- interrupt: 3000 Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-14-npiggin@gmail.com --- arch/powerpc/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 6b1d80bd370e..096b6ea52378 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1810,7 +1810,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * do some house keeping and then return from the fork or clone * system call, using the stack frame created above. */ + ((unsigned long *)sp)[STACK_FRAME_LR_SAVE] = (unsigned long)f; sp -= STACK_SWITCH_FRAME_SIZE; + ((unsigned long *)sp)[0] = sp + STACK_SWITCH_FRAME_SIZE; kregs = (struct pt_regs *)(sp + STACK_SWITCH_FRAME_REGS); p->thread.ksp = sp; From 4cefb0f6c555971b3e6544a9b15470f9d1f12089 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:39 +1000 Subject: [PATCH 139/181] powerpc: split validate_sp into two functions Most callers just want to validate an arbitrary kernel stack pointer, some need a particular size. Make the size case the exceptional one with an extra function. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-15-npiggin@gmail.com --- arch/powerpc/include/asm/processor.h | 15 ++++++++++++--- arch/powerpc/kernel/process.c | 23 ++++++++++++++--------- arch/powerpc/kernel/stacktrace.c | 2 +- arch/powerpc/perf/callchain.c | 6 +++--- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 631802999d59..e96c9b8c2a60 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -374,9 +374,18 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #endif -/* Check that a certain kernel stack pointer is valid in task_struct p */ -int validate_sp(unsigned long sp, struct task_struct *p, - unsigned long nbytes); +/* + * Check that a certain kernel stack pointer is a valid (minimum sized) + * stack frame in task_struct p. + */ +int validate_sp(unsigned long sp, struct task_struct *p); + +/* + * validate the stack frame of a particular minimum size, used for when we are + * looking at a certain object in the stack beyond the minimum. + */ +int validate_sp_size(unsigned long sp, struct task_struct *p, + unsigned long nbytes); /* * Prefetch macros. diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 096b6ea52378..9446bee8ca32 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2157,9 +2157,12 @@ static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p, return 0; } - -int validate_sp(unsigned long sp, struct task_struct *p, - unsigned long nbytes) +/* + * validate the stack frame of a particular minimum size, used for when we are + * looking at a certain object in the stack beyond the minimum. + */ +int validate_sp_size(unsigned long sp, struct task_struct *p, + unsigned long nbytes) { unsigned long stack_page = (unsigned long)task_stack_page(p); @@ -2175,7 +2178,10 @@ int validate_sp(unsigned long sp, struct task_struct *p, return valid_emergency_stack(sp, p, nbytes); } -EXPORT_SYMBOL(validate_sp); +int validate_sp(unsigned long sp, struct task_struct *p) +{ + return validate_sp_size(sp, p, STACK_FRAME_OVERHEAD); +} static unsigned long ___get_wchan(struct task_struct *p) { @@ -2183,13 +2189,12 @@ static unsigned long ___get_wchan(struct task_struct *p) int count = 0; sp = p->thread.ksp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p)) return 0; do { sp = READ_ONCE_NOCHECK(*(unsigned long *)sp); - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || - task_is_running(p)) + if (!validate_sp(sp, p) || task_is_running(p)) return 0; if (count > 0) { ip = READ_ONCE_NOCHECK(((unsigned long *)sp)[STACK_FRAME_LR_SAVE]); @@ -2243,7 +2248,7 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, lr = 0; printk("%sCall Trace:\n", loglvl); do { - if (!validate_sp(sp, tsk, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, tsk)) break; stack = (unsigned long *) sp; @@ -2270,7 +2275,7 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, * could hold a pt_regs, if that does not fit then it can't * have regs. */ - if (validate_sp(sp, tsk, STACK_SWITCH_FRAME_SIZE) + if (validate_sp_size(sp, tsk, STACK_SWITCH_FRAME_SIZE) && stack[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { struct pt_regs *regs = (struct pt_regs *) (sp + STACK_INT_FRAME_REGS); diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 453ac317a6cf..1dbbf30f265e 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -43,7 +43,7 @@ void __no_sanitize_address arch_stack_walk(stack_trace_consume_fn consume_entry, unsigned long *stack = (unsigned long *) sp; unsigned long newsp, ip; - if (!validate_sp(sp, task, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, task)) return; newsp = stack[0]; diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index b01497ed5173..6b4434dd0ff3 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -27,7 +27,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) { if (sp & 0xf) return 0; /* must be 16-byte aligned */ - if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, current)) return 0; if (sp >= prev_sp + STACK_FRAME_MIN_SIZE) return 1; @@ -53,7 +53,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re sp = regs->gpr[1]; perf_callchain_store(entry, perf_instruction_pointer(regs)); - if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, current)) return; for (;;) { @@ -61,7 +61,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re next_sp = fp[0]; if (next_sp == sp + STACK_INT_FRAME_SIZE && - validate_sp(sp, current, STACK_INT_FRAME_SIZE) && + validate_sp_size(sp, current, STACK_INT_FRAME_SIZE) && fp[STACK_INT_FRAME_MARKER_LONGS] == STACK_FRAME_REGS_MARKER) { /* * This looks like an interrupt frame for an From 90f1b43196c5e79f6c986a359011a19857984c27 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:40 +1000 Subject: [PATCH 140/181] powerpc: allow minimum sized kernel stack frames This affects only 64-bit ELFv2 kernels, and reduces the minimum asm-created stack frame size from 112 to 32 byte on those kernels. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-16-npiggin@gmail.com --- arch/powerpc/kernel/head_40x.S | 2 +- arch/powerpc/kernel/head_44x.S | 6 +++--- arch/powerpc/kernel/head_64.S | 6 +++--- arch/powerpc/kernel/head_85xx.S | 4 ++-- arch/powerpc/kernel/head_8xx.S | 2 +- arch/powerpc/kernel/head_book3s_32.S | 4 ++-- arch/powerpc/kernel/irq.c | 4 ++-- arch/powerpc/kernel/misc_32.S | 2 +- arch/powerpc/kernel/misc_64.S | 4 ++-- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- 12 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 088f500896c7..918547b93b5e 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -602,7 +602,7 @@ start_here: lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) bl early_init /* We have to do this with MMU on */ diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index f15cb9fdb692..63a85c16fef4 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -109,7 +109,7 @@ _GLOBAL(_start); lis r1,init_thread_union@h ori r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) bl early_init @@ -1012,7 +1012,7 @@ _GLOBAL(start_secondary_47x) */ lis r1,temp_boot_stack@h ori r1,r1,temp_boot_stack@l - addi r1,r1,1024-STACK_FRAME_OVERHEAD + addi r1,r1,1024-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) bl mmu_init_secondary @@ -1025,7 +1025,7 @@ _GLOBAL(start_secondary_47x) lwz r1,TASK_STACK(r2) /* Current stack pointer */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index dedcc6fe2263..b513d13bf79e 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -424,7 +424,7 @@ generic_secondary_common_init: /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD + subi r1,r1,STACK_FRAME_MIN_SIZE /* See if we need to call a cpu state restore handler */ LOAD_REG_ADDR(r23, cur_cpu_spec) @@ -780,7 +780,7 @@ _GLOBAL(pmac_secondary_start) /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) - subi r1,r1,STACK_FRAME_OVERHEAD + subi r1,r1,STACK_FRAME_MIN_SIZE b __secondary_start @@ -958,7 +958,7 @@ start_here_multiplatform: LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) add r1,r3,r1 li r0,0 - stdu r0,-STACK_FRAME_OVERHEAD(r1) + stdu r0,-STACK_FRAME_MIN_SIZE(r1) /* * Do very early kernel initializations, including initial hash table diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 24f39abf81df..d9bd377dec91 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -229,7 +229,7 @@ set_ivor: lis r1,init_thread_union@h ori r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) #ifdef CONFIG_SMP stw r24, TASK_CPU(r2) @@ -1044,7 +1044,7 @@ __secondary_start: lwz r1,TASK_STACK(r2) /* stack */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 stw r0,0(r1) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 0b05f2be66b9..cf546d0e5c40 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -537,7 +537,7 @@ start_here: ori r0, r0, STACK_END_MAGIC@l stw r0, 0(r1) li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) lis r6, swapper_pg_dir@ha tophys(r6,r6) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 519b60695167..40854d092dd3 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -840,7 +840,7 @@ __secondary_start: lwz r1,TASK_STACK(r1) /* stack */ - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r0,0 tophys(r3,r1) stw r0,0(r3) @@ -966,7 +966,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_HPTE_TABLE) lis r1,init_thread_union@ha addi r1,r1,init_thread_union@l li r0,0 - stwu r0,THREAD_SIZE-STACK_FRAME_OVERHEAD(r1) + stwu r0,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r1) /* * Do early platform-specific initialization, * and set up the MMU. diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 9ede61a5a469..c5b9ce887483 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -210,7 +210,7 @@ static __always_inline void call_do_softirq(const void *sp) PPC_LL " %%r1, 0(%%r1) ;" : // Outputs : // Inputs - [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_MIN_SIZE), [callee] "i" (__do_softirq) : // Clobbers "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", @@ -264,7 +264,7 @@ static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) : // Outputs "+r" (r3) : // Inputs - [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_OVERHEAD), + [sp] "b" (sp), [offset] "i" (THREAD_SIZE - STACK_FRAME_MIN_SIZE), [callee] "i" (__do_irq) : // Clobbers "lr", "xer", "ctr", "memory", "cr0", "cr1", "cr5", "cr6", diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index e5127b19fec2..daf8f87d2372 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -382,7 +382,7 @@ EXPORT_SYMBOL(__bswapdi2) _GLOBAL(start_secondary_resume) /* Reset stack */ rlwinm r1, r1, 0, 0, 31 - THREAD_SHIFT - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD + addi r1,r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE li r3,0 stw r3,0(r1) /* Zero the stack frame pointer */ bl start_secondary diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 36184cada00b..4bb6dd30c556 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -384,7 +384,7 @@ _GLOBAL(kexec_sequence) std r0,16(r1) /* switch stacks to newstack -- &kexec_stack.stack */ - stdu r1,THREAD_SIZE-STACK_FRAME_OVERHEAD(r3) + stdu r1,THREAD_SIZE-STACK_FRAME_MIN_SIZE(r3) mr r1,r3 li r0,0 @@ -401,7 +401,7 @@ _GLOBAL(kexec_sequence) std r26,-48(r1) std r25,-56(r1) - stdu r1,-STACK_FRAME_OVERHEAD-64(r1) + stdu r1,-STACK_FRAME_MIN_SIZE-64(r1) /* save args into preserved regs */ mr r31,r3 /* newstack (both) */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9446bee8ca32..edb46d0806ef 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2180,7 +2180,7 @@ int validate_sp_size(unsigned long sp, struct task_struct *p, int validate_sp(unsigned long sp, struct task_struct *p) { - return validate_sp_size(sp, p, STACK_FRAME_OVERHEAD); + return validate_sp_size(sp, p, STACK_FRAME_MIN_SIZE); } static unsigned long ___get_wchan(struct task_struct *p) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 0da6e59161cd..6b90f10a6c81 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1249,7 +1249,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + - THREAD_SIZE - STACK_FRAME_OVERHEAD; + THREAD_SIZE - STACK_FRAME_MIN_SIZE; #endif task_thread_info(idle)->cpu = cpu; secondary_current = current_set[cpu] = idle; diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 1dbbf30f265e..5de8597eaab8 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -94,7 +94,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum * idle tasks have a custom stack layout, * c.f. cpu_idle_thread_init(). */ - stack_end -= STACK_FRAME_OVERHEAD; + stack_end -= STACK_FRAME_MIN_SIZE; } if (task == current) From cd52414d5a6ccea6ce956ef05161fe824522a107 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:41 +1000 Subject: [PATCH 141/181] powerpc/64: ELFv2 use minimal stack frames in int and switch frame sizes Adjust the ELFv2 interrupt and switch frames to the minimum C ABI size, plus pt_regs, plus 16 bytes for the aligned regs marker for the int frame (and the switch frame needs to match that because it uses the same regs offset as the int frame). This saves 80 bytes of kernel stack per interrupt. It's the principle of getting our accounting right that's more important than the practical saving. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-17-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 412ef0749775..4ab606f390bc 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -120,16 +120,26 @@ struct pt_regs #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ + +#ifdef CONFIG_PPC64_ELF_ABI_V2 +#define STACK_FRAME_MIN_SIZE 32 +#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE + 16) +#define STACK_INT_FRAME_REGS (STACK_FRAME_MIN_SIZE + 16) +#define STACK_INT_FRAME_MARKER STACK_FRAME_MIN_SIZE +#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE + 16) +#define STACK_SWITCH_FRAME_REGS (STACK_FRAME_MIN_SIZE + 16) +#else +/* + * The ELFv1 ABI specifies 48 bytes plus a minimum 64 byte parameter save + * area. This parameter area is not used by calls to C from interrupt entry, + * so the second from last one of those is used for the frame marker. + */ +#define STACK_FRAME_MIN_SIZE 112 #define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD #define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) #define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) #define STACK_SWITCH_FRAME_REGS STACK_FRAME_OVERHEAD - -#ifdef CONFIG_PPC64_ELF_ABI_V2 -#define STACK_FRAME_MIN_SIZE 32 -#else -#define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD #endif /* Size of dummy stack frame allocated when calling signal handler. */ From dfecd06bc5524517ed7737c30eaaf747338b280a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Sun, 27 Nov 2022 22:49:42 +1000 Subject: [PATCH 142/181] powerpc: remove STACK_FRAME_OVERHEAD This is equal to STACK_FRAME_MIN_SIZE on 32-bit and 64-bit ELFv1, and no longer used in 64-bit ELFv2, so replace STACK_FRAME_OVERHEAD occurrences with STACK_FRAME_MIN_SIZE. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221127124942.1665522-18-npiggin@gmail.com --- arch/powerpc/include/asm/ptrace.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 4ab606f390bc..0eb90a013346 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -118,7 +118,6 @@ struct pt_regs #define USER_REDZONE_SIZE 512 #define KERNEL_REDZONE_SIZE 288 -#define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ #ifdef CONFIG_PPC64_ELF_ABI_V2 @@ -135,11 +134,11 @@ struct pt_regs * so the second from last one of those is used for the frame marker. */ #define STACK_FRAME_MIN_SIZE 112 -#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) -#define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD -#define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 16) -#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) -#define STACK_SWITCH_FRAME_REGS STACK_FRAME_OVERHEAD +#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE) +#define STACK_INT_FRAME_REGS STACK_FRAME_MIN_SIZE +#define STACK_INT_FRAME_MARKER (STACK_FRAME_MIN_SIZE - 16) +#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE) +#define STACK_SWITCH_FRAME_REGS STACK_FRAME_MIN_SIZE #endif /* Size of dummy stack frame allocated when calling signal handler. */ @@ -150,14 +149,13 @@ struct pt_regs #define USER_REDZONE_SIZE 0 #define KERNEL_REDZONE_SIZE 0 -#define STACK_FRAME_OVERHEAD 16 /* size of minimum stack frame */ +#define STACK_FRAME_MIN_SIZE 16 #define STACK_FRAME_LR_SAVE 1 /* Location of LR in stack frame */ -#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) -#define STACK_INT_FRAME_REGS STACK_FRAME_OVERHEAD -#define STACK_INT_FRAME_MARKER (STACK_FRAME_OVERHEAD - 8) -#define STACK_FRAME_MIN_SIZE STACK_FRAME_OVERHEAD -#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_OVERHEAD) -#define STACK_SWITCH_FRAME_REGS STACK_FRAME_OVERHEAD +#define STACK_USER_INT_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE) +#define STACK_INT_FRAME_REGS STACK_FRAME_MIN_SIZE +#define STACK_INT_FRAME_MARKER (STACK_FRAME_MIN_SIZE - 8) +#define STACK_SWITCH_FRAME_SIZE (sizeof(struct pt_regs) + STACK_FRAME_MIN_SIZE) +#define STACK_SWITCH_FRAME_REGS STACK_FRAME_MIN_SIZE /* Size of stack frame allocated when calling signal handler. */ #define __SIGNAL_FRAMESIZE 64 From 6b34a099faa123488b13caf704562f4dbe483fc4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Mon, 24 Oct 2022 13:01:50 +1000 Subject: [PATCH 143/181] powerpc/64s/hash: add stress_hpt kernel boot option to increase hash faults This option increases the number of hash misses by limiting the number of kernel HPT entries, by keeping a per-CPU record of the last kernel HPTEs installed, and removing that from the hash table on the next hash insertion. A timer round-robins CPUs removing remaining kernel HPTEs and clearing the TLB (in the case of bare metal) to increase and slightly randomise kernel fault activity. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> [mpe: Add comment about NR_CPUS usage, fixup whitespace] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221024030150.852517-1-npiggin@gmail.com --- .../admin-guide/kernel-parameters.txt | 5 + arch/powerpc/mm/book3s64/hash_4k.c | 5 + arch/powerpc/mm/book3s64/hash_64k.c | 10 ++ arch/powerpc/mm/book3s64/hash_utils.c | 130 +++++++++++++++++- arch/powerpc/mm/book3s64/internal.h | 11 ++ 5 files changed, 160 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a465d5242774..9f3d256529d0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1042,6 +1042,11 @@ them frequently to increase the rate of SLB faults on kernel addresses. + stress_hpt [PPC] + Limits the number of kernel HPT entries in the hash + page table to increase the rate of hash page table + faults on kernel addresses. + disable= [IPV6] See Documentation/networking/ipv6.rst. diff --git a/arch/powerpc/mm/book3s64/hash_4k.c b/arch/powerpc/mm/book3s64/hash_4k.c index 7de1a8a0c62a..02acbfd05b46 100644 --- a/arch/powerpc/mm/book3s64/hash_4k.c +++ b/arch/powerpc/mm/book3s64/hash_4k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize, int subpg_prot) @@ -118,6 +120,9 @@ repeat: } new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; diff --git a/arch/powerpc/mm/book3s64/hash_64k.c b/arch/powerpc/mm/book3s64/hash_64k.c index 998c6817ed47..954af420f358 100644 --- a/arch/powerpc/mm/book3s64/hash_64k.c +++ b/arch/powerpc/mm/book3s64/hash_64k.c @@ -16,6 +16,8 @@ #include <asm/machdep.h> #include <asm/mmu.h> +#include "internal.h" + /* * Return true, if the entry has a slot value which * the software considers as invalid. @@ -216,6 +218,9 @@ repeat: new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE); new_pte |= H_PAGE_HASHPTE; + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -327,7 +332,12 @@ repeat: new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE); + + if (stress_hpt()) + hpt_do_stress(ea, hpte_group); } + *ptep = __pte(new_pte & ~H_PAGE_BUSY); + return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 6df4c6d38b66..80a148c57de8 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -471,7 +471,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long vend, return ret; } -static bool disable_1tb_segments = false; +static bool disable_1tb_segments __ro_after_init; static int __init parse_disable_1tb_segments(char *p) { @@ -480,6 +480,40 @@ static int __init parse_disable_1tb_segments(char *p) } early_param("disable_1tb_segments", parse_disable_1tb_segments); +bool stress_hpt_enabled __initdata; + +static int __init parse_stress_hpt(char *p) +{ + stress_hpt_enabled = true; + return 0; +} +early_param("stress_hpt", parse_stress_hpt); + +__ro_after_init DEFINE_STATIC_KEY_FALSE(stress_hpt_key); + +/* + * per-CPU array allocated if we enable stress_hpt. + */ +#define STRESS_MAX_GROUPS 16 +struct stress_hpt_struct { + unsigned long last_group[STRESS_MAX_GROUPS]; +}; + +static inline int stress_nr_groups(void) +{ + /* + * LPAR H_REMOVE flushes TLB, so need some number > 1 of entries + * to allow practical forward progress. Bare metal returns 1, which + * seems to help uncover more bugs. + */ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return STRESS_MAX_GROUPS; + else + return 1; +} + +static struct stress_hpt_struct *stress_hpt_struct; + static int __init htab_dt_scan_seg_sizes(unsigned long node, const char *uname, int depth, void *data) @@ -976,6 +1010,23 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, pr_info("Partition table %p\n", partition_tb); } +void hpt_clear_stress(void); +static struct timer_list stress_hpt_timer; +void stress_hpt_timer_fn(struct timer_list *timer) +{ + int next_cpu; + + hpt_clear_stress(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) + tlbiel_all(); + + next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(cpu_online_mask); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer_on(&stress_hpt_timer, next_cpu); +} + static void __init htab_initialize(void) { unsigned long table; @@ -995,6 +1046,20 @@ static void __init htab_initialize(void) if (stress_slb_enabled) static_branch_enable(&stress_slb_key); + if (stress_hpt_enabled) { + unsigned long tmp; + static_branch_enable(&stress_hpt_key); + // Too early to use nr_cpu_ids, so use NR_CPUS + tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, + 0, 0, MEMBLOCK_ALLOC_ANYWHERE); + memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); + stress_hpt_struct = __va(tmp); + + timer_setup(&stress_hpt_timer, stress_hpt_timer_fn, 0); + stress_hpt_timer.expires = jiffies + msecs_to_jiffies(10); + add_timer(&stress_hpt_timer); + } + /* * Calculate the required size of the htab. We want the number of * PTEGs to equal one half the number of real pages. @@ -1980,6 +2045,69 @@ repeat: return slot; } +void hpt_clear_stress(void) +{ + int cpu = raw_smp_processor_id(); + int g; + + for (g = 0; g < stress_nr_groups(); g++) { + unsigned long last_group; + last_group = stress_hpt_struct[cpu].last_group[g]; + + if (last_group != -1UL) { + int i; + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[g] = -1; + } + } +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group) +{ + unsigned long last_group; + int cpu = raw_smp_processor_id(); + + last_group = stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1]; + if (hpte_group == last_group) + return; + + if (last_group != -1UL) { + int i; + /* + * Concurrent CPUs might be inserting into this group, so + * give up after a number of iterations, to prevent a live + * lock. + */ + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (mmu_hash_ops.hpte_remove(last_group) == -1) + break; + } + stress_hpt_struct[cpu].last_group[stress_nr_groups() - 1] = -1; + } + + if (ea >= PAGE_OFFSET) { + /* + * We would really like to prefetch to get the TLB loaded, then + * remove the PTE before returning from fault interrupt, to + * increase the hash fault rate. + * + * Unfortunately QEMU TCG does not model the TLB in a way that + * makes this possible, and systemsim (mambo) emulator does not + * bring in TLBs with prefetches (although loads/stores do + * work for non-CI PTEs). + * + * So remember this PTE and clear it on the next hash fault. + */ + memmove(&stress_hpt_struct[cpu].last_group[1], + &stress_hpt_struct[cpu].last_group[0], + (stress_nr_groups() - 1) * sizeof(unsigned long)); + stress_hpt_struct[cpu].last_group[0] = hpte_group; + } +} + #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index 5045048ce244..a57a25f06a21 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -13,6 +13,17 @@ static inline bool stress_slb(void) return static_branch_unlikely(&stress_slb_key); } +extern bool stress_hpt_enabled; + +DECLARE_STATIC_KEY_FALSE(stress_hpt_key); + +static inline bool stress_hpt(void) +{ + return static_branch_unlikely(&stress_hpt_key); +} + +void hpt_do_stress(unsigned long ea, unsigned long hpte_group); + void slb_setup_new_exec(void); void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); From 5921eb36d2a1b276b16a24e529788550e6a65449 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang <yangtiezhu@loongson.cn> Date: Thu, 1 Dec 2022 10:49:57 +0800 Subject: [PATCH 144/181] selftests: powerpc: Use "grep -E" instead of "egrep" The latest version of grep claims the egrep is now obsolete so the build now contains warnings that look like: egrep: warning: egrep is obsolescent; using grep -E fix this using "grep -E" instead. sed -i "s/egrep/grep -E/g" `grep egrep -rwl tools/testing/selftests/powerpc` Here are the steps to install the latest grep: wget http://ftp.gnu.org/gnu/grep/grep-3.8.tar.gz tar xf grep-3.8.tar.gz cd grep-3.8 && ./configure && make sudo make install export PATH=/usr/local/bin:$PATH Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/1669862997-31335-1-git-send-email-yangtiezhu@loongson.cn --- tools/testing/selftests/powerpc/scripts/hmi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/scripts/hmi.sh b/tools/testing/selftests/powerpc/scripts/hmi.sh index dcdb392e8427..bcc7b6b65009 100755 --- a/tools/testing/selftests/powerpc/scripts/hmi.sh +++ b/tools/testing/selftests/powerpc/scripts/hmi.sh @@ -36,7 +36,7 @@ trap "ppc64_cpu --smt-snooze-delay=100" 0 1 # for each chip+core combination # todo - less fragile parsing -egrep -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog | +grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog | while read chipcore; do chip=$(echo "$chipcore"|awk '{print $3}') core=$(echo "$chipcore"|awk '{print $5}') From aecfd680099ba518c34dff2941017c5aa97def52 Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Mon, 28 Nov 2022 15:19:42 +1100 Subject: [PATCH 145/181] selftests/powerpc: Use mfspr/mtspr macros No need to write inline asm for mtspr/mfspr, we have macros for this in reg.h Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041948.58339-2-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dscr/dscr.h | 17 +++++------------ .../selftests/powerpc/ptrace/ptrace-hwbreak.c | 6 ++---- tools/testing/selftests/powerpc/ptrace/ptrace.h | 5 +---- .../selftests/powerpc/security/flush_utils.c | 3 ++- 4 files changed, 10 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h index 13e9b9e28e2c..b703714e7d98 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr.h +++ b/tools/testing/selftests/powerpc/dscr/dscr.h @@ -23,6 +23,7 @@ #include <sys/stat.h> #include <sys/wait.h> +#include "reg.h" #include "utils.h" #define THREADS 100 /* Max threads */ @@ -41,31 +42,23 @@ /* Prilvilege state DSCR access */ inline unsigned long get_dscr(void) { - unsigned long ret; - - asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_DSCR_PRIV)); - - return ret; + return mfspr(SPRN_DSCR_PRIV); } inline void set_dscr(unsigned long val) { - asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR_PRIV)); + mtspr(SPRN_DSCR_PRIV, val); } /* Problem state DSCR access */ inline unsigned long get_dscr_usr(void) { - unsigned long ret; - - asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_DSCR)); - - return ret; + return mfspr(SPRN_DSCR); } inline void set_dscr_usr(unsigned long val) { - asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR)); + mtspr(SPRN_DSCR, val); } /* Default DSCR access */ diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c index a0635a3819aa..1345e9b9af0f 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c @@ -23,6 +23,7 @@ #include <sys/syscall.h> #include <linux/limits.h> #include "ptrace.h" +#include "reg.h" #define SPRN_PVR 0x11F #define PVR_8xx 0x00500000 @@ -620,10 +621,7 @@ static int ptrace_hwbreak(void) int main(int argc, char **argv, char **envp) { - int pvr = 0; - asm __volatile__ ("mfspr %0,%1" : "=r"(pvr) : "i"(SPRN_PVR)); - if (pvr == PVR_8xx) - is_8xx = true; + is_8xx = mfspr(SPRN_PVR) == PVR_8xx; return test_harness(ptrace_hwbreak, "ptrace-hwbreak"); } diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace.h b/tools/testing/selftests/powerpc/ptrace/ptrace.h index 4e0233c0f2b3..04788e5fc504 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace.h +++ b/tools/testing/selftests/powerpc/ptrace/ptrace.h @@ -745,10 +745,7 @@ int show_tm_spr(pid_t child, struct tm_spr_regs *out) /* Analyse TEXASR after TM failure */ inline unsigned long get_tfiar(void) { - unsigned long ret; - - asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_TFIAR)); - return ret; + return mfspr(SPRN_TFIAR); } void analyse_texasr(unsigned long texasr) diff --git a/tools/testing/selftests/powerpc/security/flush_utils.c b/tools/testing/selftests/powerpc/security/flush_utils.c index 4d95965cb751..9c5c00e04f63 100644 --- a/tools/testing/selftests/powerpc/security/flush_utils.c +++ b/tools/testing/selftests/powerpc/security/flush_utils.c @@ -14,6 +14,7 @@ #include <string.h> #include <stdio.h> #include <sys/utsname.h> +#include "reg.h" #include "utils.h" #include "flush_utils.h" @@ -79,5 +80,5 @@ void set_dscr(unsigned long val) init = 1; } - asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR)); + mtspr(SPRN_DSCR, val); } From 94ba4f2c33f42dae7813dc169a177e922a39560c Mon Sep 17 00:00:00 2001 From: Benjamin Gray <bgray@linux.ibm.com> Date: Mon, 28 Nov 2022 15:19:43 +1100 Subject: [PATCH 146/181] selftests/powerpc: Add ptrace setup_core_pattern() null-terminator - malloc() does not zero the buffer, - fread() does not null-terminate it's output, - `cat /proc/sys/kernel/core_pattern | hexdump -C` shows the file is not inherently null-terminated So using string operations on the buffer is risky. Explicitly add a null character to the end to make it safer. Signed-off-by: Benjamin Gray <bgray@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221128041948.58339-3-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/ptrace/core-pkey.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index 1a70a96f0bfe..4e8d0ce1ff58 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -383,7 +383,7 @@ static int setup_core_pattern(char **core_pattern_, bool *changed_) goto out; } - ret = fread(core_pattern, 1, PATH_MAX, f); + ret = fread(core_pattern, 1, PATH_MAX - 1, f); fclose(f); if (!ret) { perror("Error reading core_pattern file"); @@ -391,6 +391,8 @@ static int setup_core_pattern(char **core_pattern_, bool *changed_) goto out; } + core_pattern[ret] = '\0'; + /* Check whether we can predict the name of the core file. */ if (!strcmp(core_pattern, "core") || !strcmp(core_pattern, "core.%p")) *changed_ = false; From 4d0eea415216fe3791da2f65eb41399e70c7bedf Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Sat, 29 Oct 2022 19:16:26 +0800 Subject: [PATCH 147/181] powerpc/83xx/mpc832x_rdb: call platform_device_put() in error case in of_fsl_spi_probe() If platform_device_add() is not called or failed, it can not call platform_device_del() to clean up memory, it should call platform_device_put() in error case. Fixes: 26f6cb999366 ("[POWERPC] fsl_soc: add support for fsl_spi") Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221029111626.429971-1-yangyingliang@huawei.com --- arch/powerpc/platforms/83xx/mpc832x_rdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index e12cb44e717f..caa96edf0e72 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -107,7 +107,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, goto next; unreg: - platform_device_del(pdev); + platform_device_put(pdev); err: pr_err("%pOF: registration failed\n", np); next: From 03f7c1d2a49acd30e38789cd809d3300721e9b0e Mon Sep 17 00:00:00 2001 From: Kajol Jain <kjain@linux.ibm.com> Date: Wed, 30 Nov 2022 23:15:13 +0530 Subject: [PATCH 148/181] powerpc/hv-gpci: Fix hv_gpci event list Based on getPerfCountInfo v1.018 documentation, some of the hv_gpci events were deprecated for platform firmware that supports counter_info_version 0x8 or above. Fix the hv_gpci event list by adding a new attribute group called "hv_gpci_event_attrs_v6" and a "ENABLE_EVENTS_COUNTERINFO_V6" macro to enable these events for platform firmware that supports counter_info_version 0x6 or below. And assigning the hv_gpci event list based on output counter info version of underlying plaform. Fixes: 97bf2640184f ("powerpc/perf/hv-gpci: add the remaining gpci requests") Signed-off-by: Kajol Jain <kjain@linux.ibm.com> Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com> Reviewed-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221130174513.87501-1-kjain@linux.ibm.com --- arch/powerpc/perf/hv-gpci-requests.h | 4 ++++ arch/powerpc/perf/hv-gpci.c | 35 ++++++++++++++++++++++++++-- arch/powerpc/perf/hv-gpci.h | 1 + arch/powerpc/perf/req-gen/perf.h | 20 ++++++++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/hv-gpci-requests.h b/arch/powerpc/perf/hv-gpci-requests.h index 8965b4463d43..5e86371a20c7 100644 --- a/arch/powerpc/perf/hv-gpci-requests.h +++ b/arch/powerpc/perf/hv-gpci-requests.h @@ -79,6 +79,7 @@ REQUEST(__field(0, 8, partition_id) ) #include I(REQUEST_END) +#ifdef ENABLE_EVENTS_COUNTERINFO_V6 /* * Not available for counter_info_version >= 0x8, use * run_instruction_cycles_by_partition(0x100) instead. @@ -92,6 +93,7 @@ REQUEST(__field(0, 8, partition_id) __count(0x10, 8, cycles) ) #include I(REQUEST_END) +#endif #define REQUEST_NAME system_performance_capabilities #define REQUEST_NUM 0x40 @@ -103,6 +105,7 @@ REQUEST(__field(0, 1, perf_collect_privileged) ) #include I(REQUEST_END) +#ifdef ENABLE_EVENTS_COUNTERINFO_V6 #define REQUEST_NAME processor_bus_utilization_abc_links #define REQUEST_NUM 0x50 #define REQUEST_IDX_KIND "hw_chip_id=?" @@ -194,6 +197,7 @@ REQUEST(__field(0, 4, phys_processor_idx) __count(0x28, 8, instructions_completed) ) #include I(REQUEST_END) +#endif /* Processor_core_power_mode (0x95) skipped, no counters */ /* Affinity_domain_information_by_virtual_processor (0xA0) skipped, diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index 5eb60ed5b5e8..7ff8ff3509f5 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -70,9 +70,9 @@ static const struct attribute_group format_group = { .attrs = format_attrs, }; -static const struct attribute_group event_group = { +static struct attribute_group event_group = { .name = "events", - .attrs = hv_gpci_event_attrs, + /* .attrs is set in init */ }; #define HV_CAPS_ATTR(_name, _format) \ @@ -330,6 +330,7 @@ static int hv_gpci_init(void) int r; unsigned long hret; struct hv_perf_caps caps; + struct hv_gpci_request_buffer *arg; hv_gpci_assert_offsets_correct(); @@ -353,6 +354,36 @@ static int hv_gpci_init(void) /* sampling not supported */ h_gpci_pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + arg = (void *)get_cpu_var(hv_gpci_reqb); + memset(arg, 0, HGPCI_REQ_BUFFER_SIZE); + + /* + * hcall H_GET_PERF_COUNTER_INFO populates the output + * counter_info_version value based on the system hypervisor. + * Pass the counter request 0x10 corresponds to request type + * 'Dispatch_timebase_by_processor', to get the supported + * counter_info_version. + */ + arg->params.counter_request = cpu_to_be32(0x10); + + r = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, + virt_to_phys(arg), HGPCI_REQ_BUFFER_SIZE); + if (r) { + pr_devel("hcall failed, can't get supported counter_info_version: 0x%x\n", r); + arg->params.counter_info_version_out = 0x8; + } + + /* + * Use counter_info_version_out value to assign + * required hv-gpci event list. + */ + if (arg->params.counter_info_version_out >= 0x8) + event_group.attrs = hv_gpci_event_attrs; + else + event_group.attrs = hv_gpci_event_attrs_v6; + + put_cpu_var(hv_gpci_reqb); + r = perf_pmu_register(&h_gpci_pmu, h_gpci_pmu.name, -1); if (r) return r; diff --git a/arch/powerpc/perf/hv-gpci.h b/arch/powerpc/perf/hv-gpci.h index 4d108262bed7..c72020912dea 100644 --- a/arch/powerpc/perf/hv-gpci.h +++ b/arch/powerpc/perf/hv-gpci.h @@ -26,6 +26,7 @@ enum { #define REQUEST_FILE "../hv-gpci-requests.h" #define NAME_LOWER hv_gpci #define NAME_UPPER HV_GPCI +#define ENABLE_EVENTS_COUNTERINFO_V6 #include "req-gen/perf.h" #undef REQUEST_FILE #undef NAME_LOWER diff --git a/arch/powerpc/perf/req-gen/perf.h b/arch/powerpc/perf/req-gen/perf.h index fa9bc804e67a..6b2a59fefffa 100644 --- a/arch/powerpc/perf/req-gen/perf.h +++ b/arch/powerpc/perf/req-gen/perf.h @@ -139,6 +139,26 @@ PMU_EVENT_ATTR_STRING( \ #define REQUEST_(r_name, r_value, r_idx_1, r_fields) \ r_fields +/* Generate event list for platforms with counter_info_version 0x6 or below */ +static __maybe_unused struct attribute *hv_gpci_event_attrs_v6[] = { +#include REQUEST_FILE + NULL +}; + +/* + * Based on getPerfCountInfo v1.018 documentation, some of the hv-gpci + * events were deprecated for platform firmware that supports + * counter_info_version 0x8 or above. + * Those deprecated events are still part of platform firmware that + * support counter_info_version 0x6 and below. As per the getPerfCountInfo + * v1.018 documentation there is no counter_info_version 0x7. + * Undefining macro ENABLE_EVENTS_COUNTERINFO_V6, to disable the addition of + * deprecated events in "hv_gpci_event_attrs" attribute group, for platforms + * that supports counter_info_version 0x8 or above. + */ +#undef ENABLE_EVENTS_COUNTERINFO_V6 + +/* Generate event list for platforms with counter_info_version 0x8 or above*/ static __maybe_unused struct attribute *hv_gpci_event_attrs[] = { #include REQUEST_FILE NULL From 0e23347f1e0f2b1c98f87a4088231d0d6f59b962 Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:13 +1100 Subject: [PATCH 149/181] powerpc/64: Add INTERRUPT_SANITIZE_REGISTERS Kconfig Add Kconfig option for enabling clearing of registers on arrival in an interrupt handler. This reduces the speculation influence of registers on kernel internals. The option will be consumed by 64-bit systems that feature speculation and wish to implement this mitigation. This patch only introduces the Kconfig option, no actual mitigations. The primary overhead of this mitigation lies in an increased number of registers that must be saved and restored by interrupt handlers on Book3S systems. Enable by default on Book3E systems, which prior to this patch eagerly save and restore register state, meaning that the mitigation when implemented will have minimal overhead. Acked-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-1-rmclure@linux.ibm.com --- arch/powerpc/Kconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fe2aa445b654..aec1431be06e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -534,6 +534,15 @@ config HOTPLUG_CPU Say N if you are unsure. +config INTERRUPT_SANITIZE_REGISTERS + bool "Clear gprs on interrupt arrival" + depends on PPC64 && ARCH_HAS_SYSCALL_WRAPPER + default PPC_BOOK3E_64 + help + Reduce the influence of user register state on interrupt handlers and + syscalls through clearing user state from registers before handling + the exception. + config PPC_QUEUED_SPINLOCKS bool "Queued spinlocks" if EXPERT depends on SMP From cbf892ba56677b942020d2bc7ca9b79281fa0bcc Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:14 +1100 Subject: [PATCH 150/181] powerpc/64: Add interrupt register sanitisation macros Include in asm/ppc_asm.h macros to be used in multiple successive patches to implement zeroising architected registers in interrupt handlers. Registers will be sanitised in this fashion in future patches to reduce the speculation influence of user-controlled register values. These mitigations will be configurable through the CONFIG_INTERRUPT_SANITIZE_REGISTERS Kconfig option. Included are macros for conditionally zeroising registers and restoring as required with the mitigation enabled. With the mitigation disabled, non-volatiles must be restored on demand at separate locations to those required by the mitigation. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-2-rmclure@linux.ibm.com --- arch/powerpc/include/asm/ppc_asm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 753a2757bcd4..d2f44612f4b0 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -74,6 +74,25 @@ #define SAVE_GPR(n, base) SAVE_GPRS(n, n, base) #define REST_GPR(n, base) REST_GPRS(n, n, base) +/* macros for handling user register sanitisation */ +#ifdef CONFIG_INTERRUPT_SANITIZE_REGISTERS +#define SANITIZE_SYSCALL_GPRS() ZEROIZE_GPR(0); \ + ZEROIZE_GPRS(5, 12); \ + ZEROIZE_NVGPRS() +#define SANITIZE_GPR(n) ZEROIZE_GPR(n) +#define SANITIZE_GPRS(start, end) ZEROIZE_GPRS(start, end) +#define SANITIZE_NVGPRS() ZEROIZE_NVGPRS() +#define SANITIZE_RESTORE_NVGPRS() REST_NVGPRS(r1) +#define HANDLER_RESTORE_NVGPRS() +#else +#define SANITIZE_SYSCALL_GPRS() +#define SANITIZE_GPR(n) +#define SANITIZE_GPRS(start, end) +#define SANITIZE_NVGPRS() +#define SANITIZE_RESTORE_NVGPRS() +#define HANDLER_RESTORE_NVGPRS() REST_NVGPRS(r1) +#endif /* CONFIG_INTERRUPT_SANITIZE_REGISTERS */ + #define SAVE_FPR(n, base) stfd n,8*TS_FPRWIDTH*(n)(base) #define SAVE_2FPRS(n, base) SAVE_FPR(n, base); SAVE_FPR(n+1, base) #define SAVE_4FPRS(n, base) SAVE_2FPRS(n, base); SAVE_2FPRS(n+2, base) From 75c5d6b1e194c341371639469fcb8691afa0e254 Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:15 +1100 Subject: [PATCH 151/181] powerpc/64: Sanitise common exit code for interrupts Interrupt code is shared between Book3E/S 64-bit systems for interrupt handlers. Ensure that exit code correctly restores non-volatile gprs on each system when CONFIG_INTERRUPT_SANITIZE_REGISTERS is enabled. Also introduce macros for clearing/restoring registers on interrupt entry for when this configuration option is either disabled or enabled. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-3-rmclure@linux.ibm.com --- arch/powerpc/kernel/interrupt_64.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 321992c1c9f9..dd04b0ba3959 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -408,9 +408,11 @@ interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user) addi r3,r1,STACK_INT_FRAME_REGS bl interrupt_exit_user_prepare +#ifndef CONFIG_INTERRUPT_SANITIZE_REGISTERS cmpdi r3,0 bne- .Lrestore_nvgprs_\srr .Lrestore_nvgprs_\srr\()_cont: +#endif std r1,PACA_EXIT_SAVE_R1(r13) /* save r1 for restart */ #ifdef CONFIG_PPC_BOOK3S .Linterrupt_return_\srr\()_user_rst_start: @@ -424,6 +426,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_user) stb r11,PACAIRQHAPPENED(r13) # clear out possible HARD_DIS .Lfast_user_interrupt_return_\srr\(): + SANITIZE_RESTORE_NVGPRS() #ifdef CONFIG_PPC_BOOK3S .ifc \srr,srr lbz r4,PACASRR_VALID(r13) @@ -493,9 +496,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) b . /* prevent speculative execution */ .Linterrupt_return_\srr\()_user_rst_end: +#ifndef CONFIG_INTERRUPT_SANITIZE_REGISTERS .Lrestore_nvgprs_\srr\(): REST_NVGPRS(r1) b .Lrestore_nvgprs_\srr\()_cont +#endif #ifdef CONFIG_PPC_BOOK3S interrupt_return_\srr\()_user_restart: @@ -585,6 +590,7 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) stb r11,PACAIRQHAPPENED(r13) // clear the possible HARD_DIS .Lfast_kernel_interrupt_return_\srr\(): + SANITIZE_RESTORE_NVGPRS() cmpdi cr1,r3,0 #ifdef CONFIG_PPC_BOOK3S .ifc \srr,srr From 2487fd2e6d61b5293eed8ecd25add3cc78593d38 Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:16 +1100 Subject: [PATCH 152/181] powerpc/64s: IOption for MSR stored in r12 Interrupt handlers in asm/exceptions-64s.S contain a great deal of common code produced by the GEN_COMMON macros. Currently, at the exit point of the macro, r12 will contain the contents of the MSR. A future patch will cause these macros to zeroise architected registers to avoid potential speculation influence of user data. Provide an IOption that signals that r12 must be retained, as the interrupt handler assumes it to hold the contents of the MSR. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-4-rmclure@linux.ibm.com --- arch/powerpc/kernel/exceptions-64s.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ac3b0580224e..42b7c3212f29 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -111,6 +111,7 @@ name: #define ISTACK .L_ISTACK_\name\() /* Set regular kernel stack */ #define __ISTACK(name) .L_ISTACK_ ## name #define IKUAP .L_IKUAP_\name\() /* Do KUAP lock */ +#define IMSR_R12 .L_IMSR_R12_\name\() /* Assumes MSR saved to r12 */ #define INT_DEFINE_BEGIN(n) \ .macro int_define_ ## n name @@ -176,6 +177,9 @@ do_define_int n .ifndef IKUAP IKUAP=1 .endif + .ifndef IMSR_R12 + IMSR_R12=0 + .endif .endm /* @@ -1751,6 +1755,7 @@ INT_DEFINE_BEGIN(fp_unavailable) #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 #endif + IMSR_R12=1 INT_DEFINE_END(fp_unavailable) EXC_REAL_BEGIN(fp_unavailable, 0x800, 0x100) @@ -2384,6 +2389,7 @@ INT_DEFINE_BEGIN(altivec_unavailable) #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 #endif + IMSR_R12=1 INT_DEFINE_END(altivec_unavailable) EXC_REAL_BEGIN(altivec_unavailable, 0xf20, 0x20) @@ -2433,6 +2439,7 @@ INT_DEFINE_BEGIN(vsx_unavailable) #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE IKVM_REAL=1 #endif + IMSR_R12=1 INT_DEFINE_END(vsx_unavailable) EXC_REAL_BEGIN(vsx_unavailable, 0xf40, 0x20) From 1df45d78b8a89da6544fab5267e8f5da15073d28 Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:17 +1100 Subject: [PATCH 153/181] powerpc/64s: Zeroise gprs on interrupt routine entry on Book3S Zeroise user state in gprs (assign to zero) to reduce the influence of user registers on speculation within kernel syscall handlers. Clears occur at the very beginning of the sc and scv 0 interrupt handlers, with restores occurring following the execution of the syscall handler. Zeroise GPRS r0, r2-r11, r14-r31, on entry into the kernel for all other interrupt sources. The remaining gprs are overwritten by entry macros to interrupt handlers, irrespective of whether or not a given handler consumes these register values. If an interrupt does not select the IMSR_R12 IOption, zeroise r12. Prior to this commit, r14-r31 are restored on a per-interrupt basis at exit, but now they are always restored on 64bit Book3S. Remove explicit REST_NVGPRS invocations on 64-bit Book3S. 32-bit systems do not clear user registers on interrupt, and continue to depend on the return value of interrupt_exit_user_prepare to determine whether or not to restore non-volatiles. The mmap_bench benchmark in selftests should rapidly invoke pagefaults. See ~0.8% performance regression with this mitigation, but this indicates the worst-case performance due to heavier-weight interrupt handlers. This mitigation is able to be enabled/disabled through CONFIG_INTERRUPT_SANITIZE_REGISTERS. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-5-rmclure@linux.ibm.com --- arch/powerpc/kernel/exceptions-64s.S | 27 ++++++++++++++++++--------- arch/powerpc/kernel/interrupt_64.S | 16 ++++++++++++++-- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 42b7c3212f29..429096b037d7 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -506,6 +506,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real, text) std r10,0(r1) /* make stack chain pointer */ std r0,GPR0(r1) /* save r0 in stackframe */ std r10,GPR1(r1) /* save r1 in stackframe */ + SANITIZE_GPR(0) /* Mark our [H]SRRs valid for return */ li r10,1 @@ -548,8 +549,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r9,GPR11(r1) std r10,GPR12(r1) std r11,GPR13(r1) + .if !IMSR_R12 + SANITIZE_GPRS(9, 12) + .else + SANITIZE_GPRS(9, 11) + .endif SAVE_NVGPRS(r1) + SANITIZE_NVGPRS() .if IDAR .if IISIDE @@ -581,8 +588,8 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r10,IAREA+EX_CTR(r13) std r10,_CTR(r1) - std r2,GPR2(r1) /* save r2 in stackframe */ - SAVE_GPRS(3, 8, r1) /* save r3 - r8 in stackframe */ + SAVE_GPRS(2, 8, r1) /* save r2 - r8 in stackframe */ + SANITIZE_GPRS(2, 8) mflr r9 /* Get LR, later save to stack */ LOAD_PACA_TOC() /* get kernel TOC into r2 */ std r9,_LINK(r1) @@ -700,6 +707,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) mtlr r9 ld r9,_CCR(r1) mtcr r9 + SANITIZE_RESTORE_NVGPRS() REST_GPRS(2, 13, r1) REST_GPR(0, r1) /* restore original r1. */ @@ -1445,7 +1453,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) * do_break() may have changed the NV GPRS while handling a breakpoint. * If so, we need to restore them with their updated values. */ - REST_NVGPRS(r1) + HANDLER_RESTORE_NVGPRS() b interrupt_return_srr @@ -1671,7 +1679,7 @@ EXC_COMMON_BEGIN(alignment_common) GEN_COMMON alignment addi r3,r1,STACK_INT_FRAME_REGS bl alignment_exception - REST_NVGPRS(r1) /* instruction emulation may change GPRs */ + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -1737,7 +1745,7 @@ EXC_COMMON_BEGIN(program_check_common) .Ldo_program_check: addi r3,r1,STACK_INT_FRAME_REGS bl program_check_exception - REST_NVGPRS(r1) /* instruction emulation may change GPRs */ + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -2169,7 +2177,7 @@ EXC_COMMON_BEGIN(emulation_assist_common) GEN_COMMON emulation_assist addi r3,r1,STACK_INT_FRAME_REGS bl emulation_assist_interrupt - REST_NVGPRS(r1) /* instruction emulation may change GPRs */ + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ b interrupt_return_hsrr @@ -2501,7 +2509,7 @@ EXC_COMMON_BEGIN(facility_unavailable_common) GEN_COMMON facility_unavailable addi r3,r1,STACK_INT_FRAME_REGS bl facility_unavailable_exception - REST_NVGPRS(r1) /* instruction emulation may change GPRs */ + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ b interrupt_return_srr @@ -2529,7 +2537,8 @@ EXC_COMMON_BEGIN(h_facility_unavailable_common) GEN_COMMON h_facility_unavailable addi r3,r1,STACK_INT_FRAME_REGS bl facility_unavailable_exception - REST_NVGPRS(r1) /* XXX Shouldn't be necessary in practice */ + /* XXX Shouldn't be necessary in practice */ + HANDLER_RESTORE_NVGPRS() b interrupt_return_hsrr @@ -2755,7 +2764,7 @@ EXC_COMMON_BEGIN(altivec_assist_common) addi r3,r1,STACK_INT_FRAME_REGS #ifdef CONFIG_ALTIVEC bl altivec_assist_exception - REST_NVGPRS(r1) /* instruction emulation may change GPRs */ + HANDLER_RESTORE_NVGPRS() /* instruction emulation may change GPRs */ #else bl unknown_exception #endif diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index dd04b0ba3959..fccc34489add 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -96,6 +96,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * but this is the best we can do. */ + /* + * Zero user registers to prevent influencing speculative execution + * state of kernel code. + */ + SANITIZE_SYSCALL_GPRS() bl system_call_exception .Lsyscall_vectored_\name\()_exit: @@ -124,6 +129,7 @@ BEGIN_FTR_SECTION HMT_MEDIUM_LOW END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + SANITIZE_RESTORE_NVGPRS() cmpdi r3,0 bne .Lsyscall_vectored_\name\()_restore_regs @@ -159,7 +165,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r4,_LINK(r1) ld r5,_XER(r1) - REST_NVGPRS(r1) + HANDLER_RESTORE_NVGPRS() REST_GPR(0, r1) mtcr r2 mtctr r3 @@ -275,6 +281,11 @@ END_BTB_FLUSH_SECTION wrteei 1 #endif + /* + * Zero user registers to prevent influencing speculative execution + * state of kernel code. + */ + SANITIZE_SYSCALL_GPRS() bl system_call_exception .Lsyscall_exit: @@ -315,6 +326,7 @@ BEGIN_FTR_SECTION stdcx. r0,0,r1 /* to clear the reservation */ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + SANITIZE_RESTORE_NVGPRS() cmpdi r3,0 bne .Lsyscall_restore_regs /* Zero volatile regs that may contain sensitive kernel data */ @@ -342,7 +354,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) .Lsyscall_restore_regs: ld r3,_CTR(r1) ld r4,_XER(r1) - REST_NVGPRS(r1) + HANDLER_RESTORE_NVGPRS() mtctr r3 mtspr SPRN_XER,r4 REST_GPR(0, r1) From efe1691ac814e4cf3653538b701662cbd905bddc Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:18 +1100 Subject: [PATCH 154/181] powerpc/64e: Clear gprs on interrupt routine entry on Book3E Zero GPRS r14-r31 on entry into the kernel for interrupt sources to limit influence of user-space values in potential speculation gadgets. Prior to this commit, all other GPRS are reassigned during the common prologue to interrupt handlers and so need not be zeroised explicitly. This may be done safely, without loss of register state prior to the interrupt, as the common prologue saves the initial values of non-volatiles, which are unconditionally restored in interrupt_64.S. Mitigation defaults to enabled by INTERRUPT_SANITIZE_REGISTERS. Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-6-rmclure@linux.ibm.com --- arch/powerpc/kernel/exceptions-64e.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index b9cec22df9f9..3f86091e68b3 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -358,7 +358,6 @@ ret_from_mc_except: std r14,PACA_EXMC+EX_R14(r13); \ std r15,PACA_EXMC+EX_R15(r13) - /* Core exception code for all exceptions except TLB misses. */ #define EXCEPTION_COMMON_LVL(n, scratch, excf) \ exc_##n##_common: \ @@ -394,7 +393,8 @@ exc_##n##_common: \ std r12,STACK_INT_FRAME_MARKER(r1); /* mark the frame */ \ std r3,_TRAP(r1); /* set trap number */ \ std r0,RESULT(r1); /* clear regs->result */ \ - SAVE_NVGPRS(r1); + SAVE_NVGPRS(r1); \ + SANITIZE_NVGPRS(); /* minimise speculation influence */ #define EXCEPTION_COMMON(n) \ EXCEPTION_COMMON_LVL(n, SPRN_SPRG_GEN_SCRATCH, PACA_EXGEN) From 7cd882df9485988f7d9b3fae04fde4e95a4c7a74 Mon Sep 17 00:00:00 2001 From: Rohan McLure <rmclure@linux.ibm.com> Date: Thu, 1 Dec 2022 18:10:19 +1100 Subject: [PATCH 155/181] powerpc/64: Sanitise user registers on interrupt in pseries, POWERNV Cause pseries and POWERNV platforms to default to zeroising all potentially user-defined registers when entering the kernel by means of any interrupt source, reducing user-influence of the kernel and the likelihood or producing speculation gadgets. Acked-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Rohan McLure <rmclure@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201071019.1953023-7-rmclure@linux.ibm.com --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index aec1431be06e..e21d6de797d6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -537,7 +537,7 @@ config HOTPLUG_CPU config INTERRUPT_SANITIZE_REGISTERS bool "Clear gprs on interrupt arrival" depends on PPC64 && ARCH_HAS_SYSCALL_WRAPPER - default PPC_BOOK3E_64 + default PPC_BOOK3E_64 || PPC_PSERIES || PPC_POWERNV help Reduce the influence of user register state on interrupt handlers and syscalls through clearing user state from registers before handling From ad050d2390fccb22aa3e6f65e11757ce7a5a7ca5 Mon Sep 17 00:00:00 2001 From: Michael Jeanson <mjeanson@efficios.com> Date: Thu, 1 Dec 2022 11:14:42 -0500 Subject: [PATCH 156/181] powerpc/ftrace: fix syscall tracing on PPC64_ELF_ABI_V1 In v5.7 the powerpc syscall entry/exit logic was rewritten in C, on PPC64_ELF_ABI_V1 this resulted in the symbols in the syscall table changing from their dot prefixed variant to the non-prefixed ones. Since ftrace prefixes a dot to the syscall names when matching them to build its syscall event list, this resulted in no syscall events being available. Remove the PPC64_ELF_ABI_V1 specific version of arch_syscall_match_sym_name to have the same behavior across all powerpc variants. Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic in C") Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Michael Jeanson <mjeanson@efficios.com> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221201161442.2127231-1-mjeanson@efficios.com --- arch/powerpc/include/asm/ftrace.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index ade406dc6504..441c5f08258b 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -71,17 +71,6 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, * those. */ #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME -#ifdef CONFIG_PPC64_ELF_ABI_V1 -static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) -{ - /* We need to skip past the initial dot, and the __se_sys alias */ - return !strcmp(sym + 1, name) || - (!strncmp(sym, ".__se_sys", 9) && !strcmp(sym + 6, name)) || - (!strncmp(sym, ".ppc_", 5) && !strcmp(sym + 5, name + 4)) || - (!strncmp(sym, ".ppc32_", 7) && !strcmp(sym + 7, name + 4)) || - (!strncmp(sym, ".ppc64_", 7) && !strcmp(sym + 7, name + 4)); -} -#else static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { return !strcmp(sym, name) || @@ -90,7 +79,6 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name (!strncmp(sym, "ppc32_", 6) && !strcmp(sym + 6, name + 4)) || (!strncmp(sym, "ppc64_", 6) && !strcmp(sym + 6, name + 4)); } -#endif /* CONFIG_PPC64_ELF_ABI_V1 */ #endif /* CONFIG_FTRACE_SYSCALLS */ #if defined(CONFIG_PPC64) && defined(CONFIG_FUNCTION_TRACER) From 84ecfe6f38ae4ee779ebd97ee173937fff565bf9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 2 Dec 2022 09:31:39 +0100 Subject: [PATCH 157/181] powerpc/code-patching: Remove #ifdef CONFIG_STRICT_KERNEL_RWX No need to have one implementation of patch_instruction() for CONFIG_STRICT_KERNEL_RWX and one for !CONFIG_STRICT_KERNEL_RWX. In patch_instruction(), call raw_patch_instruction() when !CONFIG_STRICT_KERNEL_RWX. In poking_init(), bail out immediately, it will be equivalent to the weak default implementation. Everything else is declared static and will be discarded by GCC when !CONFIG_STRICT_KERNEL_RWX. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/f67d2a109404d03e8fdf1ea15388c8778337a76b.1669969781.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 5b8f87db1217..a6a5047f8ba2 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -46,8 +46,6 @@ int raw_patch_instruction(u32 *addr, ppc_inst_t instr) return __patch_instruction(addr, instr, addr); } -#ifdef CONFIG_STRICT_KERNEL_RWX - struct patch_context { union { struct vm_struct *area; @@ -208,6 +206,9 @@ void __init poking_init(void) { int ret; + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + return; + if (mm_patch_enabled()) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/text_poke_mm:online", @@ -358,7 +359,8 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) * when text_poke_area is not ready, but we still need * to allow patching. We just do the plain old patching */ - if (!static_branch_likely(&poking_init_done)) + if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) || + !static_branch_likely(&poking_init_done)) return raw_patch_instruction(addr, instr); local_irq_save(flags); @@ -370,14 +372,6 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) return err; } -#else /* !CONFIG_STRICT_KERNEL_RWX */ - -static int do_patch_instruction(u32 *addr, ppc_inst_t instr) -{ - return raw_patch_instruction(addr, instr); -} - -#endif /* CONFIG_STRICT_KERNEL_RWX */ __ro_after_init DEFINE_STATIC_KEY_FALSE(init_mem_is_free); From 6076dc349b1c587c74c37027efff76f0fa4646f4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 2 Dec 2022 09:31:40 +0100 Subject: [PATCH 158/181] powerpc/feature-fixups: Refactor entry fixups patching Several fonctions have the same loop for patching instructions. Introduce function do_patch_entry_fixups() to refactor those loops. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/79eeff7b20a98f7136da5f79b1f7c436928f27f3.1669969781.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/feature-fixups.c | 84 ++++++++++++------------------- 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 31f40f544de5..93b3f8ea38aa 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -118,9 +118,33 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } #ifdef CONFIG_PPC_BOOK3S_64 +static int do_patch_entry_fixups(long *start, long *end, unsigned int *instrs, + bool do_fallback, void *fallback) +{ + int i; + + for (i = 0; start < end; start++, i++) { + unsigned int *dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + // See comment in do_entry_flush_fixups() RE order of patching + if (do_fallback) { + patch_instruction(dest, ppc_inst(instrs[0])); + patch_instruction(dest + 2, ppc_inst(instrs[2])); + patch_branch(dest + 1, (unsigned long)fallback, BRANCH_SET_LINK); + } else { + patch_instruction(dest + 1, ppc_inst(instrs[1])); + patch_instruction(dest + 2, ppc_inst(instrs[2])); + patch_instruction(dest, ppc_inst(instrs[0])); + } + } + return i; +} + static void do_stf_entry_barrier_fixups(enum stf_barrier_type types) { - unsigned int instrs[3], *dest; + unsigned int instrs[3]; long *start, *end; int i; @@ -144,23 +168,8 @@ static void do_stf_entry_barrier_fixups(enum stf_barrier_type types) instrs[i++] = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ } - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - - // See comment in do_entry_flush_fixups() RE order of patching - if (types & STF_BARRIER_FALLBACK) { - patch_instruction(dest, ppc_inst(instrs[0])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_branch(dest + 1, - (unsigned long)&stf_barrier_fallback, BRANCH_SET_LINK); - } else { - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_instruction(dest, ppc_inst(instrs[0])); - } - } + i = do_patch_entry_fixups(start, end, instrs, types & STF_BARRIER_FALLBACK, + &stf_barrier_fallback); printk(KERN_DEBUG "stf-barrier: patched %d entry locations (%s barrier)\n", i, (types == STF_BARRIER_NONE) ? "no" : @@ -325,7 +334,7 @@ void do_uaccess_flush_fixups(enum l1d_flush_type types) static int __do_entry_flush_fixups(void *data) { enum l1d_flush_type types = *(enum l1d_flush_type *)data; - unsigned int instrs[3], *dest; + unsigned int instrs[3]; long *start, *end; int i; @@ -375,42 +384,13 @@ static int __do_entry_flush_fixups(void *data) start = PTRRELOC(&__start___entry_flush_fixup); end = PTRRELOC(&__stop___entry_flush_fixup); - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - - if (types == L1D_FLUSH_FALLBACK) { - patch_instruction(dest, ppc_inst(instrs[0])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_branch(dest + 1, - (unsigned long)&entry_flush_fallback, BRANCH_SET_LINK); - } else { - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_instruction(dest, ppc_inst(instrs[0])); - } - } + i = do_patch_entry_fixups(start, end, instrs, types == L1D_FLUSH_FALLBACK, + &entry_flush_fallback); start = PTRRELOC(&__start___scv_entry_flush_fixup); end = PTRRELOC(&__stop___scv_entry_flush_fixup); - for (; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - - if (types == L1D_FLUSH_FALLBACK) { - patch_instruction(dest, ppc_inst(instrs[0])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_branch(dest + 1, - (unsigned long)&scv_entry_flush_fallback, BRANCH_SET_LINK); - } else { - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_instruction(dest, ppc_inst(instrs[0])); - } - } - + i += do_patch_entry_fixups(start, end, instrs, types == L1D_FLUSH_FALLBACK, + &scv_entry_flush_fallback); printk(KERN_DEBUG "entry-flush: patched %d locations (%s flush)\n", i, (types == L1D_FLUSH_NONE) ? "no" : From 3d1dbbca33a9c6dd3aafd4d14aaea9cc310723e1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 2 Dec 2022 09:31:41 +0100 Subject: [PATCH 159/181] powerpc/feature-fixups: Refactor other fixups patching Several fonctions have the same loop for patching instructions. Introduce function do_patch_fixups() to refactor those loops. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/58ab36949c18f94d466fc98d6c085783b0cd474f.1669969781.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/feature-fixups.c | 77 +++++++++++-------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 93b3f8ea38aa..25168a59d1ce 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -117,6 +117,24 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BARRIER_NOSPEC +static int do_patch_fixups(long *start, long *end, unsigned int *instrs, int num) +{ + int i; + + for (i = 0; start < end; start++, i++) { + int j; + unsigned int *dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + for (j = 0; j < num; j++) + patch_instruction(dest + j, ppc_inst(instrs[j])); + } + return i; +} +#endif + #ifdef CONFIG_PPC_BOOK3S_64 static int do_patch_entry_fixups(long *start, long *end, unsigned int *instrs, bool do_fallback, void *fallback) @@ -181,7 +199,7 @@ static void do_stf_entry_barrier_fixups(enum stf_barrier_type types) static void do_stf_exit_barrier_fixups(enum stf_barrier_type types) { - unsigned int instrs[6], *dest; + unsigned int instrs[6]; long *start, *end; int i; @@ -215,18 +233,8 @@ static void do_stf_exit_barrier_fixups(enum stf_barrier_type types) instrs[i++] = PPC_RAW_EIEIO() | 0x02000000; /* eieio + bit 6 hint */ } - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); - pr_devel("patching dest %lx\n", (unsigned long)dest); - - patch_instruction(dest, ppc_inst(instrs[0])); - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_instruction(dest + 3, ppc_inst(instrs[3])); - patch_instruction(dest + 4, ppc_inst(instrs[4])); - patch_instruction(dest + 5, ppc_inst(instrs[5])); - } printk(KERN_DEBUG "stf-barrier: patched %d exit locations (%s barrier)\n", i, (types == STF_BARRIER_NONE) ? "no" : (types == STF_BARRIER_FALLBACK) ? "fallback" : @@ -283,7 +291,7 @@ void do_stf_barrier_fixups(enum stf_barrier_type types) void do_uaccess_flush_fixups(enum l1d_flush_type types) { - unsigned int instrs[4], *dest; + unsigned int instrs[4]; long *start, *end; int i; @@ -309,17 +317,7 @@ void do_uaccess_flush_fixups(enum l1d_flush_type types) if (types & L1D_FLUSH_MTTRIG) instrs[i++] = PPC_RAW_MTSPR(SPRN_TRIG2, _R0); - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - - patch_instruction(dest, ppc_inst(instrs[0])); - - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - patch_instruction(dest + 3, ppc_inst(instrs[3])); - } + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); printk(KERN_DEBUG "uaccess-flush: patched %d locations (%s flush)\n", i, (types == L1D_FLUSH_NONE) ? "no" : @@ -418,7 +416,7 @@ void do_entry_flush_fixups(enum l1d_flush_type types) static int __do_rfi_flush_fixups(void *data) { enum l1d_flush_type types = *(enum l1d_flush_type *)data; - unsigned int instrs[3], *dest; + unsigned int instrs[3]; long *start, *end; int i; @@ -442,15 +440,7 @@ static int __do_rfi_flush_fixups(void *data) if (types & L1D_FLUSH_MTTRIG) instrs[i++] = PPC_RAW_MTSPR(SPRN_TRIG2, _R0); - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - - patch_instruction(dest, ppc_inst(instrs[0])); - patch_instruction(dest + 1, ppc_inst(instrs[1])); - patch_instruction(dest + 2, ppc_inst(instrs[2])); - } + i = do_patch_fixups(start, end, instrs, ARRAY_SIZE(instrs)); printk(KERN_DEBUG "rfi-flush: patched %d locations (%s flush)\n", i, (types == L1D_FLUSH_NONE) ? "no" : @@ -492,7 +482,7 @@ void do_rfi_flush_fixups(enum l1d_flush_type types) void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) { - unsigned int instr, *dest; + unsigned int instr; long *start, *end; int i; @@ -506,12 +496,7 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_ instr = PPC_RAW_ORI(_R31, _R31, 0); /* speculation barrier */ } - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - patch_instruction(dest, ppc_inst(instr)); - } + i = do_patch_fixups(start, end, &instr, 1); printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); } @@ -533,7 +518,7 @@ void do_barrier_nospec_fixups(bool enable) #ifdef CONFIG_PPC_E500 void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end) { - unsigned int instr[2], *dest; + unsigned int instr[2]; long *start, *end; int i; @@ -549,13 +534,7 @@ void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_ instr[1] = PPC_RAW_SYNC(); } - for (i = 0; start < end; start++, i++) { - dest = (void *)start + *start; - - pr_devel("patching dest %lx\n", (unsigned long)dest); - patch_instruction(dest, ppc_inst(instr[0])); - patch_instruction(dest + 1, ppc_inst(instr[1])); - } + i = do_patch_fixups(start, end, instr, ARRAY_SIZE(instr)); printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i); } From b988e7797d09379057cf991ae082f9ad7a309a63 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 2 Dec 2022 09:31:42 +0100 Subject: [PATCH 160/181] powerpc/feature-fixups: Do not patch init section after init Once init section is freed, attempting to patch init code ends up in the weed. Commit 51c3c62b58b3 ("powerpc: Avoid code patching freed init sections") protected patch_instruction() against that, but it is the responsibility of the caller to ensure that the patched memory is valid. In the same spirit as jump_label with its jump_label_can_update() function, add is_fixup_addr_valid() function to skip patching on freed init section. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/8e9311fc1b057e4e6a2a3a0701ebcc74b787affe.1669969781.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/feature-fixups.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 25168a59d1ce..80def1c2afcb 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -118,6 +118,12 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } #ifdef CONFIG_PPC_BARRIER_NOSPEC +static bool is_fixup_addr_valid(void *dest, size_t size) +{ + return system_state < SYSTEM_FREEING_INITMEM || + !init_section_contains(dest, size); +} + static int do_patch_fixups(long *start, long *end, unsigned int *instrs, int num) { int i; @@ -126,6 +132,9 @@ static int do_patch_fixups(long *start, long *end, unsigned int *instrs, int num int j; unsigned int *dest = (void *)start + *start; + if (!is_fixup_addr_valid(dest, sizeof(*instrs) * num)) + continue; + pr_devel("patching dest %lx\n", (unsigned long)dest); for (j = 0; j < num; j++) @@ -144,6 +153,9 @@ static int do_patch_entry_fixups(long *start, long *end, unsigned int *instrs, for (i = 0; start < end; start++, i++) { unsigned int *dest = (void *)start + *start; + if (!is_fixup_addr_valid(dest, sizeof(*instrs) * 3)) + continue; + pr_devel("patching dest %lx\n", (unsigned long)dest); // See comment in do_entry_flush_fixups() RE order of patching From 6f3a81b60091031c2c14eb2373d1937b027deb46 Mon Sep 17 00:00:00 2001 From: Christophe Leroy <christophe.leroy@csgroup.eu> Date: Fri, 2 Dec 2022 09:31:43 +0100 Subject: [PATCH 161/181] powerpc/code-patching: Remove protection against patching init addresses after init Once init section is freed, attempting to patch init code ends up in the weed. Commit 51c3c62b58b3 ("powerpc: Avoid code patching freed init sections") protected patch_instruction() against that, but it is the responsibility of the caller to ensure that the patched memory is valid. All callers have now been verified and fixed so the check can be removed. This improves ftrace activation by about 2% on 8xx. Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/504310828f473d424e2ed229eff57bf075f52796.1669969781.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/code-patching.h | 2 -- arch/powerpc/lib/code-patching.c | 13 +------------ arch/powerpc/mm/mem.c | 1 - 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 1c6316ec4b74..3f881548fb61 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -22,8 +22,6 @@ #define BRANCH_SET_LINK 0x1 #define BRANCH_ABSOLUTE 0x2 -DECLARE_STATIC_KEY_FALSE(init_mem_is_free); - /* * Powerpc branch instruction is : * diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index a6a5047f8ba2..73ce4b90bb1b 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -349,7 +349,7 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr) return err; } -static int do_patch_instruction(u32 *addr, ppc_inst_t instr) +int patch_instruction(u32 *addr, ppc_inst_t instr) { int err; unsigned long flags; @@ -372,17 +372,6 @@ static int do_patch_instruction(u32 *addr, ppc_inst_t instr) return err; } - -__ro_after_init DEFINE_STATIC_KEY_FALSE(init_mem_is_free); - -int patch_instruction(u32 *addr, ppc_inst_t instr) -{ - /* Make sure we aren't patching a freed init section */ - if (static_branch_likely(&init_mem_is_free) && init_section_contains(addr, 4)) - return 0; - - return do_patch_instruction(addr, instr); -} NOKPROBE_SYMBOL(patch_instruction); int patch_branch(u32 *addr, unsigned long target, int flags) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 84d171953ba4..8b121df7b08f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -344,7 +344,6 @@ void free_initmem(void) { ppc_md.progress = ppc_printk_progress; mark_initmem_nx(); - static_branch_enable(&init_mem_is_free); free_initmem_default(POISON_FREE_INITMEM); ftrace_free_init_tramp(); } From 8f4ab7da904ab7027ccd43ddb4f0094e932a5877 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin <linmq006@gmail.com> Date: Mon, 5 Dec 2022 12:44:27 +0400 Subject: [PATCH 162/181] selftests/powerpc: Fix resource leaks In check_all_cpu_dscr_defaults, opendir() opens the directory stream. Add missing closedir() in the error path to release it. In check_cpu_dscr_default, open() creates an open file descriptor. Add missing close() in the error path to release it. Fixes: ebd5858c904b ("selftests/powerpc: Add test for all DSCR sysfs interfaces") Signed-off-by: Miaoqian Lin <linmq006@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221205084429.570654-1-linmq006@gmail.com --- tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c index fbbdffdb2e5d..f20d1c166d1e 100644 --- a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c +++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c @@ -24,6 +24,7 @@ static int check_cpu_dscr_default(char *file, unsigned long val) rc = read(fd, buf, sizeof(buf)); if (rc == -1) { perror("read() failed"); + close(fd); return 1; } close(fd); @@ -65,8 +66,10 @@ static int check_all_cpu_dscr_defaults(unsigned long val) if (access(file, F_OK)) continue; - if (check_cpu_dscr_default(file, val)) + if (check_cpu_dscr_default(file, val)) { + closedir(sysfs); return 1; + } } closedir(sysfs); return 0; From 6aecc0a59e07ba895b5473e0c916ba5f3d556c15 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas <bhelgaas@google.com> Date: Mon, 5 Dec 2022 16:32:31 -0600 Subject: [PATCH 163/181] cxl: Remove unnecessary cxl_pci_window_alignment() cxl_pci_window_alignment() is referenced only via the struct pci_controller_ops.window_alignment function pointer, and only in the powerpc implementation of pcibios_window_alignment(). pcibios_window_alignment() defaults to returning 1 if the function pointer is NULL, which is the same was what cxl_pci_window_alignment() does. cxl_pci_window_alignment() is unnecessary, so remove it. No functional change intended. Signed-off-by: Bjorn Helgaas <bhelgaas@google.com> Acked-by: Frederic Barrat <fbarrat@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221205223231.1268085-1-helgaas@kernel.org --- drivers/misc/cxl/vphb.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index 1264253cc07b..6332db8044bd 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -67,12 +67,6 @@ static void cxl_pci_disable_device(struct pci_dev *dev) } } -static resource_size_t cxl_pci_window_alignment(struct pci_bus *bus, - unsigned long type) -{ - return 1; -} - static void cxl_pci_reset_secondary_bus(struct pci_dev *dev) { /* Should we do an AFU reset here ? */ @@ -200,7 +194,6 @@ static struct pci_controller_ops cxl_pci_controller_ops = .enable_device_hook = cxl_pci_enable_device_hook, .disable_device = cxl_pci_disable_device, .release_device = cxl_pci_disable_device, - .window_alignment = cxl_pci_window_alignment, .reset_secondary_bus = cxl_pci_reset_secondary_bus, .setup_msi_irqs = cxl_setup_msi_irqs, .teardown_msi_irqs = cxl_teardown_msi_irqs, From 3ae7c96dd51025550c8001c6f833337f11d00807 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven <geert+renesas@glider.be> Date: Fri, 2 Dec 2022 17:49:26 +0100 Subject: [PATCH 164/181] powerpc/dts/fsl: Fix pca954x i2c-mux node names "make dtbs_check": arch/powerpc/boot/dts/fsl/t1040rdb-rev-a.dtb: pca9546@77: $nodename:0: 'pca9546@77' does not match '^(i2c-?)?mux' From schema: Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml arch/powerpc/boot/dts/fsl/t1024qds.dtb: pca9547@77: Unevaluated properties are not allowed ('#address-cells', '#size-cells', 'i2c@0', 'i2c@2', 'i2c@3' were unexpected) From schema: Documentation/devicetree/bindings/i2c/i2c-mux-pca954x.yaml ... Fix this by renaming pca954x nodes to "i2c-mux", to match the I2C bus multiplexer/switch DT bindings and the Generic Names Recommendation in the Devicetree Specification. Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/6c5d86c49ac170e9d56ab121ea0602f3873849ca.1669999298.git.geert+renesas@glider.be --- arch/powerpc/boot/dts/fsl/t1024qds.dts | 2 +- arch/powerpc/boot/dts/fsl/t1024rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/t104xqds.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t104xrdb.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t208xqds.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t208xrdb.dtsi | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/t1024qds.dts b/arch/powerpc/boot/dts/fsl/t1024qds.dts index d6858b7cd93f..9ea7942f914e 100644 --- a/arch/powerpc/boot/dts/fsl/t1024qds.dts +++ b/arch/powerpc/boot/dts/fsl/t1024qds.dts @@ -151,7 +151,7 @@ }; i2c@118000 { - pca9547@77 { + i2c-mux@77 { compatible = "nxp,pca9547"; reg = <0x77>; #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts b/arch/powerpc/boot/dts/fsl/t1024rdb.dts index dbcd31cc35dc..270aaf631f2a 100644 --- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts @@ -165,7 +165,7 @@ }; i2c@118100 { - pca9546@77 { + i2c-mux@77 { compatible = "nxp,pca9546"; reg = <0x77>; #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/t104xqds.dtsi b/arch/powerpc/boot/dts/fsl/t104xqds.dtsi index 615479732252..1c329f076f64 100644 --- a/arch/powerpc/boot/dts/fsl/t104xqds.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xqds.dtsi @@ -268,7 +268,7 @@ }; i2c@118000 { - pca9547@77 { + i2c-mux@77 { compatible = "nxp,pca9547"; reg = <0x77>; }; diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi index bfe1ed5be337..fc7bec5dcb90 100644 --- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi @@ -128,7 +128,7 @@ }; i2c@118100 { - pca9546@77 { + i2c-mux@77 { compatible = "nxp,pca9546"; reg = <0x77>; #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi index db4139999b28..962c99941645 100644 --- a/arch/powerpc/boot/dts/fsl/t208xqds.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xqds.dtsi @@ -135,7 +135,7 @@ }; i2c@118000 { - pca9547@77 { + i2c-mux@77 { compatible = "nxp,pca9547"; reg = <0x77>; #address-cells = <1>; diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi index ff87e67c70da..ecc3e8c7394c 100644 --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi @@ -138,7 +138,7 @@ }; i2c@118100 { - pca9546@77 { + i2c-mux@77 { compatible = "nxp,pca9546"; reg = <0x77>; }; From 5ddcc03a07ae1ab5062f89a946d9495f1fd8eaa4 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan <aboorvad@linux.vnet.ibm.com> Date: Mon, 14 Nov 2022 20:26:11 +0530 Subject: [PATCH 165/181] powerpc/cpuidle: Set CPUIDLE_FLAG_POLLING for snooze state During the comparative study of cpuidle governors, it is noticed that the menu governor does not select CEDE state in some scenarios even though when the sleep duration of the CPU exceeds the target residency of the CEDE idle state this is because the CPU exits the snooze "polling" state when snooze time limit is reached in the snooze_loop(), which is not a real wake up and it just means that the polling state selection was not adequate. cpuidle governors rely on CPUIDLE_FLAG_POLLING flag to be set for the polling states to handle the condition mentioned above. Hence, set the CPUIDLE_FLAG_POLLING flag for snooze state (polling state) in powerpc arch to make the cpuidle governor work as expected. Reference Commits: - Timeout enabled for snooze state: commit 78eaa10f027c ("cpuidle: powernv/pseries: Auto-promotion of snooze to deeper idle state") - commit dc2251bf98c6 ("cpuidle: Eliminate the CPUIDLE_DRIVER_STATE_START symbol") - Fix wakeup stats in governor for polling states commit 5f26bdceb9c0 ("cpuidle: menu: Fix wakeup statistics updates for polling state") Signed-off-by: Aboorva Devarajan <aboorvad@linux.vnet.ibm.com> Tested-by: Vishal Chourasia <vishalc@linux.vnet.ibm.com> Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com> Reviewed-by: Vishal Chourasia <vishalc@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114145611.37669-1-aboorvad@linux.vnet.ibm.com --- drivers/cpuidle/cpuidle-powernv.c | 5 ++++- drivers/cpuidle/cpuidle-pseries.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 0b5461b3d7dd..9ebedd972df0 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -76,6 +76,7 @@ static int snooze_loop(struct cpuidle_device *dev, local_irq_enable(); snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + dev->poll_time_limit = false; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { @@ -86,6 +87,7 @@ static int snooze_loop(struct cpuidle_device *dev, * cleared to order subsequent test of need_resched(). */ clear_thread_flag(TIF_POLLING_NRFLAG); + dev->poll_time_limit = true; smp_mb(); break; } @@ -155,7 +157,8 @@ static struct cpuidle_state powernv_states[CPUIDLE_STATE_MAX] = { .desc = "snooze", .exit_latency = 0, .target_residency = 0, - .enter = snooze_loop }, + .enter = snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, }; static int powernv_cpuidle_cpu_online(unsigned int cpu) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 7e7ab5597d7a..1bad4d2b7be3 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -44,6 +44,7 @@ static int snooze_loop(struct cpuidle_device *dev, pseries_idle_prolog(); local_irq_enable(); snooze_exit_time = get_tb() + snooze_timeout; + dev->poll_time_limit = false; while (!need_resched()) { HMT_low(); @@ -54,6 +55,7 @@ static int snooze_loop(struct cpuidle_device *dev, * loop anyway. Require a barrier after polling is * cleared to order subsequent test of need_resched(). */ + dev->poll_time_limit = true; clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); break; @@ -268,7 +270,8 @@ static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = { .desc = "snooze", .exit_latency = 0, .target_residency = 0, - .enter = &snooze_loop }, + .enter = &snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, { /* CEDE */ .name = "CEDE", .desc = "CEDE", @@ -286,7 +289,8 @@ static struct cpuidle_state shared_states[] = { .desc = "snooze", .exit_latency = 0, .target_residency = 0, - .enter = &snooze_loop }, + .enter = &snooze_loop, + .flags = CPUIDLE_FLAG_POLLING }, { /* Shared Cede */ .name = "Shared Cede", .desc = "Shared Cede", From e13d23a404f2e6dfaf8b1ef7d161a0836fce4fa5 Mon Sep 17 00:00:00 2001 From: Laurent Dufour <ldufour@linux.ibm.com> Date: Thu, 10 Nov 2022 19:06:18 +0100 Subject: [PATCH 166/181] powerpc: export the CPU node count At boot time, the FDT is parsed to compute the number of CPUs. In addition count the number of CPU nodes and export it. This is useful when building the FDT for a kexeced kernel since we need to take in account the CPU node added since the boot time during CPU hotplug operations. Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221110180619.15796-2-ldufour@linux.ibm.com --- arch/powerpc/include/asm/prom.h | 1 + arch/powerpc/kernel/prom.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 2e82820fbd64..c0107d8ddd8c 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -85,6 +85,7 @@ struct of_drc_info { extern int of_read_drc_info_cell(struct property **prop, const __be32 **curval, struct of_drc_info *data); +extern unsigned int boot_cpu_node_count; /* * There are two methods for telling firmware what our capabilities are. diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1eed87d954ba..645f4450dfc3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -72,6 +72,7 @@ int __initdata iommu_is_off; int __initdata iommu_force_on; unsigned long tce_alloc_start, tce_alloc_end; u64 ppc64_rma_size; +unsigned int boot_cpu_node_count __ro_after_init; #endif static phys_addr_t first_memblock_size; static int __initdata boot_cpu_count; @@ -335,6 +336,8 @@ static int __init early_init_dt_scan_cpus(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; + boot_cpu_node_count++; + /* Get physical cpuid */ intserv = of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &len); if (!intserv) From 340a4a9f8773e102cc5ef531665970a686dfa245 Mon Sep 17 00:00:00 2001 From: Laurent Dufour <ldufour@linux.ibm.com> Date: Thu, 10 Nov 2022 19:06:19 +0100 Subject: [PATCH 167/181] powerpc: Take in account addition CPU node when building kexec FDT On a system with a large number of CPUs, the creation of the FDT for a kexec kernel may fail because the allocated FDT is not large enough. When this happens, such a message is displayed on the console: Unable to add ibm,processor-vadd-size property: FDT_ERR_NOSPACE The property's name may change depending when the buffer overwrite is detected. Obviously the created FDT is missing information, and it is expected that system dump or kexec kernel failed to run properly. When the FDT is allocated, the size of the FDT the kernel received at boot time is used and an extra size can be applied. Currently, only memory added after boot time is taken in account, not the CPU nodes. The extra size should take in account these additional CPU nodes and compute the required extra space. To achieve that, the size of a CPU node, including its subnode is computed once and multiplied by the number of additional CPU nodes. The assumption is that the size of the CPU node is _same_ for all the node, the only variable part should be the name "PowerPC,POWERxx@##" where "##" may vary a little. Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> [mpe: Don't shadow function name w/variable, minor coding style changes] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221110180619.15796-3-ldufour@linux.ibm.com --- arch/powerpc/kexec/file_load_64.c | 59 ++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 349a781cea0b..2500c37c628c 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -26,6 +26,7 @@ #include <asm/firmware.h> #include <asm/kexec_ranges.h> #include <asm/crashdump-ppc64.h> +#include <asm/prom.h> struct umem_info { u64 *buf; /* data buffer for usable-memory property */ @@ -928,6 +929,45 @@ out: return ret; } +/** + * get_cpu_node_size - Compute the size of a CPU node in the FDT. + * This should be done only once and the value is stored in + * a static variable. + * Returns the max size of a CPU node in the FDT. + */ +static unsigned int cpu_node_size(void) +{ + static unsigned int size; + struct device_node *dn; + struct property *pp; + + /* + * Don't compute it twice, we are assuming that the per CPU node size + * doesn't change during the system's life. + */ + if (size) + return size; + + dn = of_find_node_by_type(NULL, "cpu"); + if (WARN_ON_ONCE(!dn)) { + // Unlikely to happen + return 0; + } + + /* + * We compute the sub node size for a CPU node, assuming it + * will be the same for all. + */ + size += strlen(dn->name) + 5; + for_each_property_of_node(dn, pp) { + size += strlen(pp->name); + size += pp->length; + } + + of_node_put(dn); + return size; +} + /** * kexec_extra_fdt_size_ppc64 - Return the estimated additional size needed to * setup FDT for kexec/kdump kernel. @@ -937,6 +977,8 @@ out: */ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) { + unsigned int cpu_nodes, extra_size; + struct device_node *dn; u64 usm_entries; if (image->type != KEXEC_TYPE_CRASH) @@ -949,7 +991,22 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) */ usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); - return (unsigned int)(usm_entries * sizeof(u64)); + + extra_size = (unsigned int)(usm_entries * sizeof(u64)); + + /* + * Get the number of CPU nodes in the current DT. This allows to + * reserve places for CPU nodes added since the boot time. + */ + cpu_nodes = 0; + for_each_node_by_type(dn, "cpu") { + cpu_nodes++; + } + + if (cpu_nodes > boot_cpu_node_count) + extra_size += (cpu_nodes - boot_cpu_node_count) * cpu_node_size(); + + return extra_size; } /** From 9b574cfab7d4e68c67c4ee4fcde912ef54a25b88 Mon Sep 17 00:00:00 2001 From: Laurent Dufour <ldufour@linux.ibm.com> Date: Fri, 25 Nov 2022 18:32:04 +0100 Subject: [PATCH 168/181] powerpc/pseries: reset the RCU watchdogs after a LPM The RCU watchdog timer should be reset when restarting the CPU after a Live Partition Mobility operation. Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> Acked-by: Nicholas Piggin <npiggin@gmail.com> [mpe: Combine comments into a single comment block] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221125173204.15329-1-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/mobility.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index 634fac5db3f9..4cea71aa0f41 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -635,10 +635,13 @@ retry: prod_others(); } /* - * Execution may have been suspended for several seconds, so - * reset the watchdog. + * Execution may have been suspended for several seconds, so reset + * the watchdogs. touch_nmi_watchdog() also touches the soft lockup + * watchdog. */ + rcu_cpu_stall_reset(); touch_nmi_watchdog(); + return ret; } From f6aa37c51ec0d053ee34c235bfe0e666618a3baf Mon Sep 17 00:00:00 2001 From: Laurent Dufour <ldufour@linux.ibm.com> Date: Mon, 14 Nov 2022 17:01:50 +0100 Subject: [PATCH 169/181] powerpc/pseries: unregister VPA when hot unplugging a CPU The VPA should unregister when offlining a CPU. Otherwise there could be a short window where 2 CPUs could share the same VPA. This happens because the hypervisor is still keeping the VPA attached to the vCPU even if it became offline. Here is a potential situation: 1. remove proc A, 2. add proc B. If proc B gets proc A's place in cpu_present_mask, then it registers proc A's VPAs. 3. If proc B is then re-added to the LP, its threads are sharing VPAs with proc A briefly as they come online. As the hypervisor may check for the VPA's yield_count field oddity, it may detect an unexpected value and kill the LPAR. Suggested-by: Nathan Lynch <nathanl@linux.ibm.com> Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Nathan Lynch <nathanl@linux.ibm.com> [mpe: s/cpu_present_map/cpu_present_mask/ in change log] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221114160150.13554-1-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index e0a7ac5db15d..090ae5a1e0f5 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -70,6 +70,7 @@ static void pseries_cpu_offline_self(void) xics_teardown_cpu(); unregister_slb_shadow(hwcpu); + unregister_vpa(hwcpu); rtas_stop_self(); /* Should never get here... */ From 336e2554ec99eb97616004c791ee89abe96bdab2 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:39 -0600 Subject: [PATCH 170/181] powerpc/rtas: document rtas_call() rtas_call() has a complex calling convention, non-standard return values, and many users. Add kernel-doc for it and remove the less structured commentary from rtas.h. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-2-nathanl@linux.ibm.com --- arch/powerpc/include/asm/rtas.h | 15 --------- arch/powerpc/kernel/rtas.c | 58 +++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 56319aea646e..479a95cb2770 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -33,21 +33,6 @@ #define RTAS_THREADS_ACTIVE -9005 /* Multiple processor threads active */ #define RTAS_OUTSTANDING_COPROC -9006 /* Outstanding coprocessor operations */ -/* - * In general to call RTAS use rtas_token("string") to lookup - * an RTAS token for the given string (e.g. "event-scan"). - * To actually perform the call use - * ret = rtas_call(token, n_in, n_out, ...) - * Where n_in is the number of input parameters and - * n_out is the number of output parameters - * - * If the "string" is invalid on this system, RTAS_UNKNOWN_SERVICE - * will be returned as a token. rtas_call() does look for this - * token and error out gracefully so rtas_call(rtas_token("str"), ...) - * may be safely used for one-shot calls to RTAS. - * - */ - /* RTAS event classes */ #define RTAS_INTERNAL_ERROR 0x80000000 /* set bit 0 */ #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index e847f9b1c5b9..c12dd5ed5e00 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -467,6 +467,64 @@ void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, static int ibm_open_errinjct_token; static int ibm_errinjct_token; +/** + * rtas_call() - Invoke an RTAS firmware function. + * @token: Identifies the function being invoked. + * @nargs: Number of input parameters. Does not include token. + * @nret: Number of output parameters, including the call status. + * @outputs: Array of @nret output words. + * @....: List of @nargs input parameters. + * + * Invokes the RTAS function indicated by @token, which the caller + * should obtain via rtas_token(). + * + * The @nargs and @nret arguments must match the number of input and + * output parameters specified for the RTAS function. + * + * rtas_call() returns RTAS status codes, not conventional Linux errno + * values. Callers must translate any failure to an appropriate errno + * in syscall context. Most callers of RTAS functions that can return + * -2 or 990x should use rtas_busy_delay() to correctly handle those + * statuses before calling again. + * + * The return value descriptions are adapted from 7.2.8 [RTAS] Return + * Codes of the PAPR and CHRP specifications. + * + * Context: Process context preferably, interrupt context if + * necessary. Acquires an internal spinlock and may perform + * GFP_ATOMIC slab allocation in error path. Unsafe for NMI + * context. + * Return: + * * 0 - RTAS function call succeeded. + * * -1 - RTAS function encountered a hardware or + * platform error, or the token is invalid, + * or the function is restricted by kernel policy. + * * -2 - Specs say "A necessary hardware device was busy, + * and the requested function could not be + * performed. The operation should be retried at + * a later time." This is misleading, at least with + * respect to current RTAS implementations. What it + * usually means in practice is that the function + * could not be completed while meeting RTAS's + * deadline for returning control to the OS (250us + * for PAPR/PowerVM, typically), but the call may be + * immediately reattempted to resume work on it. + * * -3 - Parameter error. + * * -7 - Unexpected state change. + * * 9000...9899 - Vendor-specific success codes. + * * 9900...9905 - Advisory extended delay. Caller should try + * again after ~10^x ms has elapsed, where x is + * the last digit of the status [0-5]. Again going + * beyond the PAPR text, 990x on PowerVM indicates + * contention for RTAS-internal resources. Other + * RTAS call sequences in progress should be + * allowed to complete before reattempting the + * call. + * * -9000 - Multi-level isolation error. + * * -9999...-9004 - Vendor-specific error codes. + * * Additional negative values - Function-specific error. + * * Additional positive values - Function-specific success. + */ int rtas_call(int token, int nargs, int nret, int *outputs, ...) { va_list list; From b10af504a2015d12c566b6b0a4c7e3b602949eeb Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:40 -0600 Subject: [PATCH 171/181] powerpc/rtasd: use correct OF API for event scan rate rtas_token() should be used only for properties that are RTAS function tokens. "rtas-event-scan-rate" does not contain a function token, but it has the same size/format as token properties so reading it with rtas_token() happens to work. Convert to of_property_read_u32(). Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-3-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtasd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 5270b450bbde..cc56ac6ba4b0 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/kernel.h> +#include <linux/of.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/init.h> @@ -499,6 +500,8 @@ EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); static int __init rtas_event_scan_init(void) { + int err; + if (!machine_is(pseries) && !machine_is(chrp)) return 0; @@ -509,8 +512,8 @@ static int __init rtas_event_scan_init(void) return -ENODEV; } - rtas_event_scan_rate = rtas_token("rtas-event-scan-rate"); - if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) { + err = of_property_read_u32(rtas.dev, "rtas-event-scan-rate", &rtas_event_scan_rate); + if (err) { printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n"); return -ENODEV; } From ed2213bfb192ab51f09f12e9b49b5d482c6493f3 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:41 -0600 Subject: [PATCH 172/181] powerpc/rtas: avoid device tree lookups in rtas_os_term() rtas_os_term() is called during panic. Its behavior depends on a couple of conditions in the /rtas node of the device tree, the traversal of which entails locking and local IRQ state changes. If the kernel panics while devtree_lock is held, rtas_os_term() as currently written could hang. Instead of discovering the relevant characteristics at panic time, cache them in file-static variables at boot. Note the lookup for "ibm,extended-os-term" is converted to of_property_read_bool() since it is a boolean property, not an RTAS function token. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> [mpe: Incorporate suggested change from Nick] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-4-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index c12dd5ed5e00..db43cbdcc74c 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -947,6 +947,7 @@ void __noreturn rtas_halt(void) /* Must be in the RMO region, so we place it here */ static char rtas_os_term_buf[2048]; +static s32 ibm_os_term_token = RTAS_UNKNOWN_SERVICE; void rtas_os_term(char *str) { @@ -958,14 +959,13 @@ void rtas_os_term(char *str) * this property may terminate the partition which we want to avoid * since it interferes with panic_timeout. */ - if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") || - RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term")) + if (ibm_os_term_token == RTAS_UNKNOWN_SERVICE) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); do { - status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL, + status = rtas_call(ibm_os_term_token, 1, 1, NULL, __pa(rtas_os_term_buf)); } while (rtas_busy_delay(status)); @@ -1335,6 +1335,13 @@ void __init rtas_initialize(void) no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry); rtas.entry = no_entry ? rtas.base : entry; + /* + * Discover these now to avoid device tree lookups in the + * panic path. + */ + if (of_property_read_bool(rtas.dev, "ibm,extended-os-term")) + ibm_os_term_token = rtas_token("ibm,os-term"); + /* If RTAS was found, allocate the RMO buffer for it and look for * the stop-self token if any */ From 6c606e57eecc37d6b36d732b1ff7e55b7dc32dd4 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:42 -0600 Subject: [PATCH 173/181] powerpc/rtas: avoid scheduling in rtas_os_term() It's unsafe to use rtas_busy_delay() to handle a busy status from the ibm,os-term RTAS function in rtas_os_term(): Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b BUG: sleeping function called from invalid context at arch/powerpc/kernel/rtas.c:618 in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 1, name: swapper/0 preempt_count: 2, expected: 0 CPU: 7 PID: 1 Comm: swapper/0 Tainted: G D 6.0.0-rc5-02182-gf8553a572277-dirty #9 Call Trace: [c000000007b8f000] [c000000001337110] dump_stack_lvl+0xb4/0x110 (unreliable) [c000000007b8f040] [c0000000002440e4] __might_resched+0x394/0x3c0 [c000000007b8f0e0] [c00000000004f680] rtas_busy_delay+0x120/0x1b0 [c000000007b8f100] [c000000000052d04] rtas_os_term+0xb8/0xf4 [c000000007b8f180] [c0000000001150fc] pseries_panic+0x50/0x68 [c000000007b8f1f0] [c000000000036354] ppc_panic_platform_handler+0x34/0x50 [c000000007b8f210] [c0000000002303c4] notifier_call_chain+0xd4/0x1c0 [c000000007b8f2b0] [c0000000002306cc] atomic_notifier_call_chain+0xac/0x1c0 [c000000007b8f2f0] [c0000000001d62b8] panic+0x228/0x4d0 [c000000007b8f390] [c0000000001e573c] do_exit+0x140c/0x1420 [c000000007b8f480] [c0000000001e586c] make_task_dead+0xdc/0x200 Use rtas_busy_delay_time() instead, which signals without side effects whether to attempt the ibm,os-term RTAS call again. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-5-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index db43cbdcc74c..f21b39fcaf99 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -964,10 +964,15 @@ void rtas_os_term(char *str) snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); + /* + * Keep calling as long as RTAS returns a "try again" status, + * but don't use rtas_busy_delay(), which potentially + * schedules. + */ do { status = rtas_call(ibm_os_term_token, 1, 1, NULL, __pa(rtas_os_term_buf)); - } while (rtas_busy_delay(status)); + } while (rtas_busy_delay_time(status)); if (status != 0) printk(KERN_EMERG "ibm,os-term call failed %d\n", status); From 9aafbfa5f57a4b75bafd3bed0191e8429c5fa618 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:43 -0600 Subject: [PATCH 174/181] powerpc/pseries/eeh: use correct API for error log size rtas-error-log-max is not the name of an RTAS function, so rtas_token() is not the appropriate API for retrieving its value. We already have rtas_get_error_log_max() which returns a sensible value if the property is absent for any reason, so use that instead. Fixes: 8d633291b4fc ("powerpc/eeh: pseries platform EEH error log retrieval") Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> [mpe: Drop no-longer possible error handling as noticed by ajd] Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-6-nathanl@linux.ibm.com --- arch/powerpc/platforms/pseries/eeh_pseries.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index ea890037843c..6b507b62ce8f 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -848,16 +848,7 @@ static int __init eeh_pseries_init(void) } /* Initialize error log size */ - eeh_error_buf_size = rtas_token("rtas-error-log-max"); - if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { - pr_info("%s: unknown EEH error log size\n", - __func__); - eeh_error_buf_size = 1024; - } else if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { - pr_info("%s: EEH error log size %d exceeds the maximal %d\n", - __func__, eeh_error_buf_size, RTAS_ERROR_LOG_MAX); - eeh_error_buf_size = RTAS_ERROR_LOG_MAX; - } + eeh_error_buf_size = rtas_get_error_log_max(); /* Set EEH probe mode */ eeh_add_flag(EEH_PROBE_MODE_DEVTREE | EEH_ENABLE_IO_FOR_LOG); From c67a0e411d0ffe0648fe84e25e9f899ce770feb3 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:44 -0600 Subject: [PATCH 175/181] powerpc/rtas: clean up rtas_error_log_max initialization The code in rtas_get_error_log_max() doesn't cause problems in practice, but there are no measures to ensure that the lazy initialization of the static rtas_error_log_max variable is atomic, and it's not worth adding them. Initialize the static rtas_error_log_max variable at boot when we're single-threaded instead of lazily on first use. Use the more appropriate of_property_read_u32() API instead of rtas_token() to consult the "rtas-error-log-max" property, which is not the name of an RTAS function. Convert use of printk() to pr_warn() and distinguish the possible error cases. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-7-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index f21b39fcaf99..a1b637259e3d 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -353,6 +353,9 @@ int rtas_service_present(const char *service) EXPORT_SYMBOL(rtas_service_present); #ifdef CONFIG_RTAS_ERROR_LOGGING + +static u32 rtas_error_log_max __ro_after_init = RTAS_ERROR_LOG_MAX; + /* * Return the firmware-specified size of the error log buffer * for all rtas calls that require an error buffer argument. @@ -360,21 +363,30 @@ EXPORT_SYMBOL(rtas_service_present); */ int rtas_get_error_log_max(void) { - static int rtas_error_log_max; - if (rtas_error_log_max) - return rtas_error_log_max; - - rtas_error_log_max = rtas_token ("rtas-error-log-max"); - if ((rtas_error_log_max == RTAS_UNKNOWN_SERVICE) || - (rtas_error_log_max > RTAS_ERROR_LOG_MAX)) { - printk (KERN_WARNING "RTAS: bad log buffer size %d\n", - rtas_error_log_max); - rtas_error_log_max = RTAS_ERROR_LOG_MAX; - } return rtas_error_log_max; } EXPORT_SYMBOL(rtas_get_error_log_max); +static void __init init_error_log_max(void) +{ + static const char propname[] __initconst = "rtas-error-log-max"; + u32 max; + + if (of_property_read_u32(rtas.dev, propname, &max)) { + pr_warn("%s not found, using default of %u\n", + propname, RTAS_ERROR_LOG_MAX); + max = RTAS_ERROR_LOG_MAX; + } + + if (max > RTAS_ERROR_LOG_MAX) { + pr_warn("%s = %u, clamping max error log size to %u\n", + propname, max, RTAS_ERROR_LOG_MAX); + max = RTAS_ERROR_LOG_MAX; + } + + rtas_error_log_max = max; +} + static char rtas_err_buf[RTAS_ERROR_LOG_MAX]; static int rtas_last_error_token; @@ -432,6 +444,7 @@ static char *__fetch_rtas_last_error(char *altbuf) #else /* CONFIG_RTAS_ERROR_LOGGING */ #define __fetch_rtas_last_error(x) NULL #define get_errorlog_buffer() NULL +static void __init init_error_log_max(void) {} #endif @@ -1340,6 +1353,8 @@ void __init rtas_initialize(void) no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry); rtas.entry = no_entry ? rtas.base : entry; + init_error_log_max(); + /* * Discover these now to avoid device tree lookups in the * panic path. From 9581f8a00777a073fdd8146659a51ca007cae8d6 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:45 -0600 Subject: [PATCH 176/181] powerpc/rtas: clean up includes rtas.c used to host complex code related to pseries-specific guest migration and suspend, which used atomics, completions, hcalls, and CPU hotplug APIs. That's all been deleted or moved, so remove the include directives that have been rendered unnecessary. Sort the remainder (with linux/ before asm/) to impose some order on where future additions go. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-8-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 50 +++++++++++++++----------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index a1b637259e3d..5ce17b4bd7eb 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -7,43 +7,33 @@ * Copyright (C) 2001 IBM. */ -#include <linux/stdarg.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/export.h> -#include <linux/init.h> #include <linux/capability.h> #include <linux/delay.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/completion.h> -#include <linux/cpumask.h> +#include <linux/export.h> +#include <linux/init.h> +#include <linux/kernel.h> #include <linux/memblock.h> -#include <linux/slab.h> -#include <linux/reboot.h> -#include <linux/security.h> -#include <linux/syscalls.h> #include <linux/of.h> #include <linux/of_fdt.h> - -#include <asm/interrupt.h> -#include <asm/rtas.h> -#include <asm/hvcall.h> -#include <asm/machdep.h> -#include <asm/firmware.h> -#include <asm/page.h> -#include <asm/param.h> -#include <asm/delay.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/security.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/stdarg.h> +#include <linux/syscalls.h> +#include <linux/types.h> #include <linux/uaccess.h> -#include <asm/udbg.h> -#include <asm/syscalls.h> -#include <asm/smp.h> -#include <linux/atomic.h> -#include <asm/time.h> + +#include <asm/delay.h> +#include <asm/firmware.h> +#include <asm/interrupt.h> +#include <asm/machdep.h> #include <asm/mmu.h> -#include <asm/topology.h> +#include <asm/page.h> +#include <asm/rtas.h> +#include <asm/time.h> +#include <asm/udbg.h> /* This is here deliberately so it's only used in this file */ void enter_rtas(unsigned long); From f975b6559bac510f1b1b39637997bb240f0a9969 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:46 -0600 Subject: [PATCH 177/181] powerpc/rtas: define pr_fmt and convert printk call sites Set pr_fmt to "rtas: " and convert the handful of printk() uses in rtas.c, adjusting the messages to remove now-redundant "RTAS" strings. Note that rtas_restart(), rtas_power_off(), and rtas_halt() all currently use printk() without specifying a log level. These have been changed to use pr_emerg(), which matches the behavior of rtas_os_term(). Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-9-nathanl@linux.ibm.com --- arch/powerpc/kernel/rtas.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 5ce17b4bd7eb..10c19228aaa3 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -7,6 +7,8 @@ * Copyright (C) 2001 IBM. */ +#define pr_fmt(fmt) "rtas: " fmt + #include <linux/capability.h> #include <linux/delay.h> #include <linux/export.h> @@ -718,8 +720,7 @@ static int rtas_error_rc(int rtas_rc) rc = -ENODEV; break; default: - printk(KERN_ERR "%s: unexpected RTAS error %d\n", - __func__, rtas_rc); + pr_err("%s: unexpected error %d\n", __func__, rtas_rc); rc = -ERANGE; break; } @@ -923,8 +924,8 @@ void __noreturn rtas_restart(char *cmd) { if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_RESTART); - printk("RTAS system-reboot returned %d\n", - rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); + pr_emerg("system-reboot returned %d\n", + rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); for (;;); } @@ -933,8 +934,8 @@ void rtas_power_off(void) if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_POWER_OFF); /* allow power on only with power button press */ - printk("RTAS power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + pr_emerg("power-off returned %d\n", + rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); for (;;); } @@ -943,8 +944,8 @@ void __noreturn rtas_halt(void) if (rtas_flash_term_hook) rtas_flash_term_hook(SYS_HALT); /* allow power on only with power button press */ - printk("RTAS power-off returned %d\n", - rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); + pr_emerg("power-off returned %d\n", + rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1)); for (;;); } @@ -978,7 +979,7 @@ void rtas_os_term(char *str) } while (rtas_busy_delay_time(status)); if (status != 0) - printk(KERN_EMERG "ibm,os-term call failed %d\n", status); + pr_emerg("ibm,os-term call failed %d\n", status); } /** From 98c738c8cee6e5a58d4060862e2f8cf3cdc8a328 Mon Sep 17 00:00:00 2001 From: Nathan Lynch <nathanl@linux.ibm.com> Date: Fri, 18 Nov 2022 09:07:47 -0600 Subject: [PATCH 178/181] powerpc/rtas: mandate RTAS syscall filtering CONFIG_PPC_RTAS_FILTER has been optional but default-enabled since its introduction. It's been enabled in enterprise distro kernels for a while without causing ABI breakage that wasn't easily fixed, and it prevents harmful abuses of the rtas syscall. Let's make it unconditional. Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com> Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221118150751.469393-10-nathanl@linux.ibm.com --- arch/powerpc/Kconfig | 13 ------------- arch/powerpc/kernel/rtas.c | 16 ---------------- 2 files changed, 29 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e21d6de797d6..65952f62ea4b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1044,19 +1044,6 @@ config PPC_SECVAR_SYSFS read/write operations on these variables. Say Y if you have secure boot enabled and want to expose variables to userspace. -config PPC_RTAS_FILTER - bool "Enable filtering of RTAS syscalls" - default y - depends on PPC_RTAS - help - The RTAS syscall API has security issues that could be used to - compromise system integrity. This option enforces restrictions on the - RTAS calls and arguments passed by userspace programs to mitigate - these issues. - - Say Y unless you know what you are doing and the filter is causing - problems for you. - endmenu config ISA_DMA_API diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 10c19228aaa3..deded51a7978 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1050,8 +1050,6 @@ noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log return NULL; } -#ifdef CONFIG_PPC_RTAS_FILTER - /* * The sys_rtas syscall, as originally designed, allows root to pass * arbitrary physical addresses to RTAS calls. A number of RTAS calls @@ -1200,20 +1198,6 @@ static void __init rtas_syscall_filter_init(void) rtas_filters[i].token = rtas_token(rtas_filters[i].name); } -#else - -static bool block_rtas_call(int token, int nargs, - struct rtas_args *args) -{ - return false; -} - -static void __init rtas_syscall_filter_init(void) -{ -} - -#endif /* CONFIG_PPC_RTAS_FILTER */ - /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { From 64fdcbcc064966bbf261bb455876dffa58858d32 Mon Sep 17 00:00:00 2001 From: Michael Ellerman <mpe@ellerman.id.au> Date: Thu, 8 Dec 2022 09:43:15 +1100 Subject: [PATCH 179/181] powerpc/prom: Fix 32-bit build Add an IS_ENABLED() check to fix the build error: arch/powerpc/kernel/prom.o: in function `early_init_dt_scan_cpus': prom.c:(.init.text+0x2ea): undefined reference to `boot_cpu_node_count' Fixes: e13d23a404f2 ("powerpc: export the CPU node count") Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> --- arch/powerpc/kernel/prom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 645f4450dfc3..4f1c920aa13e 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -336,7 +336,8 @@ static int __init early_init_dt_scan_cpus(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; - boot_cpu_node_count++; + if (IS_ENABLED(CONFIG_PPC64)) + boot_cpu_node_count++; /* Get physical cpuid */ intserv = of_get_flat_dt_prop(node, "ibm,ppc-interrupt-server#s", &len); From 13959373e9c9021cc80730c7bd1242e07b10b328 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin <npiggin@gmail.com> Date: Thu, 8 Dec 2022 22:32:25 +1000 Subject: [PATCH 180/181] powerpc/qspinlock: Fix 32-bit build Some 32-bit configurations don't pull in the spin_begin/end/relax definitions. Fix is to restore a lost include. Reported-by: kernel test robot <lkp@intel.com> Fixes: 84990b169557 ("powerpc/qspinlock: add mcs queueing for contended waiters") Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/oe-kbuild-all/202212050224.i7uh9fOh-lkp@intel.com Link: https://lore.kernel.org/r/20221208123225.1566113-1-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 1cf5d3e75250..e4bd145255d0 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -3,6 +3,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <linux/percpu.h> +#include <linux/processor.h> #include <linux/smp.h> #include <linux/topology.h> #include <linux/sched/clock.h> From 980411a4d1bb925d28cd9e8d8301dc982ece788d Mon Sep 17 00:00:00 2001 From: Michael Ellerman <mpe@ellerman.id.au> Date: Fri, 16 Dec 2022 12:43:12 +1100 Subject: [PATCH 181/181] powerpc/code-patching: Fix oops with DEBUG_VM enabled Nathan reported that the new per-cpu mm patching oopses if DEBUG_VM is enabled: ------------[ cut here ]------------ kernel BUG at arch/powerpc/mm/pgtable.c:333! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.1.0-rc2+ #1 Hardware name: IBM PowerNV (emulated by qemu) POWER9 0x4e1200 opal:v7.0 PowerNV ... NIP assert_pte_locked+0x180/0x1a0 LR assert_pte_locked+0x170/0x1a0 Call Trace: 0x60000000 (unreliable) patch_instruction+0x618/0x6d0 arch_prepare_kprobe+0xfc/0x2d0 register_kprobe+0x520/0x7c0 arch_init_kprobes+0x28/0x3c init_kprobes+0x108/0x184 do_one_initcall+0x60/0x2e0 kernel_init_freeable+0x1f0/0x3e0 kernel_init+0x34/0x1d0 ret_from_kernel_thread+0x5c/0x64 It's caused by the assert_spin_locked() failing in assert_pte_locked(). The assert fails because the PTE was unlocked in text_area_cpu_up_mm(), and never relocked. The PTE page shouldn't be freed, the patching_mm is only used for patching on this CPU, only that single PTE is ever mapped, and it's only unmapped at CPU offline. In fact assert_pte_locked() has a special case to ignore init_mm entirely, and the patching_mm is more-or-less like init_mm, so possibly the check could be skipped for patching_mm too. But for now be conservative, and use the proper PTE accessors at patching time, so that the PTE lock is held while the PTE is used. That also avoids the warning in assert_pte_locked(). With that it's no longer necessary to save the PTE in cpu_patching_context for the mm_patch_enabled() case. Fixes: c28c15b6d28a ("powerpc/code-patching: Use temporary mm for Radix MMU") Reported-by: Nathan Chancellor <nathan@kernel.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20221216125913.990972-1-mpe@ellerman.id.au --- arch/powerpc/lib/code-patching.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 73ce4b90bb1b..b00112d7ad46 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -178,7 +178,6 @@ static int text_area_cpu_up_mm(unsigned int cpu) this_cpu_write(cpu_patching_context.mm, mm); this_cpu_write(cpu_patching_context.addr, addr); - this_cpu_write(cpu_patching_context.pte, pte); return 0; @@ -195,7 +194,6 @@ static int text_area_cpu_down_mm(unsigned int cpu) this_cpu_write(cpu_patching_context.mm, NULL); this_cpu_write(cpu_patching_context.addr, 0); - this_cpu_write(cpu_patching_context.pte, NULL); return 0; } @@ -289,12 +287,16 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) unsigned long pfn = get_patch_pfn(addr); struct mm_struct *patching_mm; struct mm_struct *orig_mm; + spinlock_t *ptl; patching_mm = __this_cpu_read(cpu_patching_context.mm); - pte = __this_cpu_read(cpu_patching_context.pte); text_poke_addr = __this_cpu_read(cpu_patching_context.addr); patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); + if (!pte) + return -ENOMEM; + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); /* order PTE update before use, also serves as the hwsync */ @@ -321,6 +323,8 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr) */ local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + pte_unmap_unlock(pte, ptl); + return err; }