From 0cbc2659123e6946ba926c5ec3871efe310123e8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 22 Jun 2020 13:28:34 +0100 Subject: [PATCH 001/148] arm64: vdso32: Remove a bunch of #ifdef CONFIG_COMPAT_VDSO guards Most of the compat vDSO code can be built and guarded using IS_ENABLED, so drop the unnecessary #ifdefs. Reviewed-by: Vincenzo Frascino Reviewed-by: Ard Biesheuvel Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso.c | 44 ++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index d4202a32abc9..1501b22a3cf6 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -30,15 +30,11 @@ #include extern char vdso_start[], vdso_end[]; -#ifdef CONFIG_COMPAT_VDSO extern char vdso32_start[], vdso32_end[]; -#endif /* CONFIG_COMPAT_VDSO */ enum vdso_abi { VDSO_ABI_AA64, -#ifdef CONFIG_COMPAT_VDSO VDSO_ABI_AA32, -#endif /* CONFIG_COMPAT_VDSO */ }; enum vvar_pages { @@ -284,21 +280,17 @@ up_fail: /* * Create and map the vectors page for AArch32 tasks. */ -#ifdef CONFIG_COMPAT_VDSO static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { return __vdso_remap(VDSO_ABI_AA32, sm, new_vma); } -#endif /* CONFIG_COMPAT_VDSO */ enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ -#ifdef CONFIG_COMPAT_VDSO + AA32_MAP_SIGPAGE, AA32_MAP_VVAR, AA32_MAP_VDSO, -#endif - AA32_MAP_SIGPAGE }; static struct page *aarch32_vectors_page __ro_after_init; @@ -309,7 +301,10 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { .name = "[vectors]", /* ABI */ .pages = &aarch32_vectors_page, }, -#ifdef CONFIG_COMPAT_VDSO + [AA32_MAP_SIGPAGE] = { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_sig_page, + }, [AA32_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, @@ -319,11 +314,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { .name = "[vdso]", .mremap = aarch32_vdso_mremap, }, -#endif /* CONFIG_COMPAT_VDSO */ - [AA32_MAP_SIGPAGE] = { - .name = "[sigpage]", /* ABI */ - .pages = &aarch32_sig_page, - }, }; static int aarch32_alloc_kuser_vdso_page(void) @@ -362,25 +352,25 @@ static int aarch32_alloc_sigpage(void) return 0; } -#ifdef CONFIG_COMPAT_VDSO static int __aarch32_alloc_vdso_pages(void) { + + if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) + return 0; + vdso_info[VDSO_ABI_AA32].dm = &aarch32_vdso_maps[AA32_MAP_VVAR]; vdso_info[VDSO_ABI_AA32].cm = &aarch32_vdso_maps[AA32_MAP_VDSO]; return __vdso_init(VDSO_ABI_AA32); } -#endif /* CONFIG_COMPAT_VDSO */ static int __init aarch32_alloc_vdso_pages(void) { int ret; -#ifdef CONFIG_COMPAT_VDSO ret = __aarch32_alloc_vdso_pages(); if (ret) return ret; -#endif ret = aarch32_alloc_sigpage(); if (ret) @@ -449,14 +439,14 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (ret) goto out; -#ifdef CONFIG_COMPAT_VDSO - ret = __setup_additional_pages(VDSO_ABI_AA32, - mm, - bprm, - uses_interp); - if (ret) - goto out; -#endif /* CONFIG_COMPAT_VDSO */ + if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { + ret = __setup_additional_pages(VDSO_ABI_AA32, + mm, + bprm, + uses_interp); + if (ret) + goto out; + } ret = aarch32_sigreturn_setup(mm); out: From 2a30aca81a72d71e128eb692cc4755f33e317670 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 22 Jun 2020 13:37:09 +0100 Subject: [PATCH 002/148] arm64: vdso: Fix unusual formatting in *setup_additional_pages() There's really no need to put every parameter on a new line when calling a function with a long name, 
so reformat the *setup_additional_pages() functions in the vDSO setup code to follow the usual conventions. Acked-by: Mark Rutland Reviewed-by: Vincenzo Frascino Reviewed-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/kernel/vdso.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 1501b22a3cf6..debb8995d57f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -440,9 +440,7 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) goto out; if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { - ret = __setup_additional_pages(VDSO_ABI_AA32, - mm, - bprm, + ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, uses_interp); if (ret) goto out; @@ -487,8 +485,7 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; int ret; @@ -496,11 +493,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(VDSO_ABI_AA64, - mm, - bprm, - uses_interp); - + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); mmap_write_unlock(mm); return ret; From c058b1c4a5ea7b88cce4c961c1000acf482ea64b Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 6 Sep 2019 10:55:29 +0100 Subject: [PATCH 003/148] arm64: mte: system register definitions Add Memory Tagging Extension system register definitions together with the relevant bitfields. Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/include/asm/sysreg.h | 53 ++++++++++++++++++++++++++++ arch/arm64/include/uapi/asm/ptrace.h | 1 + arch/arm64/kernel/ptrace.c | 2 +- 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1da8e3dc4455..1593669cca4d 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,6 +12,7 @@ #include /* Hyp Configuration Register (HCR) bits */ +#define HCR_ATA (UL(1) << 56) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) #define HCR_APK (UL(1) << 40) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 554a7e8ecb07..6fa9aa477e76 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -91,10 +91,12 @@ #define PSTATE_PAN pstate_field(0, 4) #define PSTATE_UAO pstate_field(0, 3) #define PSTATE_SSBS pstate_field(3, 1) +#define PSTATE_TCO pstate_field(3, 4) #define SET_PSTATE_PAN(x) __emit_inst(0xd500401f | PSTATE_PAN | ((!!x) << PSTATE_Imm_shift)) #define SET_PSTATE_UAO(x) __emit_inst(0xd500401f | PSTATE_UAO | ((!!x) << PSTATE_Imm_shift)) #define SET_PSTATE_SSBS(x) __emit_inst(0xd500401f | PSTATE_SSBS | ((!!x) << PSTATE_Imm_shift)) +#define SET_PSTATE_TCO(x) __emit_inst(0xd500401f | PSTATE_TCO | ((!!x) << PSTATE_Imm_shift)) #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -181,6 +183,8 @@ #define SYS_SCTLR_EL1 sys_reg(3, 0, 1, 0, 0) #define SYS_ACTLR_EL1 sys_reg(3, 0, 1, 0, 1) #define SYS_CPACR_EL1 sys_reg(3, 0, 1, 0, 2) +#define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) +#define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) #define SYS_ZCR_EL1 sys_reg(3, 0, 1, 
2, 0) @@ -218,6 +222,8 @@ #define SYS_ERXADDR_EL1 sys_reg(3, 0, 5, 4, 3) #define SYS_ERXMISC0_EL1 sys_reg(3, 0, 5, 5, 0) #define SYS_ERXMISC1_EL1 sys_reg(3, 0, 5, 5, 1) +#define SYS_TFSR_EL1 sys_reg(3, 0, 5, 6, 0) +#define SYS_TFSRE0_EL1 sys_reg(3, 0, 5, 6, 1) #define SYS_FAR_EL1 sys_reg(3, 0, 6, 0, 0) #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) @@ -368,6 +374,7 @@ #define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0) #define SYS_CLIDR_EL1 sys_reg(3, 1, 0, 0, 1) +#define SYS_GMID_EL1 sys_reg(3, 1, 0, 0, 4) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) #define SYS_CSSELR_EL1 sys_reg(3, 2, 0, 0, 0) @@ -460,6 +467,7 @@ #define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0) #define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3) #define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0) +#define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0) #define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0) #define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1) @@ -516,6 +524,7 @@ #define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0) #define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1) #define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0) +#define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0) #define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0) #define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0) #define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0) @@ -531,6 +540,15 @@ /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_DSSBS (BIT(44)) +#define SCTLR_ELx_ATA (BIT(43)) + +#define SCTLR_ELx_TCF_SHIFT 40 +#define SCTLR_ELx_TCF_NONE (UL(0x0) << SCTLR_ELx_TCF_SHIFT) +#define SCTLR_ELx_TCF_SYNC (UL(0x1) << SCTLR_ELx_TCF_SHIFT) +#define SCTLR_ELx_TCF_ASYNC (UL(0x2) << SCTLR_ELx_TCF_SHIFT) +#define SCTLR_ELx_TCF_MASK (UL(0x3) << SCTLR_ELx_TCF_SHIFT) + +#define SCTLR_ELx_ITFSB (BIT(37)) #define SCTLR_ELx_ENIA (BIT(31)) #define SCTLR_ELx_ENIB (BIT(30)) #define SCTLR_ELx_ENDA (BIT(27)) @@ -559,6 +577,14 @@ #endif /* SCTLR_EL1 specific flags. 
*/ +#define SCTLR_EL1_ATA0 (BIT(42)) + +#define SCTLR_EL1_TCF0_SHIFT 38 +#define SCTLR_EL1_TCF0_NONE (UL(0x0) << SCTLR_EL1_TCF0_SHIFT) +#define SCTLR_EL1_TCF0_SYNC (UL(0x1) << SCTLR_EL1_TCF0_SHIFT) +#define SCTLR_EL1_TCF0_ASYNC (UL(0x2) << SCTLR_EL1_TCF0_SHIFT) +#define SCTLR_EL1_TCF0_MASK (UL(0x3) << SCTLR_EL1_TCF0_SHIFT) + #define SCTLR_EL1_BT1 (BIT(36)) #define SCTLR_EL1_BT0 (BIT(35)) #define SCTLR_EL1_UCI (BIT(26)) @@ -595,6 +621,7 @@ #define MAIR_ATTR_DEVICE_GRE UL(0x0c) #define MAIR_ATTR_NORMAL_NC UL(0x44) #define MAIR_ATTR_NORMAL_WT UL(0xbb) +#define MAIR_ATTR_NORMAL_TAGGED UL(0xf0) #define MAIR_ATTR_NORMAL UL(0xff) #define MAIR_ATTR_MASK UL(0xff) @@ -686,6 +713,10 @@ #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 #define ID_AA64PFR1_BT_BTI 0x1 +#define ID_AA64PFR1_MTE_NI 0x0 +#define ID_AA64PFR1_MTE_EL0 0x1 +#define ID_AA64PFR1_MTE 0x2 + /* id_aa64zfr0 */ #define ID_AA64ZFR0_F64MM_SHIFT 56 #define ID_AA64ZFR0_F32MM_SHIFT 52 @@ -920,6 +951,28 @@ #define CPACR_EL1_ZEN_EL0EN (BIT(17)) /* enable EL0 access, if EL1EN set */ #define CPACR_EL1_ZEN (CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) +/* TCR EL1 Bit Definitions */ +#define SYS_TCR_EL1_TCMA1 (BIT(58)) +#define SYS_TCR_EL1_TCMA0 (BIT(57)) + +/* GCR_EL1 Definitions */ +#define SYS_GCR_EL1_RRND (BIT(16)) +#define SYS_GCR_EL1_EXCL_MASK 0xffffUL + +/* RGSR_EL1 Definitions */ +#define SYS_RGSR_EL1_TAG_MASK 0xfUL +#define SYS_RGSR_EL1_SEED_SHIFT 8 +#define SYS_RGSR_EL1_SEED_MASK 0xffffUL + +/* GMID_EL1 field definitions */ +#define SYS_GMID_EL1_BS_SHIFT 0 +#define SYS_GMID_EL1_BS_SIZE 4 + +/* TFSR{,E0}_EL1 bit definitions */ +#define SYS_TFSR_EL1_TF0_SHIFT 0 +#define SYS_TFSR_EL1_TF1_SHIFT 1 +#define SYS_TFSR_EL1_TF0 (UL(1) << SYS_TFSR_EL1_TF0_SHIFT) +#define SYS_TFSR_EL1_TF1 (UK(2) << SYS_TFSR_EL1_TF1_SHIFT) /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */ #define SYS_MPIDR_SAFE_VAL (BIT(31)) diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 42cbe34d95ce..06413d9f2341 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -51,6 +51,7 @@ #define PSR_PAN_BIT 0x00400000 #define PSR_UAO_BIT 0x00800000 #define PSR_DIT_BIT 0x01000000 +#define PSR_TCO_BIT 0x02000000 #define PSR_V_BIT 0x10000000 #define PSR_C_BIT 0x20000000 #define PSR_Z_BIT 0x40000000 diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index d8ebfd813e28..8942de814b72 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1793,7 +1793,7 @@ void syscall_trace_exit(struct pt_regs *regs) * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ - (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ + (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) From 0178dc7613684561ff3bb1625cd5504f1e7fbe3d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Nov 2019 09:51:13 +0000 Subject: [PATCH 004/148] arm64: mte: Use Normal Tagged attributes for the linear map Once user space is given access to tagged memory, the kernel must be able to clear/save/restore tags visible to the user. This is done via the linear mapping, therefore map it as such. The new MT_NORMAL_TAGGED index for MAIR_EL1 is initially mapped as Normal memory and later changed to Normal Tagged via the cpufeature infrastructure. 
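The MAIR_EL1 register is simply eight 8-bit attribute fields selected by the MT_* indices, so the later switch rewrites only one byte. A small illustrative sketch (plain C, not kernel code) of that encoding, assuming the usual MAIR_ATTRIDX(attr, idx) == attr << (idx * 8) helper and using the values added by this series (MT_NORMAL_TAGGED == 6, MAIR_ATTR_NORMAL_TAGGED == 0xf0):

#include <stdint.h>
#include <stdio.h>

#define MT_NORMAL               4
#define MT_NORMAL_TAGGED        6
#define MAIR_ATTR_NORMAL        0xffULL
#define MAIR_ATTR_NORMAL_TAGGED 0xf0ULL
/* assumed helper: attribute 'attr' goes into byte 'idx' of MAIR_EL1 */
#define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8))

int main(void)
{
        /* at boot the Tagged index still carries the plain Normal attribute */
        uint64_t mair = MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) |
                        MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED);

        printf("MT_NORMAL_TAGGED byte at boot:  %#llx\n",
               (unsigned long long)((mair >> (8 * MT_NORMAL_TAGGED)) & 0xff));

        /* once MTE is detected, only that byte is rewritten to Normal Tagged */
        mair &= ~MAIR_ATTRIDX(0xffULL, MT_NORMAL_TAGGED);
        mair |= MAIR_ATTRIDX(MAIR_ATTR_NORMAL_TAGGED, MT_NORMAL_TAGGED);

        printf("MT_NORMAL_TAGGED byte with MTE: %#llx\n",
               (unsigned long long)((mair >> (8 * MT_NORMAL_TAGGED)) & 0xff));
        return 0;
}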
From a mismatched attribute aliases perspective, the Tagged memory is considered a permission and it won't lead to undefined behaviour. Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Suzuki K Poulose --- arch/arm64/include/asm/memory.h | 1 + arch/arm64/include/asm/pgtable-prot.h | 2 ++ arch/arm64/mm/dump.c | 4 ++++ arch/arm64/mm/mmu.c | 20 ++++++++++++++++++-- arch/arm64/mm/proc.S | 8 ++++++-- 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index afa722504bfd..1e0a78266410 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -133,6 +133,7 @@ #define MT_NORMAL_NC 3 #define MT_NORMAL 4 #define MT_NORMAL_WT 5 +#define MT_NORMAL_TAGGED 6 /* * Memory types for Stage-2 translation diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 4d867c6446c4..afd8b9fc76f2 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -50,6 +50,7 @@ extern bool arm64_use_ng_mappings; #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL_WT (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_WT)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) +#define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) @@ -59,6 +60,7 @@ extern bool arm64_use_ng_mappings; #define _HYP_PAGE_DEFAULT _PAGE_DEFAULT #define PAGE_KERNEL __pgprot(PROT_NORMAL) +#define PAGE_KERNEL_TAGGED __pgprot(PROT_NORMAL_TAGGED) #define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 0b8da1cc1c07..ba6d1d89f9b2 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -169,6 +169,10 @@ static const struct prot_bits pte_bits[] = { .mask = PTE_ATTRINDX_MASK, .val = PTE_ATTRINDX(MT_NORMAL), .set = "MEM/NORMAL", + }, { + .mask = PTE_ATTRINDX_MASK, + .val = PTE_ATTRINDX(MT_NORMAL_TAGGED), + .set = "MEM/NORMAL-TAGGED", } }; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 75df62fea1b6..936c4762dadf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -122,7 +122,7 @@ static bool pgattr_change_is_safe(u64 old, u64 new) * The following mapping attributes may be updated in live * kernel mappings without the need for break-before-make. */ - static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; + pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; /* creating or taking down mappings is always safe */ if (old == 0 || new == 0) @@ -136,6 +136,17 @@ static bool pgattr_change_is_safe(u64 old, u64 new) if (old & ~new & PTE_NG) return false; + /* + * Changing the memory type between Normal and Normal-Tagged is safe + * since Tagged is considered a permission attribute from the + * mismatched attribute aliases perspective. 
+ */ + if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || + (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) && + ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || + (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED))) + mask |= PTE_ATTRINDX_MASK; + return ((old ^ new) & ~mask) == 0; } @@ -491,7 +502,12 @@ static void __init map_mem(pgd_t *pgdp) if (memblock_is_nomap(reg)) continue; - __map_memblock(pgdp, start, end, PAGE_KERNEL, flags); + /* + * The linear map must allow allocation tags reading/writing + * if MTE is present. Otherwise, it has the same attributes as + * PAGE_KERNEL. + */ + __map_memblock(pgdp, start, end, PAGE_KERNEL_TAGGED, flags); } /* diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 796e47a571e6..4817ed52e343 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -44,14 +44,18 @@ #define TCR_KASAN_FLAGS 0 #endif -/* Default MAIR_EL1 */ +/* + * Default MAIR_EL1. MT_NORMAL_TAGGED is initially mapped as Normal memory and + * changed during __cpu_setup to Normal Tagged if the system supports MTE. + */ #define MAIR_EL1_SET \ (MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRnE, MT_DEVICE_nGnRnE) | \ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_nGnRE, MT_DEVICE_nGnRE) | \ MAIR_ATTRIDX(MAIR_ATTR_DEVICE_GRE, MT_DEVICE_GRE) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL_NC, MT_NORMAL_NC) | \ MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL) | \ - MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT)) + MAIR_ATTRIDX(MAIR_ATTR_NORMAL_WT, MT_NORMAL_WT) | \ + MAIR_ATTRIDX(MAIR_ATTR_NORMAL, MT_NORMAL_TAGGED)) #ifdef CONFIG_CPU_PM /** From 3b714d24ef173f81c78af16f73dcc9b40428c803 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 6 Sep 2019 10:58:01 +0100 Subject: [PATCH 005/148] arm64: mte: CPU feature detection and initial sysreg configuration Add the cpufeature and hwcap entries to detect the presence of MTE. Any secondary CPU not supporting the feature, if detected on the boot CPU, will be parked. Add the minimum SCTLR_EL1 and HCR_EL2 bits for enabling MTE. The Normal Tagged memory type is configured in MAIR_EL1 before the MMU is enabled in order to avoid disrupting other CPUs in the CnP domain. 
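The new HWCAP2_MTE bit gives user space a cheap way to probe for the feature before relying on any MTE behaviour. A minimal illustrative C snippet (not part of the patch) that reads the bit via getauxval(3); the fallback #define only matters when building against uapi headers that predate this change:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP2_MTE
#define HWCAP2_MTE      (1 << 18)       /* value added to uapi/asm/hwcap.h below */
#endif

int main(void)
{
        unsigned long hwcap2 = getauxval(AT_HWCAP2);

        printf("MTE %s\n", (hwcap2 & HWCAP2_MTE) ?
               "reported via HWCAP2_MTE" : "not reported by this kernel/CPU");
        return 0;
}

On an MTE-capable system running a kernel with this series, /proc/cpuinfo also lists the new "mte" string added to hwcap_str[].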
Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Suzuki K Poulose --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/cpufeature.h | 6 ++++++ arch/arm64/include/asm/hwcap.h | 2 +- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 2 +- arch/arm64/kernel/cpufeature.c | 17 +++++++++++++++++ arch/arm64/kernel/cpuinfo.c | 2 +- arch/arm64/mm/proc.S | 24 ++++++++++++++++++++++++ 9 files changed, 54 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 07b643a70710..1937653b05a3 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -64,7 +64,8 @@ #define ARM64_BTI 54 #define ARM64_HAS_ARMv8_4_TTL 55 #define ARM64_HAS_TLB_RANGE 56 +#define ARM64_MTE 57 -#define ARM64_NCAPS 57 +#define ARM64_NCAPS 58 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 89b4f0142c28..680b5b36ddd5 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -681,6 +681,12 @@ static __always_inline bool system_uses_irq_prio_masking(void) cpus_have_const_cap(ARM64_HAS_IRQ_PRIO_MASKING); } +static inline bool system_supports_mte(void) +{ + return IS_ENABLED(CONFIG_ARM64_MTE) && + cpus_have_const_cap(ARM64_MTE); +} + static inline bool system_has_prio_mask_debugging(void) { return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) && diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 22f73fe09030..0d4a6741b6a5 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -95,7 +95,7 @@ #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) #define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) -/* reserved for KERNEL_HWCAP_MTE __khwcap2_feature(MTE) */ +#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE) /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1593669cca4d..5aee5f79b24e 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -79,7 +79,7 @@ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) -#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK) +#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6fa9aa477e76..daf030a05de0 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -613,6 +613,7 @@ SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\ SCTLR_EL1_DZE | SCTLR_EL1_UCT |\ SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\ + SCTLR_ELx_ITFSB| SCTLR_ELx_ATA | SCTLR_EL1_ATA0 |\ ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1) /* MAIR_ELx memory attributes (used by Linux) */ diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 912162f73529..b8f41aa234ee 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -74,6 +74,6 @@ #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) #define HWCAP2_BTI (1 << 17) -/* reserved for HWCAP2_MTE (1 << 18) */ +#define HWCAP2_MTE (1 << 18) 
#endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6424584be01e..fabc8a237223 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -227,6 +227,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE), + FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MTE_SHIFT, 4, ID_AA64PFR1_MTE_NI), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), @@ -2121,6 +2123,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, }, #endif +#ifdef CONFIG_ARM64_MTE + { + .desc = "Memory Tagging Extension", + .capability = ARM64_MTE, + .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR1_EL1, + .field_pos = ID_AA64PFR1_MTE_SHIFT, + .min_field_value = ID_AA64PFR1_MTE, + .sign = FTR_UNSIGNED, + }, +#endif /* CONFIG_ARM64_MTE */ {}, }; @@ -2237,6 +2251,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif +#ifdef CONFIG_ARM64_MTE + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE), +#endif /* CONFIG_ARM64_MTE */ {}, }; diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index d0076c2159e6..6104b87f021d 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -93,7 +93,7 @@ static const char *const hwcap_str[] = { "dgh", "rng", "bti", - /* reserved for "mte" */ + "mte", NULL }; diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 4817ed52e343..23c326a06b2d 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_ARM64_64K_PAGES #define TCR_TG_FLAGS TCR_TG0_64K | TCR_TG1_64K @@ -425,6 +426,29 @@ SYM_FUNC_START(__cpu_setup) * Memory region attributes */ mov_q x5, MAIR_EL1_SET +#ifdef CONFIG_ARM64_MTE + /* + * Update MAIR_EL1, GCR_EL1 and TFSR*_EL1 if MTE is supported + * (ID_AA64PFR1_EL1[11:8] > 1). + */ + mrs x10, ID_AA64PFR1_EL1 + ubfx x10, x10, #ID_AA64PFR1_MTE_SHIFT, #4 + cmp x10, #ID_AA64PFR1_MTE + b.lt 1f + + /* Normal Tagged memory type at the corresponding MAIR index */ + mov x10, #MAIR_ATTR_NORMAL_TAGGED + bfi x5, x10, #(8 * MT_NORMAL_TAGGED), #8 + + /* initialize GCR_EL1: all non-zero tags excluded by default */ + mov x10, #(SYS_GCR_EL1_RRND | SYS_GCR_EL1_EXCL_MASK) + msr_s SYS_GCR_EL1, x10 + + /* clear any pending tag check faults in TFSR*_EL1 */ + msr_s SYS_TFSR_EL1, xzr + msr_s SYS_TFSRE0_EL1, xzr +1: +#endif msr mair_el1, x5 /* * Set/prepare TCR and TTBR. 
We use 512GB (39-bit) address range for From 2ac638fc5724f011f8ba1b425667c5592e1571ce Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 26 Aug 2020 18:52:59 +0100 Subject: [PATCH 006/148] arm64: kvm: mte: Hide the MTE CPUID information from the guests KVM does not support MTE in guests yet, so clear the corresponding field in the ID_AA64PFR1_EL1 register. In addition, inject an undefined exception in the guest if it accesses one of the GCR_EL1, RGSR_EL1, TFSR_EL1 or TFSRE0_EL1 registers. While the emulate_sys_reg() function already injects an undefined exception, this patch prevents the unnecessary printk. Signed-off-by: Catalin Marinas Cc: Steven Price Acked-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 077293b5115f..379f4969d0bd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1131,6 +1131,8 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_sve(vcpu)) val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT); + } else if (id == SYS_ID_AA64PFR1_EL1) { + val &= ~(0xfUL << ID_AA64PFR1_MTE_SHIFT); } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) { val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) | (0xfUL << ID_AA64ISAR1_API_SHIFT) | @@ -1382,6 +1384,13 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; } +static bool access_mte_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *r) +{ + kvm_inject_undefined(vcpu); + return false; +} + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1547,6 +1556,10 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + + { SYS_DESC(SYS_RGSR_EL1), access_mte_regs }, + { SYS_DESC(SYS_GCR_EL1), access_mte_regs }, + { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, @@ -1571,6 +1584,9 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi }, { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi }, + { SYS_DESC(SYS_TFSR_EL1), access_mte_regs }, + { SYS_DESC(SYS_TFSRE0_EL1), access_mte_regs }, + { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, From 74f1082487feb90bbf880af14beb8e29c3030c9f Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Wed, 7 Aug 2019 12:21:05 +0100 Subject: [PATCH 007/148] arm64: mte: Add specific SIGSEGV codes Add MTE-specific SIGSEGV codes to siginfo.h and update the x86 BUILD_BUG_ON(NSIGSEGV != 7) compile check. Signed-off-by: Vincenzo Frascino [catalin.marinas@arm.com: renamed precise/imprecise to sync/async] [catalin.marinas@arm.com: dropped #ifdef __aarch64__, renumbered] Signed-off-by: Catalin Marinas Acked-by: "Eric W. 
Biederman" Cc: Arnd Bergmann Cc: Will Deacon --- arch/x86/kernel/signal_compat.c | 2 +- include/uapi/asm-generic/siginfo.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 9ccbf0576cd0..a7f3e12cfbdb 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void) */ BUILD_BUG_ON(NSIGILL != 11); BUILD_BUG_ON(NSIGFPE != 15); - BUILD_BUG_ON(NSIGSEGV != 7); + BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index cb3d6c267181..7aacf9389010 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -229,7 +229,9 @@ typedef struct siginfo { #define SEGV_ACCADI 5 /* ADI not enabled for mapped object */ #define SEGV_ADIDERR 6 /* Disrupting MCD error */ #define SEGV_ADIPERR 7 /* Precise MCD exception */ -#define NSIGSEGV 7 +#define SEGV_MTEAERR 8 /* Asynchronous ARM MTE error */ +#define SEGV_MTESERR 9 /* Synchronous ARM MTE exception */ +#define NSIGSEGV 9 /* * SIGBUS si_codes From 637ec831ea4f09c7529ac4078399ce4e25b46341 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Mon, 16 Sep 2019 11:51:17 +0100 Subject: [PATCH 008/148] arm64: mte: Handle synchronous and asynchronous tag check faults The Memory Tagging Extension has two modes of notifying a tag check fault at EL0, configurable through the SCTLR_EL1.TCF0 field: 1. Synchronous raising of a Data Abort exception with DFSC 17. 2. Asynchronous setting of a cumulative bit in TFSRE0_EL1. Add the exception handler for the synchronous exception and handling of the asynchronous TFSRE0_EL1.TF0 bit setting via a new TIF flag in do_notify_resume(). On a tag check failure in user-space, whether synchronous or asynchronous, a SIGSEGV will be raised on the faulting thread. Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 23 +++++++++++++++++ arch/arm64/include/asm/thread_info.h | 4 ++- arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/entry.S | 37 ++++++++++++++++++++++++++++ arch/arm64/kernel/mte.c | 21 ++++++++++++++++ arch/arm64/kernel/process.c | 8 +++++- arch/arm64/kernel/signal.c | 9 +++++++ arch/arm64/kernel/syscall.c | 10 ++++++++ arch/arm64/mm/fault.c | 9 ++++++- 9 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/include/asm/mte.h create mode 100644 arch/arm64/kernel/mte.c diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h new file mode 100644 index 000000000000..a0bf310da74b --- /dev/null +++ b/arch/arm64/include/asm/mte.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 ARM Ltd. 
+ */ +#ifndef __ASM_MTE_H +#define __ASM_MTE_H + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_ARM64_MTE + +void flush_mte_state(void); + +#else + +static inline void flush_mte_state(void) +{ +} + +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_MTE_H */ diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 5e784e16ee89..1fbab854a51b 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -67,6 +67,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ +#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -96,10 +97,11 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) +#define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE | _TIF_FSCHECK) + _TIF_UPROBE | _TIF_FSCHECK | _TIF_MTE_ASYNC_FAULT) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index a561cbb91d4d..5fb9b728459b 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o +obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso/ probes/ obj-$(CONFIG_COMPAT_VDSO) += vdso32/ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 55af8b504b65..ff34461524d4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -149,6 +149,32 @@ alternative_cb_end #endif .endm + /* Check for MTE asynchronous tag check faults */ + .macro check_mte_async_tcf, flgs, tmp +#ifdef CONFIG_ARM64_MTE +alternative_if_not ARM64_MTE + b 1f +alternative_else_nop_endif + mrs_s \tmp, SYS_TFSRE0_EL1 + tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f + /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ + orr \flgs, \flgs, #_TIF_MTE_ASYNC_FAULT + str \flgs, [tsk, #TSK_TI_FLAGS] + msr_s SYS_TFSRE0_EL1, xzr +1: +#endif + .endm + + /* Clear the MTE asynchronous tag check faults */ + .macro clear_mte_async_tcf +#ifdef CONFIG_ARM64_MTE +alternative_if ARM64_MTE + dsb ish + msr_s SYS_TFSRE0_EL1, xzr +alternative_else_nop_endif +#endif + .endm + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 @@ -182,6 +208,8 @@ alternative_cb_end ldr x19, [tsk, #TSK_TI_FLAGS] disable_step_tsk x19, x20 + /* Check for asynchronous tag check faults in user space */ + check_mte_async_tcf x19, x22 apply_ssbd 1, x22, x23 ptrauth_keys_install_kernel tsk, x20, x22, x23 @@ -233,6 +261,13 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING str x20, [sp, #S_PMR_SAVE] alternative_else_nop_endif + /* Re-enable tag checking (TCO set on exception entry) */ +#ifdef CONFIG_ARM64_MTE +alternative_if ARM64_MTE + SET_PSTATE_TCO(0) +alternative_else_nop_endif +#endif + /* * Registers that may be useful after this macro 
is invoked: * @@ -744,6 +779,8 @@ SYM_CODE_START_LOCAL(ret_to_user) and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: + /* Ignore asynchronous tag check faults in the uaccess routines */ + clear_mte_async_tcf enable_step_tsk x1, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK bl stackleak_erase diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c new file mode 100644 index 000000000000..032016823957 --- /dev/null +++ b/arch/arm64/kernel/mte.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 ARM Ltd. + */ + +#include + +#include +#include +#include + +void flush_mte_state(void) +{ + if (!system_supports_mte()) + return; + + /* clear any pending asynchronous tag fault */ + dsb(ish); + write_sysreg_s(0, SYS_TFSRE0_EL1); + clear_thread_flag(TIF_MTE_ASYNC_FAULT); +} diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f1804496b935..a49028efab68 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -239,7 +240,7 @@ static void print_pstate(struct pt_regs *regs) const char *btype_str = btypes[(pstate & PSR_BTYPE_MASK) >> PSR_BTYPE_SHIFT]; - printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO BTYPE=%s)\n", + printk("pstate: %08llx (%c%c%c%c %c%c%c%c %cPAN %cUAO %cTCO BTYPE=%s)\n", pstate, pstate & PSR_N_BIT ? 'N' : 'n', pstate & PSR_Z_BIT ? 'Z' : 'z', @@ -251,6 +252,7 @@ static void print_pstate(struct pt_regs *regs) pstate & PSR_F_BIT ? 'F' : 'f', pstate & PSR_PAN_BIT ? '+' : '-', pstate & PSR_UAO_BIT ? '+' : '-', + pstate & PSR_TCO_BIT ? '+' : '-', btype_str); } } @@ -336,6 +338,7 @@ void flush_thread(void) tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); + flush_mte_state(); } void release_thread(struct task_struct *dead_task) @@ -368,6 +371,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + /* clear any pending asynchronous tag fault raised by the parent */ + clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); + return 0; } diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 3b4f31f35e45..b27e87572ce3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -748,6 +748,9 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->pstate |= PSR_BTYPE_C; } + /* TCO (Tag Check Override) always cleared for signal handlers */ + regs->pstate &= ~PSR_TCO_BIT; + if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else @@ -932,6 +935,12 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, if (thread_flags & _TIF_UPROBE) uprobe_notify_resume(regs); + if (thread_flags & _TIF_MTE_ASYNC_FAULT) { + clear_thread_flag(TIF_MTE_ASYNC_FAULT); + send_sig_fault(SIGSEGV, SEGV_MTEAERR, + (void __user *)NULL, current); + } + if (thread_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 5f0c04863d2c..e4c0dadf0d92 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -123,6 +123,16 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, local_daif_restore(DAIF_PROCCTX); user_exit(); + if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) { + /* + * Process the asynchronous tag check fault before the actual + * syscall. do_notify_resume() will send a signal to userspace + * before the syscall is restarted. 
+ */ + regs->regs[0] = -ERESTARTNOINTR; + return; + } + if (has_syscall_work(flags)) { /* * The de-facto standard way to skip a system call using ptrace diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f07333e86c2f..a3bd189602df 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -641,6 +641,13 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 0; } +static int do_tag_check_fault(unsigned long addr, unsigned int esr, + struct pt_regs *regs) +{ + do_bad_area(addr, esr, regs); + return 0; +} + static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" }, { do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" }, @@ -659,7 +666,7 @@ static const struct fault_info fault_info[] = { { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, { do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" }, - { do_bad, SIGKILL, SI_KERNEL, "unknown 17" }, + { do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 18" }, { do_bad, SIGKILL, SI_KERNEL, "unknown 19" }, { do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" }, From 4beba9486abd2f86d125271d6946f7c38ed0fe77 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 22 Apr 2020 15:25:27 +0100 Subject: [PATCH 009/148] mm: Add PG_arch_2 page flag For arm64 MTE support it is necessary to be able to mark pages that contain user space visible tags that will need to be saved/restored e.g. when swapped out. To support this add a new arch specific flag (PG_arch_2). This flag is only available on 64-bit architectures due to the limited number of spare page flags on the 32-bit ones. 
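Besides the in-kernel users, the flag is exported to user space as bit 41 (KPF_ARCH_2) of /proc/kpageflags, as the fs/proc and tools/vm hunks below show. A rough sketch of inspecting that bit (needs root, minimal error handling; the page frame number is passed on the command line):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define KPF_ARCH_2      41      /* bit exported by this patch */

int main(int argc, char **argv)
{
        uint64_t pfn, flags;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
                return 1;
        }
        pfn = strtoull(argv[1], NULL, 0);

        /* /proc/kpageflags holds one 64-bit flags word per page frame */
        fd = open("/proc/kpageflags", O_RDONLY);
        if (fd < 0 || pread(fd, &flags, sizeof(flags), pfn * 8) != sizeof(flags)) {
                perror("kpageflags");
                return 1;
        }

        printf("pfn %#llx: arch_2 %s\n", (unsigned long long)pfn,
               (flags & (1ULL << KPF_ARCH_2)) ? "set" : "clear");
        close(fd);
        return 0;
}

The same bit is decoded symbolically by the updated tools/vm/page-types.c.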
Signed-off-by: Steven Price [catalin.marinas@arm.com: use CONFIG_64BIT for guarding this new flag] Signed-off-by: Catalin Marinas Cc: Andrew Morton --- fs/proc/page.c | 3 +++ include/linux/kernel-page-flags.h | 1 + include/linux/page-flags.h | 3 +++ include/trace/events/mmflags.h | 9 ++++++++- tools/vm/page-types.c | 2 ++ 5 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index f909243d4a66..9f1077d94cde 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); +#ifdef CONFIG_64BIT + u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif return u; }; diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index abd20ef93c98..eee1877a354e 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -17,5 +17,6 @@ #define KPF_ARCH 38 #define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 +#define KPF_ARCH_2 41 #endif /* LINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6be1aa559b1e..276140c94f4a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -135,6 +135,9 @@ enum pageflags { #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) PG_young, PG_idle, +#endif +#ifdef CONFIG_64BIT + PG_arch_2, #endif __NR_PAGEFLAGS, diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 5fb752034386..67018d367b9f 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -79,6 +79,12 @@ #define IF_HAVE_PG_IDLE(flag,string) #endif +#ifdef CONFIG_64BIT +#define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_ARCH_2(flag,string) +#endif + #define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ {1UL << PG_waiters, "waiters" }, \ @@ -105,7 +111,8 @@ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ -IF_HAVE_PG_IDLE(PG_idle, "idle" ) +IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ +IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define show_page_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 58c0eab71bca..0517c744b04e 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -78,6 +78,7 @@ #define KPF_ARCH 38 #define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 +#define KPF_ARCH_2 41 /* [48-] take some arbitrary free slots for expanding overloaded flags * not part of kernel API @@ -135,6 +136,7 @@ static const char * const page_flag_names[] = { [KPF_ARCH] = "h:arch", [KPF_UNCACHED] = "c:uncached", [KPF_SOFTDIRTY] = "f:softdirty", + [KPF_ARCH_2] = "H:arch_2", [KPF_READAHEAD] = "I:readahead", [KPF_SLOB_FREE] = "P:slob_free", From 72e6afa08e988744822f9bf18043fc04c4df2178 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 2 Jul 2020 10:19:30 +0100 Subject: [PATCH 010/148] mm: Preserve the PG_arch_2 flag in __split_huge_page_tail() When a huge page is split into normal pages, part of the head page flags are transferred to the tail pages. However, the PG_arch_* flags are not part of the preserved set. PG_arch_2 is used by the arm64 MTE support to mark pages that have valid tags. 
The absence of such flag would cause the arm64 set_pte_at() to clear the tags in order to avoid stale tags exposed to user or the swapping out hooks to ignore the tags. Not preserving PG_arch_2 on huge page splitting leads to tag corruption in the tail pages. Preserve the newly added PG_arch_2 flag in __split_huge_page_tail(). Signed-off-by: Catalin Marinas Cc: Andrew Morton --- mm/huge_memory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2ccff8472cd4..1a5773c95f53 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2337,6 +2337,9 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | +#ifdef CONFIG_64BIT + (1L << PG_arch_2) | +#endif (1L << PG_dirty))); /* ->mapping in first tail page is compound_mapcount */ From 34bfeea4a9e9cdae713637541f240c3adfdfede3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 4 May 2020 14:42:36 +0100 Subject: [PATCH 011/148] arm64: mte: Clear the tags when a page is mapped in user-space with PROT_MTE Pages allocated by the kernel are not guaranteed to have the tags zeroed, especially as the kernel does not (yet) use MTE itself. To ensure the user can still access such pages when mapped into its address space, clear the tags via set_pte_at(). A new page flag - PG_mte_tagged (PG_arch_2) - is used to track pages with valid allocation tags. Since the zero page is mapped as pte_special(), it won't be covered by the above set_pte_at() mechanism. Clear its tags during early MTE initialisation. Co-developed-by: Steven Price Signed-off-by: Steven Price Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 16 +++++++++++++++ arch/arm64/include/asm/pgtable.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 18 +++++++++++++++++ arch/arm64/kernel/mte.c | 14 +++++++++++++ arch/arm64/lib/Makefile | 2 ++ arch/arm64/lib/mte.S | 34 ++++++++++++++++++++++++++++++++ 6 files changed, 91 insertions(+) create mode 100644 arch/arm64/lib/mte.S diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index a0bf310da74b..1716b3d02489 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -7,12 +7,28 @@ #ifndef __ASSEMBLY__ +#include + +#include + +void mte_clear_page_tags(void *addr); + #ifdef CONFIG_ARM64_MTE +/* track which pages have valid allocation tags */ +#define PG_mte_tagged PG_arch_2 + +void mte_sync_tags(pte_t *ptep, pte_t pte); void flush_mte_state(void); #else +/* unused if !CONFIG_ARM64_MTE, silence the compiler */ +#define PG_mte_tagged 0 + +static inline void mte_sync_tags(pte_t *ptep, pte_t pte) +{ +} static inline void flush_mte_state(void) { } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d5d3fbe73953..0a205a8e91b2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -90,6 +91,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) #define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP)) +#define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \ + PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define pte_cont_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \ @@ -284,6 +287,10 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, if 
(pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) __sync_icache_dcache(pte); + if (system_supports_mte() && + pte_present(pte) && pte_tagged(pte) && !pte_special(pte)) + mte_sync_tags(ptep, pte); + __check_racy_pte_update(mm, ptep, pte); set_pte(ptep, pte); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index fabc8a237223..add9da5d8ea3 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -1704,6 +1705,22 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused) } #endif /* CONFIG_ARM64_BTI */ +#ifdef CONFIG_ARM64_MTE +static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) +{ + static bool cleared_zero_page = false; + + /* + * Clear the tags in the zero page. This needs to be done via the + * linear map which has the Tagged attribute. + */ + if (!cleared_zero_page) { + cleared_zero_page = true; + mte_clear_page_tags(lm_alias(empty_zero_page)); + } +} +#endif /* CONFIG_ARM64_MTE */ + /* Internal helper functions to match cpu capability type */ static bool cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap) @@ -2133,6 +2150,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR1_MTE_SHIFT, .min_field_value = ID_AA64PFR1_MTE, .sign = FTR_UNSIGNED, + .cpu_enable = cpu_enable_mte, }, #endif /* CONFIG_ARM64_MTE */ {}, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 032016823957..5bf9bbed5a25 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -3,12 +3,26 @@ * Copyright (C) 2020 ARM Ltd. */ +#include +#include #include #include #include #include +void mte_sync_tags(pte_t *ptep, pte_t pte) +{ + struct page *page = pte_page(pte); + long i, nr_pages = compound_nr(page); + + /* if PG_mte_tagged is set, tags have already been initialised */ + for (i = 0; i < nr_pages; i++, page++) { + if (!test_and_set_bit(PG_mte_tagged, &page->flags)) + mte_clear_page_tags(page_address(page)); + } +} + void flush_mte_state(void) { if (!system_supports_mte()) diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 2fc253466dbf..d31e1169d9b8 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -16,3 +16,5 @@ lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + +obj-$(CONFIG_ARM64_MTE) += mte.o diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S new file mode 100644 index 000000000000..a36705640086 --- /dev/null +++ b/arch/arm64/lib/mte.S @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 ARM Ltd. + */ +#include + +#include +#include + + .arch armv8.5-a+memtag + +/* + * multitag_transfer_size - set \reg to the block size that is accessed by the + * LDGM/STGM instructions. 
+ */ + .macro multitag_transfer_size, reg, tmp + mrs_s \reg, SYS_GMID_EL1 + ubfx \reg, \reg, #SYS_GMID_EL1_BS_SHIFT, #SYS_GMID_EL1_BS_SIZE + mov \tmp, #4 + lsl \reg, \tmp, \reg + .endm + +/* + * Clear the tags in a page + * x0 - address of the page to be cleared + */ +SYM_FUNC_START(mte_clear_page_tags) + multitag_transfer_size x1, x2 +1: stgm xzr, [x0] + add x0, x0, x1 + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + ret +SYM_FUNC_END(mte_clear_page_tags) From 2563776b41c3190849c6b011c72b47bff314963d Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Tue, 6 Aug 2019 11:37:53 +0100 Subject: [PATCH 012/148] arm64: mte: Tags-aware copy_{user_,}highpage() implementations When the Memory Tagging Extension is enabled, the tags need to be preserved across page copy (e.g. for copy-on-write, page migration). Introduce MTE-aware copy_{user_,}highpage() functions to copy tags to the destination if the source page has the PG_mte_tagged flag set. copy_user_page() does not need to handle tag copying since, with this patch, it is only called by the DAX code where there is no source page structure (and no source tags). Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 4 ++++ arch/arm64/include/asm/page.h | 14 +++++++++++--- arch/arm64/lib/mte.S | 19 +++++++++++++++++++ arch/arm64/mm/copypage.c | 25 +++++++++++++++++++++---- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 1716b3d02489..b2577eee62c2 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -19,6 +19,7 @@ void mte_clear_page_tags(void *addr); #define PG_mte_tagged PG_arch_2 void mte_sync_tags(pte_t *ptep, pte_t pte); +void mte_copy_page_tags(void *kto, const void *kfrom); void flush_mte_state(void); #else @@ -29,6 +30,9 @@ void flush_mte_state(void); static inline void mte_sync_tags(pte_t *ptep, pte_t pte) { } +static inline void mte_copy_page_tags(void *kto, const void *kfrom) +{ +} static inline void flush_mte_state(void) { } diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index c01b52add377..11734ce29702 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -15,18 +15,26 @@ #include /* for READ_IMPLIES_EXEC */ #include +struct page; +struct vm_area_struct; + extern void __cpu_clear_user_page(void *p, unsigned long user); -extern void __cpu_copy_user_page(void *to, const void *from, - unsigned long user); extern void copy_page(void *to, const void *from); extern void clear_page(void *to); +void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma); +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +#define __HAVE_ARCH_COPY_HIGHPAGE + #define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE #define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr) -#define copy_user_page(to,from,vaddr,pg) __cpu_copy_user_page(to, from, vaddr) +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct page *pgtable_t; diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index a36705640086..3c3d0edbbca3 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -5,6 +5,7 @@ #include #include +#include #include .arch armv8.5-a+memtag @@ -32,3 
+33,21 @@ SYM_FUNC_START(mte_clear_page_tags) b.ne 1b ret SYM_FUNC_END(mte_clear_page_tags) + +/* + * Copy the tags from the source page to the destination one + * x0 - address of the destination page + * x1 - address of the source page + */ +SYM_FUNC_START(mte_copy_page_tags) + mov x2, x0 + mov x3, x1 + multitag_transfer_size x5, x6 +1: ldgm x4, [x3] + stgm x4, [x2] + add x2, x2, x5 + add x3, x3, x5 + tst x2, #(PAGE_SIZE - 1) + b.ne 1b + ret +SYM_FUNC_END(mte_copy_page_tags) diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 2ee7b73433a5..4a2233fa674e 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -6,18 +6,35 @@ * Copyright (C) 2012 ARM Ltd. */ +#include #include #include #include +#include +#include -void __cpu_copy_user_page(void *kto, const void *kfrom, unsigned long vaddr) +void copy_highpage(struct page *to, struct page *from) { - struct page *page = virt_to_page(kto); + struct page *kto = page_address(to); + struct page *kfrom = page_address(from); + copy_page(kto, kfrom); - flush_dcache_page(page); + + if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) { + set_bit(PG_mte_tagged, &to->flags); + mte_copy_page_tags(kto, kfrom); + } } -EXPORT_SYMBOL_GPL(__cpu_copy_user_page); +EXPORT_SYMBOL(copy_highpage); + +void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_highpage(to, from); + flush_dcache_page(to); +} +EXPORT_SYMBOL_GPL(copy_user_highpage); void __cpu_clear_user_page(void *kaddr, unsigned long vaddr) { From 738c8780fc1fa11fb996a962a54703d0450cae59 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 21 Jun 2020 11:41:27 +0100 Subject: [PATCH 013/148] arm64: Avoid unnecessary clear_user_page() indirection Since clear_user_page() calls clear_page() directly, avoid the unnecessary indirection. 
Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/page.h | 3 +-- arch/arm64/mm/copypage.c | 6 ------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index 11734ce29702..d918cb1d83a6 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -18,7 +18,6 @@ struct page; struct vm_area_struct; -extern void __cpu_clear_user_page(void *p, unsigned long user); extern void copy_page(void *to, const void *from); extern void clear_page(void *to); @@ -33,7 +32,7 @@ void copy_highpage(struct page *to, struct page *from); alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE -#define clear_user_page(addr,vaddr,pg) __cpu_clear_user_page(addr, vaddr) +#define clear_user_page(page, vaddr, pg) clear_page(page) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) typedef struct page *pgtable_t; diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c index 4a2233fa674e..70a71f38b6a9 100644 --- a/arch/arm64/mm/copypage.c +++ b/arch/arm64/mm/copypage.c @@ -35,9 +35,3 @@ void copy_user_highpage(struct page *to, struct page *from, flush_dcache_page(to); } EXPORT_SYMBOL_GPL(copy_user_highpage); - -void __cpu_clear_user_page(void *kaddr, unsigned long vaddr) -{ - clear_page(kaddr); -} -EXPORT_SYMBOL_GPL(__cpu_clear_user_page); From 4d1a8a2dc0f4cdc2d74491a60709e698f85daf55 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Nov 2019 09:53:44 +0000 Subject: [PATCH 014/148] arm64: mte: Tags-aware aware memcmp_pages() implementation When the Memory Tagging Extension is enabled, two pages are identical only if both their data and tags are identical. Make the generic memcmp_pages() a __weak function and add an arm64-specific implementation which returns non-zero if any of the two pages contain valid MTE tags (PG_mte_tagged set). There isn't much benefit in comparing the tags of two pages since these are normally used for heap allocations and likely to differ anyway. Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/kernel/mte.c | 26 ++++++++++++++++++++++++++ mm/util.c | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 5bf9bbed5a25..5f54fd140610 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -23,6 +24,31 @@ void mte_sync_tags(pte_t *ptep, pte_t pte) } } +int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = page_address(page1); + addr2 = page_address(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + + if (!system_supports_mte() || ret) + return ret; + + /* + * If the page content is identical but at least one of the pages is + * tagged, return non-zero to avoid KSM merging. If only one of the + * pages is tagged, set_pte_at() may zero or change the tags of the + * other page via mte_sync_tags(). 
+ */ + if (test_bit(PG_mte_tagged, &page1->flags) || + test_bit(PG_mte_tagged, &page2->flags)) + return addr1 != addr2; + + return ret; +} + void flush_mte_state(void) { if (!system_supports_mte()) diff --git a/mm/util.c b/mm/util.c index 5ef378a2a038..4e21fe7eae27 100644 --- a/mm/util.c +++ b/mm/util.c @@ -957,7 +957,7 @@ out: return res; } -int memcmp_pages(struct page *page1, struct page *page2) +int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; From b3fbbea4c00220f62e6f7e2514466e6ee81f7f60 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Mon, 25 Nov 2019 17:27:06 +0000 Subject: [PATCH 015/148] mm: Introduce arch_calc_vm_flag_bits() Similarly to arch_calc_vm_prot_bits(), introduce a dummy arch_calc_vm_flag_bits() invoked from calc_vm_flag_bits(). This macro can be overridden by architectures to insert specific VM_* flags derived from the mmap() MAP_* flags. Signed-off-by: Kevin Brodsky Signed-off-by: Catalin Marinas Cc: Andrew Morton --- include/linux/mman.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/mman.h b/include/linux/mman.h index 6f34c33075f9..6fa15c9b12af 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -78,13 +78,18 @@ static inline void vm_unacct_memory(long pages) } /* - * Allow architectures to handle additional protection bits + * Allow architectures to handle additional protection and flag bits. The + * overriding macros must be defined in the arch-specific asm/mman.h file. */ #ifndef arch_calc_vm_prot_bits #define arch_calc_vm_prot_bits(prot, pkey) 0 #endif +#ifndef arch_calc_vm_flag_bits +#define arch_calc_vm_flag_bits(flags) 0 +#endif + #ifndef arch_vm_get_page_prot #define arch_vm_get_page_prot(vm_flags) __pgprot(0) #endif @@ -135,7 +140,8 @@ calc_vm_flag_bits(unsigned long flags) return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | - _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ); + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | + arch_calc_vm_flag_bits(flags); } unsigned long vm_commit_limit(void); From 9f3419315f3cdc41a7318e4d50ba18a592b30c8c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Nov 2019 10:00:27 +0000 Subject: [PATCH 016/148] arm64: mte: Add PROT_MTE support to mmap() and mprotect() To enable tagging on a memory range, the user must explicitly opt in via a new PROT_MTE flag passed to mmap() or mprotect(). Since this is a new memory type in the AttrIndx field of a pte, simplify the or'ing of these bits over the protection_map[] attributes by making MT_NORMAL index 0. There are two conditions for arch_vm_get_page_prot() to return the MT_NORMAL_TAGGED memory type: (1) the user requested it via PROT_MTE, registered as VM_MTE in the vm_flags, and (2) the vma supports MTE, decided during the mmap() call (only) and registered as VM_MTE_ALLOWED. arch_calc_vm_prot_bits() is responsible for registering the user request as VM_MTE. The newly introduced arch_calc_vm_flag_bits() sets VM_MTE_ALLOWED if the mapping is MAP_ANONYMOUS. An MTE-capable filesystem (RAM-based) may be able to set VM_MTE_ALLOWED during its mmap() file ops call. In addition, update VM_DATA_DEFAULT_FLAGS to allow mprotect(PROT_MTE) on stack or brk area. The Linux mmap() syscall currently ignores unknown PROT_* flags. In the presence of MTE, an mmap(PROT_MTE) on a file which does not support MTE will not report an error and the memory will not be mapped as Normal Tagged. 
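(On an anonymous mapping, by contrast, the request is honoured, since MAP_ANONYMOUS yields VM_MTE_ALLOWED. A rough userspace sketch, not part of the patch; the PROT_MTE value is the one added by this series and may be missing from installed libc headers:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

/* From this series' arch/arm64/include/uapi/asm/mman.h; defined here in
 * case the installed headers do not carry it yet. */
#ifndef PROT_MTE
#define PROT_MTE 0x20
#endif

int main(void)
{
        /* Anonymous mappings get VM_MTE_ALLOWED, so PROT_MTE takes effect
         * and the memory is mapped as Normal Tagged. */
        unsigned char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_MTE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }

        /* Tag check faults are only reported once a tag check fault mode
         * is selected via prctl(); see the later patches in this series. */
        munmap(p, 4096);
        return 0;
}
)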
For consistency, mprotect(PROT_MTE) will not report an error either if the memory range does not support MTE. Two subsequent patches in the series will propose tightening of this behaviour. Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/memory.h | 18 +++++++----- arch/arm64/include/asm/mman.h | 46 +++++++++++++++++++++++++++--- arch/arm64/include/asm/page.h | 2 +- arch/arm64/include/asm/pgtable.h | 7 ++++- arch/arm64/include/uapi/asm/mman.h | 1 + fs/proc/task_mmu.c | 4 +++ include/linux/mm.h | 8 ++++++ 7 files changed, 73 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 1e0a78266410..e424fc3a68cb 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -126,14 +126,18 @@ /* * Memory types available. + * + * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in + * the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note + * that protection_map[] only contains MT_NORMAL attributes. */ -#define MT_DEVICE_nGnRnE 0 -#define MT_DEVICE_nGnRE 1 -#define MT_DEVICE_GRE 2 -#define MT_NORMAL_NC 3 -#define MT_NORMAL 4 -#define MT_NORMAL_WT 5 -#define MT_NORMAL_TAGGED 6 +#define MT_NORMAL 0 +#define MT_NORMAL_TAGGED 1 +#define MT_NORMAL_NC 2 +#define MT_NORMAL_WT 3 +#define MT_DEVICE_nGnRnE 4 +#define MT_DEVICE_nGnRE 5 +#define MT_DEVICE_GRE 6 /* * Memory types for Stage-2 translation diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 081ec8de9ea6..b01051be7750 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -9,16 +9,51 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, unsigned long pkey __always_unused) { - if (system_supports_bti() && (prot & PROT_BTI)) - return VM_ARM64_BTI; + unsigned long ret = 0; - return 0; + if (system_supports_bti() && (prot & PROT_BTI)) + ret |= VM_ARM64_BTI; + + if (system_supports_mte() && (prot & PROT_MTE)) + ret |= VM_MTE; + + return ret; } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) +static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) +{ + /* + * Only allow MTE on anonymous mappings as these are guaranteed to be + * backed by tags-capable memory. The vm_flags may be overridden by a + * filesystem supporting MTE (RAM-based). + */ + if (system_supports_mte() && (flags & MAP_ANONYMOUS)) + return VM_MTE_ALLOWED; + + return 0; +} +#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags) + static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) { - return (vm_flags & VM_ARM64_BTI) ? __pgprot(PTE_GP) : __pgprot(0); + pteval_t prot = 0; + + if (vm_flags & VM_ARM64_BTI) + prot |= PTE_GP; + + /* + * There are two conditions required for returning a Normal Tagged + * memory type: (1) the user requested it via PROT_MTE passed to + * mmap() or mprotect() and (2) the corresponding vma supports MTE. We + * register (1) as VM_MTE in the vma->vm_flags and (2) as + * VM_MTE_ALLOWED. Note that the latter can only be set during the + * mmap() call since mprotect() does not accept MAP_* flags. 
+ */ + if ((vm_flags & VM_MTE) && (vm_flags & VM_MTE_ALLOWED)) + prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); + + return __pgprot(prot); } #define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags) @@ -30,6 +65,9 @@ static inline bool arch_validate_prot(unsigned long prot, if (system_supports_bti()) supported |= PROT_BTI; + if (system_supports_mte()) + supported |= PROT_MTE; + return (prot & ~supported) == 0; } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index d918cb1d83a6..012cffc574e8 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -43,7 +43,7 @@ extern int pfn_valid(unsigned long); #endif /* !__ASSEMBLY__ */ -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC +#define VM_DATA_DEFAULT_FLAGS (VM_DATA_FLAGS_TSK_EXEC | VM_MTE_ALLOWED) #include diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0a205a8e91b2..057c40b6f5e0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -681,8 +681,13 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { + /* + * Normal and Normal-Tagged are two different memory types and indices + * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK. + */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | - PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP; + PTE_PROT_NONE | PTE_VALID | PTE_WRITE | PTE_GP | + PTE_ATTRINDX_MASK; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = pte_mkdirty(pte); diff --git a/arch/arm64/include/uapi/asm/mman.h b/arch/arm64/include/uapi/asm/mman.h index 6fdd71eb644f..1e6482a838e1 100644 --- a/arch/arm64/include/uapi/asm/mman.h +++ b/arch/arm64/include/uapi/asm/mman.h @@ -5,5 +5,6 @@ #include #define PROT_BTI 0x10 /* BTI guarded page */ +#define PROT_MTE 0x20 /* Normal Tagged mapping */ #endif /* ! _UAPI__ASM_MMAN_H */ diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5066b0251ed8..35172a91148e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -653,6 +653,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", +#ifdef CONFIG_ARM64_MTE + [ilog2(VM_MTE)] = "mt", + [ilog2(VM_MTE_ALLOWED)] = "", +#endif #ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", diff --git a/include/linux/mm.h b/include/linux/mm.h index ca6e6a81576b..4312c6c808e9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -340,6 +340,14 @@ extern unsigned int kobjsize(const void *objp); # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif +#if defined(CONFIG_ARM64_MTE) +# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ +#else +# define VM_MTE VM_NONE +# define VM_MTE_ALLOWED VM_NONE +#endif + #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif From c462ac288f2c97e0c1d9ff6a65955317e799f958 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 25 Nov 2019 17:27:06 +0000 Subject: [PATCH 017/148] mm: Introduce arch_validate_flags() Similarly to arch_validate_prot() called from do_mprotect_pkey(), an architecture may need to sanity-check the new vm_flags. Define a dummy function always returning true. 
In addition to do_mprotect_pkey(), also invoke it from mmap_region() prior to updating vma->vm_page_prot to allow the architecture code to veto potentially inconsistent vm_flags. Signed-off-by: Catalin Marinas Acked-by: Andrew Morton --- include/linux/mman.h | 13 +++++++++++++ mm/mmap.c | 9 +++++++++ mm/mprotect.c | 6 ++++++ 3 files changed, 28 insertions(+) diff --git a/include/linux/mman.h b/include/linux/mman.h index 6fa15c9b12af..629cefc4ecba 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -108,6 +108,19 @@ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) #define arch_validate_prot arch_validate_prot #endif +#ifndef arch_validate_flags +/* + * This is called from mmap() and mprotect() with the updated vma->vm_flags. + * + * Returns true if the VM_* flags are valid. + */ +static inline bool arch_validate_flags(unsigned long flags) +{ + return true; +} +#define arch_validate_flags arch_validate_flags +#endif + /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 diff --git a/mm/mmap.c b/mm/mmap.c index 40248d84ad5f..eed30b096667 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,6 +1812,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_anonymous(vma); } + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { diff --git a/mm/mprotect.c b/mm/mprotect.c index ce8b8a5eacbb..56c02beb6041 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -603,6 +603,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } + /* Allow architectures to sanity-check the new flags */ + if (!arch_validate_flags(newflags)) { + error = -EINVAL; + goto out; + } + error = security_file_mprotect(vma, reqprot, prot); if (error) goto out; From 0042090548740921951f31fc0c20dcdb96638cb0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 9 Aug 2019 15:48:46 +0100 Subject: [PATCH 018/148] arm64: mte: Validate the PROT_MTE request via arch_validate_flags() Make use of the newly introduced arch_validate_flags() hook to sanity-check the PROT_MTE request passed to mmap() and mprotect(). If the mapping does not support MTE, these syscalls will return -EINVAL. Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mman.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index b01051be7750..e3e28f7daf62 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -49,8 +49,10 @@ static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags) * register (1) as VM_MTE in the vma->vm_flags and (2) as * VM_MTE_ALLOWED. Note that the latter can only be set during the * mmap() call since mprotect() does not accept MAP_* flags. + * Checking for VM_MTE only is sufficient since arch_validate_flags() + * does not permit (VM_MTE & !VM_MTE_ALLOWED). 
*/ - if ((vm_flags & VM_MTE) && (vm_flags & VM_MTE_ALLOWED)) + if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); return __pgprot(prot); @@ -72,4 +74,14 @@ static inline bool arch_validate_prot(unsigned long prot, } #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr) +static inline bool arch_validate_flags(unsigned long vm_flags) +{ + if (!system_supports_mte()) + return true; + + /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */ + return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED); +} +#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) + #endif /* ! __ASM_MMAN_H__ */ From 51b0bff2f703f7ecfeb228eaa3d8f6090c18c9c1 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 29 Nov 2019 12:45:08 +0000 Subject: [PATCH 019/148] mm: Allow arm64 mmap(PROT_MTE) on RAM-based files Since arm64 memory (allocation) tags can only be stored in RAM, mapping files with PROT_MTE is not allowed by default. RAM-based files like those in a tmpfs mount or memfd_create() can support memory tagging, so update the vm_flags accordingly in shmem_mmap(). Signed-off-by: Catalin Marinas Acked-by: Andrew Morton --- mm/shmem.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 271548ca20f3..ec94f4c7851e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2267,6 +2267,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~(VM_MAYWRITE); } + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && From 1c101da8b971a36695319dce7a24711dc567a0dd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 27 Nov 2019 10:30:15 +0000 Subject: [PATCH 020/148] arm64: mte: Allow user control of the tag check mode via prctl() By default, even if PROT_MTE is set on a memory range, there is no tag check fault reporting (SIGSEGV). Introduce a set of option to the exiting prctl(PR_SET_TAGGED_ADDR_CTRL) to allow user control of the tag check fault mode: PR_MTE_TCF_NONE - no reporting (default) PR_MTE_TCF_SYNC - synchronous tag check fault reporting PR_MTE_TCF_ASYNC - asynchronous tag check fault reporting These options translate into the corresponding SCTLR_EL1.TCF0 bitfield, context-switched by the kernel. Note that the kernel accesses to the user address space (e.g. read() system call) are not checked if the user thread tag checking mode is PR_MTE_TCF_NONE or PR_MTE_TCF_ASYNC. If the tag checking mode is PR_MTE_TCF_SYNC, the kernel makes a best effort to check its user address accesses, however it cannot always guarantee it. 
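As a rough illustration of the interface (a userspace sketch, not part of the patch; the prctl() constants are those added by this series and are repeated locally in case the installed headers predate it):

#include <stdio.h>
#include <sys/prctl.h>

/* Constants added by this series (include/uapi/linux/prctl.h); repeated
 * here in case the installed headers predate it. */
#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL 55
#endif
#ifndef PR_TAGGED_ADDR_ENABLE
#define PR_TAGGED_ADDR_ENABLE   (1UL << 0)
#endif
#ifndef PR_MTE_TCF_SYNC
#define PR_MTE_TCF_SHIFT        1
#define PR_MTE_TCF_SYNC         (1UL << PR_MTE_TCF_SHIFT)
#endif

int main(void)
{
        /*
         * Enable the tagged address ABI and synchronous tag check faults
         * for the calling thread. Accesses through mismatched pointers to
         * PROT_MTE memory will then raise SIGSEGV at the faulting
         * instruction.
         */
        if (prctl(PR_SET_TAGGED_ADDR_CTRL,
                  PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC, 0, 0, 0)) {
                perror("prctl(PR_SET_TAGGED_ADDR_CTRL)");
                return 1;
        }

        return 0;
}

The sketch selects PR_MTE_TCF_SYNC; PR_MTE_TCF_ASYNC or PR_MTE_TCF_NONE would be passed in its place to choose the other reporting modes.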
Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 14 ++++++ arch/arm64/include/asm/processor.h | 3 ++ arch/arm64/kernel/mte.c | 77 ++++++++++++++++++++++++++++++ arch/arm64/kernel/process.c | 26 ++++++++-- include/uapi/linux/prctl.h | 6 +++ 5 files changed, 123 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index b2577eee62c2..df2efbc9f8f1 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -21,6 +21,9 @@ void mte_clear_page_tags(void *addr); void mte_sync_tags(pte_t *ptep, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void flush_mte_state(void); +void mte_thread_switch(struct task_struct *next); +long set_mte_ctrl(unsigned long arg); +long get_mte_ctrl(void); #else @@ -36,6 +39,17 @@ static inline void mte_copy_page_tags(void *kto, const void *kfrom) static inline void flush_mte_state(void) { } +static inline void mte_thread_switch(struct task_struct *next) +{ +} +static inline long set_mte_ctrl(unsigned long arg) +{ + return 0; +} +static inline long get_mte_ctrl(void) +{ + return 0; +} #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 240fe5e5b720..80e7f0573309 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -151,6 +151,9 @@ struct thread_struct { struct ptrauth_keys_user keys_user; struct ptrauth_keys_kernel keys_kernel; #endif +#ifdef CONFIG_ARM64_MTE + u64 sctlr_tcf0; +#endif }; static inline void arch_thread_struct_whitelist(unsigned long *offset, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 5f54fd140610..375483a1f573 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include @@ -49,6 +51,26 @@ int memcmp_pages(struct page *page1, struct page *page2) return ret; } +static void update_sctlr_el1_tcf0(u64 tcf0) +{ + /* ISB required for the kernel uaccess routines */ + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0); + isb(); +} + +static void set_sctlr_el1_tcf0(u64 tcf0) +{ + /* + * mte_thread_switch() checks current->thread.sctlr_tcf0 as an + * optimisation. Disable preemption so that it does not see + * the variable update before the SCTLR_EL1.TCF0 one. 
+ */ + preempt_disable(); + current->thread.sctlr_tcf0 = tcf0; + update_sctlr_el1_tcf0(tcf0); + preempt_enable(); +} + void flush_mte_state(void) { if (!system_supports_mte()) @@ -58,4 +80,59 @@ void flush_mte_state(void) dsb(ish); write_sysreg_s(0, SYS_TFSRE0_EL1); clear_thread_flag(TIF_MTE_ASYNC_FAULT); + /* disable tag checking */ + set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE); +} + +void mte_thread_switch(struct task_struct *next) +{ + if (!system_supports_mte()) + return; + + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) + update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); +} + +long set_mte_ctrl(unsigned long arg) +{ + u64 tcf0; + + if (!system_supports_mte()) + return 0; + + switch (arg & PR_MTE_TCF_MASK) { + case PR_MTE_TCF_NONE: + tcf0 = SCTLR_EL1_TCF0_NONE; + break; + case PR_MTE_TCF_SYNC: + tcf0 = SCTLR_EL1_TCF0_SYNC; + break; + case PR_MTE_TCF_ASYNC: + tcf0 = SCTLR_EL1_TCF0_ASYNC; + break; + default: + return -EINVAL; + } + + set_sctlr_el1_tcf0(tcf0); + + return 0; +} + +long get_mte_ctrl(void) +{ + if (!system_supports_mte()) + return 0; + + switch (current->thread.sctlr_tcf0) { + case SCTLR_EL1_TCF0_NONE: + return PR_MTE_TCF_NONE; + case SCTLR_EL1_TCF0_SYNC: + return PR_MTE_TCF_SYNC; + case SCTLR_EL1_TCF0_ASYNC: + return PR_MTE_TCF_ASYNC; + } + + return 0; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a49028efab68..bb759b88d44a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -577,6 +577,13 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, */ dsb(ish); + /* + * MTE thread switching must happen after the DSB above to ensure that + * any asynchronous tag check faults have been logged in the TFSR*_EL1 + * registers. 
+ */ + mte_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); @@ -636,9 +643,15 @@ static unsigned int tagged_addr_disabled; long set_tagged_addr_ctrl(unsigned long arg) { + unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + if (is_compat_task()) return -EINVAL; - if (arg & ~PR_TAGGED_ADDR_ENABLE) + + if (system_supports_mte()) + valid_mask |= PR_MTE_TCF_MASK; + + if (arg & ~valid_mask) return -EINVAL; /* @@ -648,6 +661,9 @@ long set_tagged_addr_ctrl(unsigned long arg) if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) return -EINVAL; + if (set_mte_ctrl(arg) != 0) + return -EINVAL; + update_thread_flag(TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); return 0; @@ -655,13 +671,17 @@ long set_tagged_addr_ctrl(unsigned long arg) long get_tagged_addr_ctrl(void) { + long ret = 0; + if (is_compat_task()) return -EINVAL; if (test_thread_flag(TIF_TAGGED_ADDR)) - return PR_TAGGED_ADDR_ENABLE; + ret = PR_TAGGED_ADDR_ENABLE; - return 0; + ret |= get_mte_ctrl(); + + return ret; } /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 07b4f8131e36..2390ab324afa 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -233,6 +233,12 @@ struct prctl_mm_map { #define PR_SET_TAGGED_ADDR_CTRL 55 #define PR_GET_TAGGED_ADDR_CTRL 56 # define PR_TAGGED_ADDR_ENABLE (1UL << 0) +/* MTE tag check fault modes */ +# define PR_MTE_TCF_SHIFT 1 +# define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) +# define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 From af5ce95282dc99d08a27a407a02c763dde1c5558 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 10 Dec 2019 11:19:15 +0000 Subject: [PATCH 021/148] arm64: mte: Allow user control of the generated random tags via prctl() The IRG, ADDG and SUBG instructions insert a random tag in the resulting address. Certain tags can be excluded via the GCR_EL1.Exclude bitmap when, for example, the user wants a certain colour for freed buffers. Since the GCR_EL1 register is not accessible at EL0, extend the prctl(PR_SET_TAGGED_ADDR_CTRL) interface to include a 16-bit field in the first argument for controlling which tags can be generated by the above instruction (an include rather than exclude mask). Note that by default all non-zero tags are excluded. This setting is per-thread. 
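A hedged sketch of how a thread might restrict the generated tags (values from this series' include/uapi/linux/prctl.h, repeated locally for illustration; not part of the patch):

#include <stdio.h>
#include <sys/prctl.h>

/* Constants from this series' include/uapi/linux/prctl.h. */
#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL 55
#endif
#ifndef PR_TAGGED_ADDR_ENABLE
#define PR_TAGGED_ADDR_ENABLE   (1UL << 0)
#endif
#ifndef PR_MTE_TCF_SYNC
#define PR_MTE_TCF_SHIFT        1
#define PR_MTE_TCF_SYNC         (1UL << PR_MTE_TCF_SHIFT)
#endif
#ifndef PR_MTE_TAG_SHIFT
#define PR_MTE_TAG_SHIFT        3
#endif

int main(void)
{
        /*
         * Include mask: one bit per tag value, bit n allowing IRG/ADDG/SUBG
         * to generate tag n. 0x00ff permits tags 0-7 only; with the default
         * mask of 0, all non-zero tags remain excluded.
         */
        unsigned long incl = 0x00ffUL << PR_MTE_TAG_SHIFT;

        if (prctl(PR_SET_TAGGED_ADDR_CTRL,
                  PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC | incl, 0, 0, 0)) {
                perror("prctl(PR_SET_TAGGED_ADDR_CTRL)");
                return 1;
        }

        return 0;
}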
Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/sysreg.h | 7 ++++++ arch/arm64/kernel/mte.c | 35 +++++++++++++++++++++++++++--- arch/arm64/kernel/process.c | 2 +- include/uapi/linux/prctl.h | 3 +++ 5 files changed, 44 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 80e7f0573309..e1b1c2a6086e 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -153,6 +153,7 @@ struct thread_struct { #endif #ifdef CONFIG_ARM64_MTE u64 sctlr_tcf0; + u64 gcr_user_incl; #endif }; diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index daf030a05de0..52eefe2f7d95 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1078,6 +1078,13 @@ write_sysreg(__scs_new, sysreg); \ } while (0) +#define sysreg_clear_set_s(sysreg, clear, set) do { \ + u64 __scs_val = read_sysreg_s(sysreg); \ + u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \ + if (__scs_new != __scs_val) \ + write_sysreg_s(__scs_new, sysreg); \ +} while (0) + #endif #endif /* __ASM_SYSREG_H */ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 375483a1f573..07798b8d5039 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -71,6 +71,25 @@ static void set_sctlr_el1_tcf0(u64 tcf0) preempt_enable(); } +static void update_gcr_el1_excl(u64 incl) +{ + u64 excl = ~incl & SYS_GCR_EL1_EXCL_MASK; + + /* + * Note that 'incl' is an include mask (controlled by the user via + * prctl()) while GCR_EL1 accepts an exclude mask. + * No need for ISB since this only affects EL0 currently, implicit + * with ERET. + */ + sysreg_clear_set_s(SYS_GCR_EL1, SYS_GCR_EL1_EXCL_MASK, excl); +} + +static void set_gcr_el1_excl(u64 incl) +{ + current->thread.gcr_user_incl = incl; + update_gcr_el1_excl(incl); +} + void flush_mte_state(void) { if (!system_supports_mte()) @@ -82,6 +101,8 @@ void flush_mte_state(void) clear_thread_flag(TIF_MTE_ASYNC_FAULT); /* disable tag checking */ set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE); + /* reset tag generation mask */ + set_gcr_el1_excl(0); } void mte_thread_switch(struct task_struct *next) @@ -92,6 +113,7 @@ void mte_thread_switch(struct task_struct *next) /* avoid expensive SCTLR_EL1 accesses if no change */ if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); + update_gcr_el1_excl(next->thread.gcr_user_incl); } long set_mte_ctrl(unsigned long arg) @@ -116,23 +138,30 @@ long set_mte_ctrl(unsigned long arg) } set_sctlr_el1_tcf0(tcf0); + set_gcr_el1_excl((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT); return 0; } long get_mte_ctrl(void) { + unsigned long ret; + if (!system_supports_mte()) return 0; + ret = current->thread.gcr_user_incl << PR_MTE_TAG_SHIFT; + switch (current->thread.sctlr_tcf0) { case SCTLR_EL1_TCF0_NONE: return PR_MTE_TCF_NONE; case SCTLR_EL1_TCF0_SYNC: - return PR_MTE_TCF_SYNC; + ret |= PR_MTE_TCF_SYNC; + break; case SCTLR_EL1_TCF0_ASYNC: - return PR_MTE_TCF_ASYNC; + ret |= PR_MTE_TCF_ASYNC; + break; } - return 0; + return ret; } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index bb759b88d44a..c80383f30d6a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -649,7 +649,7 @@ long set_tagged_addr_ctrl(unsigned long arg) return -EINVAL; if (system_supports_mte()) - valid_mask |= PR_MTE_TCF_MASK; + valid_mask |= PR_MTE_TCF_MASK | PR_MTE_TAG_MASK; if (arg & 
~valid_mask) return -EINVAL; diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 2390ab324afa..7f0827705c9a 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -239,6 +239,9 @@ struct prctl_mm_map { # define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) # define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) # define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) +/* MTE tag inclusion mask */ +# define PR_MTE_TAG_SHIFT 3 +# define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) /* Control reclaim behavior when allocating memory */ #define PR_SET_IO_FLUSHER 57 From 39d08e8318c49c5fd2bda9cadfd2bc54d2d3dfd8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 17 Apr 2020 18:29:35 +0100 Subject: [PATCH 022/148] arm64: mte: Restore the GCR_EL1 register after a suspend The CPU resume/suspend routines only take care of the common system registers. Restore GCR_EL1 in addition via the __cpu_suspend_exit() function. Signed-off-by: Catalin Marinas Cc: Will Deacon Reviewed-by: Lorenzo Pieralisi --- arch/arm64/include/asm/mte.h | 4 ++++ arch/arm64/kernel/mte.c | 8 ++++++++ arch/arm64/kernel/suspend.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index df2efbc9f8f1..c93047eff9fe 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -22,6 +22,7 @@ void mte_sync_tags(pte_t *ptep, pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void flush_mte_state(void); void mte_thread_switch(struct task_struct *next); +void mte_suspend_exit(void); long set_mte_ctrl(unsigned long arg); long get_mte_ctrl(void); @@ -42,6 +43,9 @@ static inline void flush_mte_state(void) static inline void mte_thread_switch(struct task_struct *next) { } +static inline void mte_suspend_exit(void) +{ +} static inline long set_mte_ctrl(unsigned long arg) { return 0; diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 07798b8d5039..09cf76fc1090 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -116,6 +116,14 @@ void mte_thread_switch(struct task_struct *next) update_gcr_el1_excl(next->thread.gcr_user_incl); } +void mte_suspend_exit(void) +{ + if (!system_supports_mte()) + return; + + update_gcr_el1_excl(current->thread.gcr_user_incl); +} + long set_mte_ctrl(unsigned long arg) { u64 tcf0; diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index c1dee9066ff9..62c239cd60c2 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,9 @@ void notrace __cpu_suspend_exit(void) */ if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) arm64_set_ssbd_mitigation(false); + + /* Restore additional MTE-specific configuration */ + mte_suspend_exit(); } /* From 93f067f6caf5941cc730e99ce72042304e0e6ff5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 3 Jul 2020 14:25:50 +0100 Subject: [PATCH 023/148] arm64: mte: Allow {set,get}_tagged_addr_ctrl() on non-current tasks In preparation for ptrace() access to the prctl() value, allow calling these functions on non-current tasks. 
Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 8 ++++---- arch/arm64/include/asm/processor.h | 8 ++++---- arch/arm64/kernel/mte.c | 18 ++++++++++++------ arch/arm64/kernel/process.c | 18 ++++++++++-------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index c93047eff9fe..1a919905295b 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -23,8 +23,8 @@ void mte_copy_page_tags(void *kto, const void *kfrom); void flush_mte_state(void); void mte_thread_switch(struct task_struct *next); void mte_suspend_exit(void); -long set_mte_ctrl(unsigned long arg); -long get_mte_ctrl(void); +long set_mte_ctrl(struct task_struct *task, unsigned long arg); +long get_mte_ctrl(struct task_struct *task); #else @@ -46,11 +46,11 @@ static inline void mte_thread_switch(struct task_struct *next) static inline void mte_suspend_exit(void) { } -static inline long set_mte_ctrl(unsigned long arg) +static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg) { return 0; } -static inline long get_mte_ctrl(void) +static inline long get_mte_ctrl(struct task_struct *task) { return 0; } diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index e1b1c2a6086e..fec204d28fce 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -319,10 +319,10 @@ extern void __init minsigstksz_setup(void); #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI /* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl */ -long set_tagged_addr_ctrl(unsigned long arg); -long get_tagged_addr_ctrl(void); -#define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(arg) -#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl() +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg); +long get_tagged_addr_ctrl(struct task_struct *task); +#define SET_TAGGED_ADDR_CTRL(arg) set_tagged_addr_ctrl(current, arg) +#define GET_TAGGED_ADDR_CTRL() get_tagged_addr_ctrl(current) #endif /* diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 09cf76fc1090..e80c49af74af 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -124,9 +124,10 @@ void mte_suspend_exit(void) update_gcr_el1_excl(current->thread.gcr_user_incl); } -long set_mte_ctrl(unsigned long arg) +long set_mte_ctrl(struct task_struct *task, unsigned long arg) { u64 tcf0; + u64 gcr_incl = (arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT; if (!system_supports_mte()) return 0; @@ -145,22 +146,27 @@ long set_mte_ctrl(unsigned long arg) return -EINVAL; } - set_sctlr_el1_tcf0(tcf0); - set_gcr_el1_excl((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT); + if (task != current) { + task->thread.sctlr_tcf0 = tcf0; + task->thread.gcr_user_incl = gcr_incl; + } else { + set_sctlr_el1_tcf0(tcf0); + set_gcr_el1_excl(gcr_incl); + } return 0; } -long get_mte_ctrl(void) +long get_mte_ctrl(struct task_struct *task) { unsigned long ret; if (!system_supports_mte()) return 0; - ret = current->thread.gcr_user_incl << PR_MTE_TAG_SHIFT; + ret = task->thread.gcr_user_incl << PR_MTE_TAG_SHIFT; - switch (current->thread.sctlr_tcf0) { + switch (task->thread.sctlr_tcf0) { case SCTLR_EL1_TCF0_NONE: return PR_MTE_TCF_NONE; case SCTLR_EL1_TCF0_SYNC: diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c80383f30d6a..05a9cdd0b471 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -641,11 +641,12 @@ void arch_setup_new_exec(void) */ static unsigned int tagged_addr_disabled; -long 
set_tagged_addr_ctrl(unsigned long arg) +long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) { unsigned long valid_mask = PR_TAGGED_ADDR_ENABLE; + struct thread_info *ti = task_thread_info(task); - if (is_compat_task()) + if (is_compat_thread(ti)) return -EINVAL; if (system_supports_mte()) @@ -661,25 +662,26 @@ long set_tagged_addr_ctrl(unsigned long arg) if (arg & PR_TAGGED_ADDR_ENABLE && tagged_addr_disabled) return -EINVAL; - if (set_mte_ctrl(arg) != 0) + if (set_mte_ctrl(task, arg) != 0) return -EINVAL; - update_thread_flag(TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); + update_ti_thread_flag(ti, TIF_TAGGED_ADDR, arg & PR_TAGGED_ADDR_ENABLE); return 0; } -long get_tagged_addr_ctrl(void) +long get_tagged_addr_ctrl(struct task_struct *task) { long ret = 0; + struct thread_info *ti = task_thread_info(task); - if (is_compat_task()) + if (is_compat_thread(ti)) return -EINVAL; - if (test_thread_flag(TIF_TAGGED_ADDR)) + if (test_ti_thread_flag(ti, TIF_TAGGED_ADDR)) ret = PR_TAGGED_ADDR_ENABLE; - ret |= get_mte_ctrl(); + ret |= get_mte_ctrl(task); return ret; } From 18ddbaa02b7a64f4cf3e7e3d4b78b8b70481a17b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 30 Mar 2020 10:29:38 +0100 Subject: [PATCH 024/148] arm64: mte: ptrace: Add PTRACE_{PEEK,POKE}MTETAGS support Add support for bulk setting/getting of the MTE tags in a tracee's address space at 'addr' in the ptrace() syscall prototype. 'data' points to a struct iovec in the tracer's address space with iov_base representing the address of a tracer's buffer of length iov_len. The tags to be copied to/from the tracer's buffer are stored as one tag per byte. On successfully copying at least one tag, ptrace() returns 0 and updates the tracer's iov_len with the number of tags copied. In case of error, either -EIO or -EFAULT is returned, trying to follow the ptrace() man page. Note that the tag copying functions are not performance critical, therefore they lack optimisations found in typical memory copy routines. 
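A minimal sketch of the tracer side (the request numbers come from this series' uapi ptrace.h and are repeated here in case the installed headers predate it; the tracee is assumed to be already ptrace-attached and stopped; not part of the patch):

#include <stdint.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Request numbers added by this series (uapi/asm/ptrace.h). */
#ifndef PTRACE_PEEKMTETAGS
#define PTRACE_PEEKMTETAGS      33
#define PTRACE_POKEMTETAGS      34
#endif

/*
 * Read up to 'len' tags (one tag per byte of 'buf') starting at 'addr' in
 * the tracee's address space. Returns the number of tags copied, or -1 on
 * error.
 */
ssize_t peek_mte_tags(pid_t pid, void *addr, uint8_t *buf, size_t len)
{
        struct iovec iov = { .iov_base = buf, .iov_len = len };

        if (ptrace(PTRACE_PEEKMTETAGS, pid, addr, &iov))
                return -1;

        /* On success the kernel updates iov_len with the tags copied. */
        return iov.iov_len;
}

Writing tags works the same way with PTRACE_POKEMTETAGS, with the iovec describing the tracer's source buffer.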
Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Alan Hayward Cc: Luis Machado Cc: Omair Javaid --- arch/arm64/include/asm/mte.h | 17 ++++ arch/arm64/include/uapi/asm/ptrace.h | 3 + arch/arm64/kernel/mte.c | 138 +++++++++++++++++++++++++++ arch/arm64/kernel/ptrace.c | 7 ++ arch/arm64/lib/mte.S | 53 ++++++++++ 5 files changed, 218 insertions(+) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 1a919905295b..7ea0c0e526d1 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -5,6 +5,11 @@ #ifndef __ASM_MTE_H #define __ASM_MTE_H +#define MTE_GRANULE_SIZE UL(16) +#define MTE_GRANULE_MASK (~(MTE_GRANULE_SIZE - 1)) +#define MTE_TAG_SHIFT 56 +#define MTE_TAG_SIZE 4 + #ifndef __ASSEMBLY__ #include @@ -12,6 +17,10 @@ #include void mte_clear_page_tags(void *addr); +unsigned long mte_copy_tags_from_user(void *to, const void __user *from, + unsigned long n); +unsigned long mte_copy_tags_to_user(void __user *to, void *from, + unsigned long n); #ifdef CONFIG_ARM64_MTE @@ -25,6 +34,8 @@ void mte_thread_switch(struct task_struct *next); void mte_suspend_exit(void); long set_mte_ctrl(struct task_struct *task, unsigned long arg); long get_mte_ctrl(struct task_struct *task); +int mte_ptrace_copy_tags(struct task_struct *child, long request, + unsigned long addr, unsigned long data); #else @@ -54,6 +65,12 @@ static inline long get_mte_ctrl(struct task_struct *task) { return 0; } +static inline int mte_ptrace_copy_tags(struct task_struct *child, + long request, unsigned long addr, + unsigned long data) +{ + return -EIO; +} #endif diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 06413d9f2341..758ae984ff97 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -76,6 +76,9 @@ /* syscall emulation path in ptrace */ #define PTRACE_SYSEMU 31 #define PTRACE_SYSEMU_SINGLESTEP 32 +/* MTE allocation tag access */ +#define PTRACE_PEEKMTETAGS 33 +#define PTRACE_POKEMTETAGS 34 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index e80c49af74af..56e79807006c 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -4,14 +4,18 @@ */ #include +#include #include #include #include +#include #include #include +#include #include #include +#include #include void mte_sync_tags(pte_t *ptep, pte_t pte) @@ -179,3 +183,137 @@ long get_mte_ctrl(struct task_struct *task) return ret; } + +/* + * Access MTE tags in another process' address space as given in mm. Update + * the number of tags copied. Return 0 if any tags copied, error otherwise. + * Inspired by __access_remote_vm(). + */ +static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct vm_area_struct *vma; + void __user *buf = kiov->iov_base; + size_t len = kiov->iov_len; + int ret; + int write = gup_flags & FOLL_WRITE; + + if (!access_ok(buf, len)) + return -EFAULT; + + if (mmap_read_lock_killable(mm)) + return -EIO; + + while (len) { + unsigned long tags, offset; + void *maddr; + struct page *page = NULL; + + ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, + &vma, NULL); + if (ret <= 0) + break; + + /* + * Only copy tags if the page has been mapped as PROT_MTE + * (PG_mte_tagged set). Otherwise the tags are not valid and + * not accessible to user. Moreover, an mprotect(PROT_MTE) + * would cause the existing tags to be cleared if the page + * was never mapped with PROT_MTE. 
+ */ + if (!test_bit(PG_mte_tagged, &page->flags)) { + ret = -EOPNOTSUPP; + put_page(page); + break; + } + + /* limit access to the end of the page */ + offset = offset_in_page(addr); + tags = min(len, (PAGE_SIZE - offset) / MTE_GRANULE_SIZE); + + maddr = page_address(page); + if (write) { + tags = mte_copy_tags_from_user(maddr + offset, buf, tags); + set_page_dirty_lock(page); + } else { + tags = mte_copy_tags_to_user(buf, maddr + offset, tags); + } + put_page(page); + + /* error accessing the tracer's buffer */ + if (!tags) + break; + + len -= tags; + buf += tags; + addr += tags * MTE_GRANULE_SIZE; + } + mmap_read_unlock(mm); + + /* return an error if no tags copied */ + kiov->iov_len = buf - kiov->iov_base; + if (!kiov->iov_len) { + /* check for error accessing the tracee's address space */ + if (ret <= 0) + return -EIO; + else + return -EFAULT; + } + + return 0; +} + +/* + * Copy MTE tags in another process' address space at 'addr' to/from tracer's + * iovec buffer. Return 0 on success. Inspired by ptrace_access_vm(). + */ +static int access_remote_tags(struct task_struct *tsk, unsigned long addr, + struct iovec *kiov, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return -EPERM; + + if (!tsk->ptrace || (current != tsk->parent) || + ((get_dumpable(mm) != SUID_DUMP_USER) && + !ptracer_capable(tsk, mm->user_ns))) { + mmput(mm); + return -EPERM; + } + + ret = __access_remote_tags(mm, addr, kiov, gup_flags); + mmput(mm); + + return ret; +} + +int mte_ptrace_copy_tags(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + struct iovec kiov; + struct iovec __user *uiov = (void __user *)data; + unsigned int gup_flags = FOLL_FORCE; + + if (!system_supports_mte()) + return -EIO; + + if (get_user(kiov.iov_base, &uiov->iov_base) || + get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + if (request == PTRACE_POKEMTETAGS) + gup_flags |= FOLL_WRITE; + + /* align addr to the MTE tag granule */ + addr &= MTE_GRANULE_MASK; + + ret = access_remote_tags(child, addr, &kiov, gup_flags); + if (!ret) + ret = put_user(kiov.iov_len, &uiov->iov_len); + + return ret; +} diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 8942de814b72..101040a37d40 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1691,6 +1692,12 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { + switch (request) { + case PTRACE_PEEKMTETAGS: + case PTRACE_POKEMTETAGS: + return mte_ptrace_copy_tags(child, request, addr, data); + } + return ptrace_request(child, request, addr, data); } diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 3c3d0edbbca3..434f81d9a180 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -4,7 +4,9 @@ */ #include +#include #include +#include #include #include @@ -51,3 +53,54 @@ SYM_FUNC_START(mte_copy_page_tags) b.ne 1b ret SYM_FUNC_END(mte_copy_page_tags) + +/* + * Read tags from a user buffer (one tag per byte) and set the corresponding + * tags at the given kernel address. Used by PTRACE_POKEMTETAGS. 
+ * x0 - kernel address (to) + * x1 - user buffer (from) + * x2 - number of tags/bytes (n) + * Returns: + * x0 - number of tags read/set + */ +SYM_FUNC_START(mte_copy_tags_from_user) + mov x3, x1 + cbz x2, 2f +1: + uao_user_alternative 2f, ldrb, ldtrb, w4, x1, 0 + lsl x4, x4, #MTE_TAG_SHIFT + stg x4, [x0], #MTE_GRANULE_SIZE + add x1, x1, #1 + subs x2, x2, #1 + b.ne 1b + + // exception handling and function return +2: sub x0, x1, x3 // update the number of tags set + ret +SYM_FUNC_END(mte_copy_tags_from_user) + +/* + * Get the tags from a kernel address range and write the tag values to the + * given user buffer (one tag per byte). Used by PTRACE_PEEKMTETAGS. + * x0 - user buffer (to) + * x1 - kernel address (from) + * x2 - number of tags/bytes (n) + * Returns: + * x0 - number of tags read/set + */ +SYM_FUNC_START(mte_copy_tags_to_user) + mov x3, x0 + cbz x2, 2f +1: + ldg x4, [x1] + ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE + uao_user_alternative 2f, strb, sttrb, w4, x0, 0 + add x0, x0, #1 + add x1, x1, #MTE_GRANULE_SIZE + subs x2, x2, #1 + b.ne 1b + + // exception handling and function return +2: sub x0, x0, x3 // update the number of tags copied + ret +SYM_FUNC_END(mte_copy_tags_to_user) From 2200aa7154cb7ef76bac93e98326883ba64bfa2e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 3 Jul 2020 15:12:57 +0100 Subject: [PATCH 025/148] arm64: mte: ptrace: Add NT_ARM_TAGGED_ADDR_CTRL regset This regset allows read/write access to a ptraced process prctl(PR_SET_TAGGED_ADDR_CTRL) setting. Signed-off-by: Catalin Marinas Cc: Will Deacon Cc: Alan Hayward Cc: Luis Machado Cc: Omair Javaid --- arch/arm64/kernel/ptrace.c | 42 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 43 insertions(+) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 101040a37d40..f49b349e16a3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1033,6 +1033,35 @@ static int pac_generic_keys_set(struct task_struct *target, #endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI +static int tagged_addr_ctrl_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long ctrl = get_tagged_addr_ctrl(target); + + if (IS_ERR_VALUE(ctrl)) + return ctrl; + + return membuf_write(&to, &ctrl, sizeof(ctrl)); +} + +static int tagged_addr_ctrl_set(struct task_struct *target, const struct + user_regset *regset, unsigned int pos, + unsigned int count, const void *kbuf, const + void __user *ubuf) +{ + int ret; + long ctrl; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); + if (ret) + return ret; + + return set_tagged_addr_ctrl(target, ctrl); +} +#endif + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -1052,6 +1081,9 @@ enum aarch64_regset { REGSET_PACG_KEYS, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + REGSET_TAGGED_ADDR_CTRL, +#endif }; static const struct user_regset aarch64_regsets[] = { @@ -1149,6 +1181,16 @@ static const struct user_regset aarch64_regsets[] = { }, #endif #endif +#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI + [REGSET_TAGGED_ADDR_CTRL] = { + .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = tagged_addr_ctrl_get, + .set = tagged_addr_ctrl_set, + }, +#endif }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 22220945a5fd..30f68b42eeb5 100644 --- a/include/uapi/linux/elf.h +++ 
b/include/uapi/linux/elf.h @@ -425,6 +425,7 @@ typedef struct elf64_shdr { #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ #define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ #define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ From d563d678aa0be06e7bff2953c986f5ff0355f79c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 1 Jul 2020 17:46:06 +0100 Subject: [PATCH 026/148] fs: Handle intra-page faults in copy_mount_options() The copy_mount_options() function takes a user pointer argument but no size and it tries to read up to a PAGE_SIZE. However, copy_from_user() is not guaranteed to return all the accessible bytes if, for example, the access crosses a page boundary and gets a fault on the second page. To work around this, the current copy_mount_options() implementation performs two copy_from_user() passes, first to the end of the current page and the second to what's left in the subsequent page. On arm64 with MTE enabled, access to a user page may trigger a fault after part of the buffer in a page has been copied (when the user pointer tag, bits 56-59, no longer matches the allocation tag stored in memory). Allow copy_mount_options() to handle such intra-page faults by resorting to byte at a time copy in case of copy_from_user() failure. Note that copy_from_user() handles the zeroing of the kernel buffer in case of error. Signed-off-by: Catalin Marinas Cc: Alexander Viro --- fs/namespace.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index bae0e95b3713..32a0b9146757 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3075,7 +3075,7 @@ static void shrink_submounts(struct mount *mnt) void *copy_mount_options(const void __user * data) { char *copy; - unsigned size; + unsigned left, offset; if (!data) return NULL; @@ -3084,16 +3084,27 @@ void *copy_mount_options(const void __user * data) if (!copy) return ERR_PTR(-ENOMEM); - size = PAGE_SIZE - offset_in_page(data); + left = copy_from_user(copy, data, PAGE_SIZE); - if (copy_from_user(copy, data, size)) { + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. + */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } - if (size != PAGE_SIZE) { - if (copy_from_user(copy + size, data + size, PAGE_SIZE - size)) - memset(copy + size, 0, PAGE_SIZE - size); - } + return copy; } From 8a84802e2a2b1a682761a31c2685506b9f4e1840 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 13 May 2020 16:37:49 +0100 Subject: [PATCH 027/148] mm: Add arch hooks for saving/restoring tags Arm's Memory Tagging Extension (MTE) adds some metadata (tags) to every physical page, when swapping pages out to disk it is necessary to save these tags, and later restore them when reading the pages back. Add some hooks along with dummy implementations to enable the arch code to handle this. Three new hooks are added to the swap code: * arch_prepare_to_swap() and * arch_swap_invalidate_page() / arch_swap_invalidate_area(). 
One new hook is added to shmem: * arch_swap_restore() Signed-off-by: Steven Price [catalin.marinas@arm.com: add unlock_page() on the error path] [catalin.marinas@arm.com: dropped the _tags suffix] Signed-off-by: Catalin Marinas Acked-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++ mm/page_io.c | 10 ++++++++++ mm/shmem.c | 6 ++++++ mm/swapfile.c | 2 ++ 4 files changed, 46 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e8cbc2e795d5..dc3b74129fbc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -633,6 +633,34 @@ static inline int arch_unmap_one(struct mm_struct *mm, } #endif +/* + * Allow architectures to preserve additional metadata associated with + * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function + * prototypes must be defined in the arch-specific asm/pgtable.h file. + */ +#ifndef __HAVE_ARCH_PREPARE_TO_SWAP +static inline int arch_prepare_to_swap(struct page *page) +{ + return 0; +} +#endif + +#ifndef __HAVE_ARCH_SWAP_INVALIDATE +static inline void arch_swap_invalidate_page(int type, pgoff_t offset) +{ +} + +static inline void arch_swap_invalidate_area(int type) +{ +} +#endif + +#ifndef __HAVE_ARCH_SWAP_RESTORE +static inline void arch_swap_restore(swp_entry_t entry, struct page *page) +{ +} +#endif + #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif diff --git a/mm/page_io.c b/mm/page_io.c index e485a6e8a6cd..4ca28aad0d94 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -252,6 +252,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + /* + * Arch code may have to preserve more data than just the page + * contents, e.g. memory tags. + */ + ret = arch_prepare_to_swap(page); + if (ret) { + set_page_dirty(page); + unlock_page(page); + goto out; + } if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); diff --git a/mm/shmem.c b/mm/shmem.c index ec94f4c7851e..e57d3314dc4b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1734,6 +1734,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, } wait_on_page_writeback(page); + /* + * Some architectures may have to restore extra metadata to the + * physical page after reading from swap. + */ + arch_swap_restore(swap, page); + if (shmem_should_replace_page(page, gfp)) { error = shmem_replace_page(&page, gfp, info, index); if (error) diff --git a/mm/swapfile.c b/mm/swapfile.c index 12f59e641b5e..4b1d1a04e327 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -717,6 +717,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, else swap_slot_free_notify = NULL; while (offset <= end) { + arch_swap_invalidate_page(si->type, offset); frontswap_invalidate_page(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); @@ -2682,6 +2683,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); + arch_swap_invalidate_area(p->type); frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); From 36943aba91860269abfba2e736e9534d48e90cae Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 13 May 2020 16:37:50 +0100 Subject: [PATCH 028/148] arm64: mte: Enable swap of tagged pages When swapping pages out to disk it is necessary to save any tags that have been set, and restore when swapping back in. 
Make use of the new page flag (PG_ARCH_2, locally named PG_mte_tagged) to identify pages with tags. When swapping out these pages the tags are stored in memory and later restored when the pages are brought back in. Because shmem can swap pages back in without restoring the userspace PTE it is also necessary to add a hook for shmem. Signed-off-by: Steven Price [catalin.marinas@arm.com: move function prototypes to mte.h] [catalin.marinas@arm.com: drop '_tags' from arch_swap_restore_tags()] Signed-off-by: Catalin Marinas Cc: Andrew Morton Cc: Will Deacon --- arch/arm64/include/asm/mte.h | 8 +++ arch/arm64/include/asm/pgtable.h | 32 ++++++++++++ arch/arm64/kernel/mte.c | 19 +++++++- arch/arm64/lib/mte.S | 45 +++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/mteswap.c | 83 ++++++++++++++++++++++++++++++++ 6 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/mm/mteswap.c diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 7ea0c0e526d1..1c99fcadb58c 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -21,6 +21,14 @@ unsigned long mte_copy_tags_from_user(void *to, const void __user *from, unsigned long n); unsigned long mte_copy_tags_to_user(void __user *to, void *from, unsigned long n); +int mte_save_tags(struct page *page); +void mte_save_page_tags(const void *page_addr, void *tag_storage); +bool mte_restore_tags(swp_entry_t entry, struct page *page); +void mte_restore_page_tags(void *page_addr, const void *tag_storage); +void mte_invalidate_tags(int type, pgoff_t offset); +void mte_invalidate_tags_area(int type); +void *mte_allocate_tag_storage(void); +void mte_free_tag_storage(char *storage); #ifdef CONFIG_ARM64_MTE diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 057c40b6f5e0..1c46fcd873f6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -867,6 +867,38 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, extern int kern_addr_valid(unsigned long addr); +#ifdef CONFIG_ARM64_MTE + +#define __HAVE_ARCH_PREPARE_TO_SWAP +static inline int arch_prepare_to_swap(struct page *page) +{ + if (system_supports_mte()) + return mte_save_tags(page); + return 0; +} + +#define __HAVE_ARCH_SWAP_INVALIDATE +static inline void arch_swap_invalidate_page(int type, pgoff_t offset) +{ + if (system_supports_mte()) + mte_invalidate_tags(type, offset); +} + +static inline void arch_swap_invalidate_area(int type) +{ + if (system_supports_mte()) + mte_invalidate_tags_area(type); +} + +#define __HAVE_ARCH_SWAP_RESTORE +static inline void arch_swap_restore(swp_entry_t entry, struct page *page) +{ + if (system_supports_mte() && mte_restore_tags(entry, page)) + set_bit(PG_mte_tagged, &page->flags); +} + +#endif /* CONFIG_ARM64_MTE */ + /* * On AArch64, the cache coherency is handled via the set_pte_at() function. 
*/ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 56e79807006c..52a0638ed967 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include @@ -18,15 +20,30 @@ #include #include +static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap) +{ + pte_t old_pte = READ_ONCE(*ptep); + + if (check_swap && is_swap_pte(old_pte)) { + swp_entry_t entry = pte_to_swp_entry(old_pte); + + if (!non_swap_entry(entry) && mte_restore_tags(entry, page)) + return; + } + + mte_clear_page_tags(page_address(page)); +} + void mte_sync_tags(pte_t *ptep, pte_t pte) { struct page *page = pte_page(pte); long i, nr_pages = compound_nr(page); + bool check_swap = nr_pages == 1; /* if PG_mte_tagged is set, tags have already been initialised */ for (i = 0; i < nr_pages; i++, page++) { if (!test_and_set_bit(PG_mte_tagged, &page->flags)) - mte_clear_page_tags(page_address(page)); + mte_sync_page_tags(page, ptep, check_swap); } } diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 434f81d9a180..03ca6d8b8670 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -104,3 +104,48 @@ SYM_FUNC_START(mte_copy_tags_to_user) 2: sub x0, x0, x3 // update the number of tags copied ret SYM_FUNC_END(mte_copy_tags_to_user) + +/* + * Save the tags in a page + * x0 - page address + * x1 - tag storage + */ +SYM_FUNC_START(mte_save_page_tags) + multitag_transfer_size x7, x5 +1: + mov x2, #0 +2: + ldgm x5, [x0] + orr x2, x2, x5 + add x0, x0, x7 + tst x0, #0xFF // 16 tag values fit in a register, + b.ne 2b // which is 16*16=256 bytes + + str x2, [x1], #8 + + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + + ret +SYM_FUNC_END(mte_save_page_tags) + +/* + * Restore the tags in a page + * x0 - page address + * x1 - tag storage + */ +SYM_FUNC_START(mte_restore_page_tags) + multitag_transfer_size x7, x5 +1: + ldr x2, [x1], #8 +2: + stgm x2, [x0] + add x0, x0, x7 + tst x0, #0xFF + b.ne 2b + + tst x0, #(PAGE_SIZE - 1) + b.ne 1b + + ret +SYM_FUNC_END(mte_restore_page_tags) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index d91030f0ffee..5bcc9e0aa259 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_PTDUMP_CORE) += dump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o +obj-$(CONFIG_ARM64_MTE) += mteswap.o KASAN_SANITIZE_physaddr.o += n obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c new file mode 100644 index 000000000000..c52c1847079c --- /dev/null +++ b/arch/arm64/mm/mteswap.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include + +static DEFINE_XARRAY(mte_pages); + +void *mte_allocate_tag_storage(void) +{ + /* tags granule is 16 bytes, 2 tags stored per byte */ + return kmalloc(PAGE_SIZE / 16 / 2, GFP_KERNEL); +} + +void mte_free_tag_storage(char *storage) +{ + kfree(storage); +} + +int mte_save_tags(struct page *page) +{ + void *tag_storage, *ret; + + if (!test_bit(PG_mte_tagged, &page->flags)) + return 0; + + tag_storage = mte_allocate_tag_storage(); + if (!tag_storage) + return -ENOMEM; + + mte_save_page_tags(page_address(page), tag_storage); + + /* page_private contains the swap entry.val set in do_swap_page */ + ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL); + if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { + 
mte_free_tag_storage(tag_storage); + return xa_err(ret); + } else if (ret) { + /* Entry is being replaced, free the old entry */ + mte_free_tag_storage(ret); + } + + return 0; +} + +bool mte_restore_tags(swp_entry_t entry, struct page *page) +{ + void *tags = xa_load(&mte_pages, entry.val); + + if (!tags) + return false; + + mte_restore_page_tags(page_address(page), tags); + + return true; +} + +void mte_invalidate_tags(int type, pgoff_t offset) +{ + swp_entry_t entry = swp_entry(type, offset); + void *tags = xa_erase(&mte_pages, entry.val); + + mte_free_tag_storage(tags); +} + +void mte_invalidate_tags_area(int type) +{ + swp_entry_t entry = swp_entry(type, 0); + swp_entry_t last_entry = swp_entry(type + 1, 0); + void *tags; + + XA_STATE(xa_state, &mte_pages, entry.val); + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, last_entry.val - 1) { + __xa_erase(&mte_pages, xa_state.xa_index); + mte_free_tag_storage(tags); + } + xa_unlock(&mte_pages); +} From ee11f332af96a3b5d29586bc6d69b41531f62648 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Wed, 13 May 2020 16:37:51 +0100 Subject: [PATCH 029/148] arm64: mte: Save tags when hibernating When hibernating the contents of all pages in the system are written to disk, however the MTE tags are not visible to the generic hibernation code. So just before the hibernation image is created copy the tags out of the physical tag storage into standard memory so they will be included in the hibernation image. After hibernation apply the tags back into the physical tag storage. Signed-off-by: Steven Price Signed-off-by: Catalin Marinas Cc: James Morse Cc: Will Deacon --- arch/arm64/kernel/hibernate.c | 118 ++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 68e14152d6e9..23467092e24d 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -285,6 +286,117 @@ static int create_safe_exec_page(void *src_start, size_t length, #define dcache_clean_range(start, end) __flush_dcache_area(start, (end - start)) +#ifdef CONFIG_ARM64_MTE + +static DEFINE_XARRAY(mte_pages); + +static int save_tags(struct page *page, unsigned long pfn) +{ + void *tag_storage, *ret; + + tag_storage = mte_allocate_tag_storage(); + if (!tag_storage) + return -ENOMEM; + + mte_save_page_tags(page_address(page), tag_storage); + + ret = xa_store(&mte_pages, pfn, tag_storage, GFP_KERNEL); + if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { + mte_free_tag_storage(tag_storage); + return xa_err(ret); + } else if (WARN(ret, "swsusp: %s: Duplicate entry", __func__)) { + mte_free_tag_storage(ret); + } + + return 0; +} + +static void swsusp_mte_free_storage(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + mte_free_tag_storage(tags); + } + xa_unlock(&mte_pages); + + xa_destroy(&mte_pages); +} + +static int swsusp_mte_save_tags(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + int ret = 0; + int n = 0; + + if (!system_supports_mte()) + return 0; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + struct page *page = pfn_to_online_page(pfn); + + if (!page) + continue; + + if (!test_bit(PG_mte_tagged, &page->flags)) + continue; + + ret = save_tags(page, pfn); + if (ret) { + swsusp_mte_free_storage(); + goto 
out; + } + + n++; + } + } + pr_info("Saved %d MTE pages\n", n); + +out: + return ret; +} + +static void swsusp_mte_restore_tags(void) +{ + XA_STATE(xa_state, &mte_pages, 0); + int n = 0; + void *tags; + + xa_lock(&mte_pages); + xas_for_each(&xa_state, tags, ULONG_MAX) { + unsigned long pfn = xa_state.xa_index; + struct page *page = pfn_to_online_page(pfn); + + mte_restore_page_tags(page_address(page), tags); + + mte_free_tag_storage(tags); + n++; + } + xa_unlock(&mte_pages); + + pr_info("Restored %d MTE pages\n", n); + + xa_destroy(&mte_pages); +} + +#else /* CONFIG_ARM64_MTE */ + +static int swsusp_mte_save_tags(void) +{ + return 0; +} + +static void swsusp_mte_restore_tags(void) +{ +} + +#endif /* CONFIG_ARM64_MTE */ + int swsusp_arch_suspend(void) { int ret = 0; @@ -302,6 +414,10 @@ int swsusp_arch_suspend(void) /* make the crash dump kernel image visible/saveable */ crash_prepare_suspend(); + ret = swsusp_mte_save_tags(); + if (ret) + return ret; + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { @@ -315,6 +431,8 @@ int swsusp_arch_suspend(void) dcache_clean_range(__hyp_text_start, __hyp_text_end); } + swsusp_mte_restore_tags(); + /* make the crash dump kernel image protected again */ crash_post_resume(); From 89b94df9dfb16fe29f9d2085b0a98116dc34d814 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 6 Sep 2019 11:08:29 +0100 Subject: [PATCH 030/148] arm64: mte: Kconfig entry Add Memory Tagging Extension support to the arm64 kbuild. Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Cc: Will Deacon --- arch/arm64/Kconfig | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d232837cbee..e7450fbd0aa7 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1664,6 +1664,39 @@ config ARCH_RANDOM provides a high bandwidth, cryptographically secure hardware random number generator. +config ARM64_AS_HAS_MTE + # Initial support for MTE went in binutils 2.32.0, checked with + # ".arch armv8.5-a+memtag" below. However, this was incomplete + # as a late addition to the final architecture spec (LDGM/STGM) + # is only supported in the newer 2.32.x and 2.33 binutils + # versions, hence the extra "stgm" instruction check below. + def_bool $(as-instr,.arch armv8.5-a+memtag\nstgm xzr$(comma)[x0]) + +config ARM64_MTE + bool "Memory Tagging Extension support" + default y + depends on ARM64_AS_HAS_MTE && ARM64_TAGGED_ADDR_ABI + select ARCH_USES_HIGH_VMA_FLAGS + help + Memory Tagging (part of the ARMv8.5 Extensions) provides + architectural support for run-time, always-on detection of + various classes of memory error to aid with software debugging + to eliminate vulnerabilities arising from memory-unsafe + languages. + + This option enables the support for the Memory Tagging + Extension at EL0 (i.e. for userspace). + + Selecting this option allows the feature to be detected at + runtime. Any secondary CPU not implementing this feature will + not be allowed a late bring-up. + + Userspace binaries that want to use this feature must + explicitly opt in. The mechanism for the userspace is + described in: + + Documentation/arm64/memory-tagging-extension.rst. 
+ endmenu config ARM64_SVE From df9d7a22dd21c926e8175ccc6e176cb45fc7cb09 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Fri, 6 Sep 2019 10:52:39 +0100 Subject: [PATCH 031/148] arm64: mte: Add Memory Tagging Extension documentation Memory Tagging Extension (part of the ARMv8.5 Extensions) provides a mechanism to detect the sources of memory related errors which may be vulnerable to exploitation, including bounds violations, use-after-free, use-after-return, use-out-of-scope and use before initialization errors. Add Memory Tagging Extension documentation for the arm64 linux kernel support. Signed-off-by: Vincenzo Frascino Co-developed-by: Catalin Marinas Signed-off-by: Catalin Marinas Acked-by: Szabolcs Nagy Cc: Will Deacon --- Documentation/arm64/cpu-feature-registers.rst | 2 + Documentation/arm64/elf_hwcaps.rst | 4 + Documentation/arm64/index.rst | 1 + .../arm64/memory-tagging-extension.rst | 305 ++++++++++++++++++ 4 files changed, 312 insertions(+) create mode 100644 Documentation/arm64/memory-tagging-extension.rst diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index f28853f80089..328e0c454fbd 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -175,6 +175,8 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | MTE | [11-8] | y | + +------------------------------+---------+---------+ | SSBS | [7-4] | y | +------------------------------+---------+---------+ | BT | [3-0] | y | diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 84a9fd2d41b4..bbd9cf54db6c 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -240,6 +240,10 @@ HWCAP2_BTI Functionality implied by ID_AA64PFR0_EL1.BT == 0b0001. +HWCAP2_MTE + + Functionality implied by ID_AA64PFR1_EL1.MTE == 0b0010, as described + by Documentation/arm64/memory-tagging-extension.rst. 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst index d9665d83c53a..43b0939d384e 100644 --- a/Documentation/arm64/index.rst +++ b/Documentation/arm64/index.rst @@ -14,6 +14,7 @@ ARM64 Architecture hugetlbpage legacy_instructions memory + memory-tagging-extension perf pointer-authentication silicon-errata diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst new file mode 100644 index 000000000000..e3709b536b89 --- /dev/null +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -0,0 +1,305 @@ +=============================================== +Memory Tagging Extension (MTE) in AArch64 Linux +=============================================== + +Authors: Vincenzo Frascino + Catalin Marinas + +Date: 2020-02-25 + +This document describes the provision of the Memory Tagging Extension +functionality in AArch64 Linux. + +Introduction +============ + +ARMv8.5 based processors introduce the Memory Tagging Extension (MTE) +feature. MTE is built on top of the ARMv8.0 virtual address tagging TBI +(Top Byte Ignore) feature and allows software to access a 4-bit +allocation tag for each 16-byte granule in the physical address space. +Such memory range must be mapped with the Normal-Tagged memory +attribute. A logical tag is derived from bits 59-56 of the virtual +address used for the memory access. 
A CPU with MTE enabled will compare +the logical tag against the allocation tag and potentially raise an +exception on mismatch, subject to system registers configuration. + +Userspace Support +================= + +When ``CONFIG_ARM64_MTE`` is selected and Memory Tagging Extension is +supported by the hardware, the kernel advertises the feature to +userspace via ``HWCAP2_MTE``. + +PROT_MTE +-------- + +To access the allocation tags, a user process must enable the Tagged +memory attribute on an address range using a new ``prot`` flag for +``mmap()`` and ``mprotect()``: + +``PROT_MTE`` - Pages allow access to the MTE allocation tags. + +The allocation tag is set to 0 when such pages are first mapped in the +user address space and preserved on copy-on-write. ``MAP_SHARED`` is +supported and the allocation tags can be shared between processes. + +**Note**: ``PROT_MTE`` is only supported on ``MAP_ANONYMOUS`` and +RAM-based file mappings (``tmpfs``, ``memfd``). Passing it to other +types of mapping will result in ``-EINVAL`` returned by these system +calls. + +**Note**: The ``PROT_MTE`` flag (and corresponding memory type) cannot +be cleared by ``mprotect()``. + +**Note**: ``madvise()`` memory ranges with ``MADV_DONTNEED`` and +``MADV_FREE`` may have the allocation tags cleared (set to 0) at any +point after the system call. + +Tag Check Faults +---------------- + +When ``PROT_MTE`` is enabled on an address range and a mismatch between +the logical and allocation tags occurs on access, there are three +configurable behaviours: + +- *Ignore* - This is the default mode. The CPU (and kernel) ignores the + tag check fault. + +- *Synchronous* - The kernel raises a ``SIGSEGV`` synchronously, with + ``.si_code = SEGV_MTESERR`` and ``.si_addr = ``. The + memory access is not performed. If ``SIGSEGV`` is ignored or blocked + by the offending thread, the containing process is terminated with a + ``coredump``. + +- *Asynchronous* - The kernel raises a ``SIGSEGV``, in the offending + thread, asynchronously following one or multiple tag check faults, + with ``.si_code = SEGV_MTEAERR`` and ``.si_addr = 0`` (the faulting + address is unknown). + +The user can select the above modes, per thread, using the +``prctl(PR_SET_TAGGED_ADDR_CTRL, flags, 0, 0, 0)`` system call where +``flags`` contain one of the following values in the ``PR_MTE_TCF_MASK`` +bit-field: + +- ``PR_MTE_TCF_NONE`` - *Ignore* tag check faults +- ``PR_MTE_TCF_SYNC`` - *Synchronous* tag check fault mode +- ``PR_MTE_TCF_ASYNC`` - *Asynchronous* tag check fault mode + +The current tag check fault mode can be read using the +``prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0)`` system call. + +Tag checking can also be disabled for a user thread by setting the +``PSTATE.TCO`` bit with ``MSR TCO, #1``. + +**Note**: Signal handlers are always invoked with ``PSTATE.TCO = 0``, +irrespective of the interrupted context. ``PSTATE.TCO`` is restored on +``sigreturn()``. + +**Note**: There are no *match-all* logical tags available for user +applications. + +**Note**: Kernel accesses to the user address space (e.g. ``read()`` +system call) are not checked if the user thread tag checking mode is +``PR_MTE_TCF_NONE`` or ``PR_MTE_TCF_ASYNC``. If the tag checking mode is +``PR_MTE_TCF_SYNC``, the kernel makes a best effort to check its user +address accesses, however it cannot always guarantee it. 
+ +Excluding Tags in the ``IRG``, ``ADDG`` and ``SUBG`` instructions +----------------------------------------------------------------- + +The architecture allows excluding certain tags to be randomly generated +via the ``GCR_EL1.Exclude`` register bit-field. By default, Linux +excludes all tags other than 0. A user thread can enable specific tags +in the randomly generated set using the ``prctl(PR_SET_TAGGED_ADDR_CTRL, +flags, 0, 0, 0)`` system call where ``flags`` contains the tags bitmap +in the ``PR_MTE_TAG_MASK`` bit-field. + +**Note**: The hardware uses an exclude mask but the ``prctl()`` +interface provides an include mask. An include mask of ``0`` (exclusion +mask ``0xffff``) results in the CPU always generating tag ``0``. + +Initial process state +--------------------- + +On ``execve()``, the new process has the following configuration: + +- ``PR_TAGGED_ADDR_ENABLE`` set to 0 (disabled) +- Tag checking mode set to ``PR_MTE_TCF_NONE`` +- ``PR_MTE_TAG_MASK`` set to 0 (all tags excluded) +- ``PSTATE.TCO`` set to 0 +- ``PROT_MTE`` not set on any of the initial memory maps + +On ``fork()``, the new process inherits the parent's configuration and +memory map attributes with the exception of the ``madvise()`` ranges +with ``MADV_WIPEONFORK`` which will have the data and tags cleared (set +to 0). + +The ``ptrace()`` interface +-------------------------- + +``PTRACE_PEEKMTETAGS`` and ``PTRACE_POKEMTETAGS`` allow a tracer to read +the tags from or set the tags to a tracee's address space. The +``ptrace()`` system call is invoked as ``ptrace(request, pid, addr, +data)`` where: + +- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_PEEKMTETAGS``. +- ``pid`` - the tracee's PID. +- ``addr`` - address in the tracee's address space. +- ``data`` - pointer to a ``struct iovec`` where ``iov_base`` points to + a buffer of ``iov_len`` length in the tracer's address space. + +The tags in the tracer's ``iov_base`` buffer are represented as one +4-bit tag per byte and correspond to a 16-byte MTE tag granule in the +tracee's address space. + +**Note**: If ``addr`` is not aligned to a 16-byte granule, the kernel +will use the corresponding aligned address. + +``ptrace()`` return value: + +- 0 - tags were copied, the tracer's ``iov_len`` was updated to the + number of tags transferred. This may be smaller than the requested + ``iov_len`` if the requested address range in the tracee's or the + tracer's space cannot be accessed or does not have valid tags. +- ``-EPERM`` - the specified process cannot be traced. +- ``-EIO`` - the tracee's address range cannot be accessed (e.g. invalid + address) and no tags copied. ``iov_len`` not updated. +- ``-EFAULT`` - fault on accessing the tracer's memory (``struct iovec`` + or ``iov_base`` buffer) and no tags copied. ``iov_len`` not updated. +- ``-EOPNOTSUPP`` - the tracee's address does not have valid tags (never + mapped with the ``PROT_MTE`` flag). ``iov_len`` not updated. + +**Note**: There are no transient errors for the requests above, so user +programs should not retry in case of a non-zero system call return. + +``PTRACE_GETREGSET`` and ``PTRACE_SETREGSET`` with ``addr == +``NT_ARM_TAGGED_ADDR_CTRL`` allow ``ptrace()`` access to the tagged +address ABI control and MTE configuration of a process as per the +``prctl()`` options described in +Documentation/arm64/tagged-address-abi.rst and above. The corresponding +``regset`` is 1 element of 8 bytes (``sizeof(long))``). + +Example of correct usage +======================== + +*MTE Example code* + +.. 
code-block:: c + + /* + * To be compiled with -march=armv8.5-a+memtag + */ + #include + #include + #include + #include + #include + #include + #include + #include + + /* + * From arch/arm64/include/uapi/asm/hwcap.h + */ + #define HWCAP2_MTE (1 << 18) + + /* + * From arch/arm64/include/uapi/asm/mman.h + */ + #define PROT_MTE 0x20 + + /* + * From include/uapi/linux/prctl.h + */ + #define PR_SET_TAGGED_ADDR_CTRL 55 + #define PR_GET_TAGGED_ADDR_CTRL 56 + # define PR_TAGGED_ADDR_ENABLE (1UL << 0) + # define PR_MTE_TCF_SHIFT 1 + # define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TCF_MASK (3UL << PR_MTE_TCF_SHIFT) + # define PR_MTE_TAG_SHIFT 3 + # define PR_MTE_TAG_MASK (0xffffUL << PR_MTE_TAG_SHIFT) + + /* + * Insert a random logical tag into the given pointer. + */ + #define insert_random_tag(ptr) ({ \ + uint64_t __val; \ + asm("irg %0, %1" : "=r" (__val) : "r" (ptr)); \ + __val; \ + }) + + /* + * Set the allocation tag on the destination address. + */ + #define set_tag(tagged_addr) do { \ + asm volatile("stg %0, [%0]" : : "r" (tagged_addr) : "memory"); \ + } while (0) + + int main() + { + unsigned char *a; + unsigned long page_sz = sysconf(_SC_PAGESIZE); + unsigned long hwcap2 = getauxval(AT_HWCAP2); + + /* check if MTE is present */ + if (!(hwcap2 & HWCAP2_MTE)) + return EXIT_FAILURE; + + /* + * Enable the tagged address ABI, synchronous MTE tag check faults and + * allow all non-zero tags in the randomly generated set. + */ + if (prctl(PR_SET_TAGGED_ADDR_CTRL, + PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC | (0xfffe << PR_MTE_TAG_SHIFT), + 0, 0, 0)) { + perror("prctl() failed"); + return EXIT_FAILURE; + } + + a = mmap(0, page_sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (a == MAP_FAILED) { + perror("mmap() failed"); + return EXIT_FAILURE; + } + + /* + * Enable MTE on the above anonymous mmap. The flag could be passed + * directly to mmap() and skip this step. + */ + if (mprotect(a, page_sz, PROT_READ | PROT_WRITE | PROT_MTE)) { + perror("mprotect() failed"); + return EXIT_FAILURE; + } + + /* access with the default tag (0) */ + a[0] = 1; + a[1] = 2; + + printf("a[0] = %hhu a[1] = %hhu\n", a[0], a[1]); + + /* set the logical and allocation tags */ + a = (unsigned char *)insert_random_tag(a); + set_tag(a); + + printf("%p\n", a); + + /* non-zero tag access */ + a[0] = 3; + printf("a[0] = %hhu a[1] = %hhu\n", a[0], a[1]); + + /* + * If MTE is enabled correctly the next instruction will generate an + * exception. + */ + printf("Expecting SIGSEGV...\n"); + a[16] = 0xdd; + + /* this should not be printed in the PR_MTE_TCF_SYNC mode */ + printf("...haven't got one\n"); + + return EXIT_FAILURE; + } From b4c971245925db1a6e0898878ad410f8f17ec59a Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 4 Aug 2020 16:53:47 +0800 Subject: [PATCH 032/148] arm64: traps: Add str of description to panic() in die() Currently, there are different description strings in die() such as die("Oops",,), die("Oops - BUG",,). And panic() called by die() will always show "Fatal exception" or "Fatal exception in interrupt". Note that panic() will run any panic handler via panic_notifier_list. And the string above will be formatted and placed in static buf[] which will be passed to handler. So panic handler can not distinguish which Oops it is from the buf if we want to do some things like reserve the string in memory or panic statistics. It's not benefit to debug. 
We need to add more codes to troubleshoot. Let's utilize existing resource to make debug much simpler. Signed-off-by: Yue Hu Link: https://lore.kernel.org/r/20200804085347.10720-1-zbestahu@gmail.com Signed-off-by: Will Deacon --- arch/arm64/kernel/traps.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 13ebd5ca2070..af25cedf54e9 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -200,9 +200,9 @@ void die(const char *str, struct pt_regs *regs, int err) oops_exit(); if (in_interrupt()) - panic("Fatal exception in interrupt"); + panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) - panic("Fatal exception"); + panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); From ffdbd3d83553beb0fda00081293d5640cba477a2 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 11 Aug 2020 13:35:05 +0800 Subject: [PATCH 033/148] arm64: perf: Add general hardware LLC events for PMUv3 This patch is to add the general hardware last level cache (LLC) events for PMUv3: one event is for LLC access and another is for LLC miss. With this change, perf tool can support last level cache profiling, below is an example to demonstrate the usage on Arm64: $ perf stat -e LLC-load-misses -e LLC-loads -- \ perf bench mem memcpy -s 1024MB -l 100 -f default [...] Performance counter stats for 'perf bench mem memcpy -s 1024MB -l 100 -f default': 35,534,262 LLC-load-misses # 2.16% of all LL-cache hits 1,643,946,443 LLC-loads [...] Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200811053505.21223-1-leo.yan@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 462f9a9cc44b..86e2328b0c2e 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -69,6 +69,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] [C(ITLB)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL, [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_L1I_TLB, + [C(LL)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD, + [C(LL)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_LL_CACHE_RD, + [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)] = ARMV8_PMUV3_PERFCTR_BR_PRED, [C(BPU)][C(OP_READ)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED, }; From d51eb416fa111c1129c46ea3320faab36bf4c99c Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Fri, 4 Sep 2020 10:21:37 +0800 Subject: [PATCH 034/148] drivers/perf: hisi: Add missing include of linux/module.h MODULE_*** is used in HiSilicon uncore PMU drivers and is provided by linux/module.h, but the header file is not directly included. Add the missing include. 
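As a purely illustrative sketch (the file name and macros below are hypothetical
stand-ins, not the HiSilicon sources), the pattern being fixed is a translation
unit that expands MODULE_*() macros while relying on some other header to pull
in linux/module.h indirectly:

  /* example_pmu.c - hypothetical driver fragment */
  #include <linux/module.h>	/* must be included directly for MODULE_*() */

  MODULE_DESCRIPTION("example uncore PMU driver");
  MODULE_LICENSE("GPL v2");

Without the direct include, the build only keeps working for as long as an
unrelated header happens to provide the definitions.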
Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/1599186097-18599-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- drivers/perf/hisilicon/hisi_uncore_pmu.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index 25b0c97b3eb0..b59ec22169ab 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include From ad14c19242b5a5f87aa86c4c930e668121de2809 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 28 Aug 2020 11:18:22 +0800 Subject: [PATCH 035/148] arm64: fix some spelling mistakes in the comments by codespell arch/arm64/include/asm/cpu_ops.h:24: necesary ==> necessary arch/arm64/include/asm/kvm_arm.h:69: maintainance ==> maintenance arch/arm64/include/asm/cpufeature.h:361: capabilties ==> capabilities arch/arm64/kernel/perf_regs.c:19: compatability ==> compatibility arch/arm64/kernel/smp_spin_table.c:86: endianess ==> endianness arch/arm64/kernel/smp_spin_table.c:88: endianess ==> endianness arch/arm64/kvm/vgic/vgic-mmio-v3.c:1004: targetting ==> targeting arch/arm64/kvm/vgic/vgic-mmio-v3.c:1005: targetting ==> targeting Signed-off-by: Xiaoming Ni Link: https://lore.kernel.org/r/20200828031822.35928-1-nixiaoming@huawei.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu_ops.h | 2 +- arch/arm64/include/asm/cpufeature.h | 2 +- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/kernel/perf_regs.c | 2 +- arch/arm64/kernel/smp_spin_table.c | 4 ++-- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 4 ++-- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h index d28e8f37d3b4..e95c4df83911 100644 --- a/arch/arm64/include/asm/cpu_ops.h +++ b/arch/arm64/include/asm/cpu_ops.h @@ -21,7 +21,7 @@ * mechanism for doing so, tests whether it is possible to boot * the given CPU. * @cpu_boot: Boots a cpu into the kernel. - * @cpu_postboot: Optionally, perform any post-boot cleanup or necesary + * @cpu_postboot: Optionally, perform any post-boot cleanup or necessary * synchronisation. Called from the cpu being booted. * @cpu_can_disable: Determines whether a CPU can be disabled based on * mechanism-specific information. diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 89b4f0142c28..3a42dc8e697c 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -358,7 +358,7 @@ static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap) } /* - * Generic helper for handling capabilties with multiple (match,enable) pairs + * Generic helper for handling capabilities with multiple (match,enable) pairs * of call backs, sharing the same capability bit. * Iterate over each entry to see if at least one matches. 
*/ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 1da8e3dc4455..32acf95d49c7 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -66,7 +66,7 @@ * TWI: Trap WFI * TIDCP: Trap L2CTLR/L2ECTLR * BSU_IS: Upgrade barriers to the inner shareable domain - * FB: Force broadcast of all maintainance operations + * FB: Force broadcast of all maintenance operations * AMO: Override CPSR.A and enable signaling with VA * IMO: Override CPSR.I and enable signaling with VI * FMO: Override CPSR.F and enable signaling with VF diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 666b225aeb3a..94e8718e7229 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -16,7 +16,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) /* * Our handling of compat tasks (PERF_SAMPLE_REGS_ABI_32) is weird, but - * we're stuck with it for ABI compatability reasons. + * we're stuck with it for ABI compatibility reasons. * * For a 32-bit consumer inspecting a 32-bit task, then it will look at * the first 16 registers (see arch/arm/include/uapi/asm/perf_regs.h). diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index c8a3fee00c11..5892e79fa429 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -83,9 +83,9 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) /* * We write the release address as LE regardless of the native - * endianess of the kernel. Therefore, any boot-loaders that + * endianness of the kernel. Therefore, any boot-loaders that * read this address need to convert this address to the - * boot-loader's endianess before jumping. This is mandated by + * boot-loader's endianness before jumping. This is mandated by * the boot protocol. */ writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index 5c786b915cd3..52d6f24f65dc 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -1001,8 +1001,8 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) raw_spin_lock_irqsave(&irq->irq_lock, flags); /* - * An access targetting Group0 SGIs can only generate - * those, while an access targetting Group1 SGIs can + * An access targeting Group0 SGIs can only generate + * those, while an access targeting Group1 SGIs can * generate interrupts of either group. */ if (!irq->group || allow_group1) { From 9a747c91e8d69d7283c850031d2462f4d27bf25b Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Tue, 1 Sep 2020 17:11:54 +0800 Subject: [PATCH 036/148] arm64/numa: Fix a typo in comment of arm64_numa_init Fix a typo in comment of arm64_numa_init. 'encomapssing' should be 'encompassing'. Signed-off-by: Yanfei Xu Link: https://lore.kernel.org/r/20200901091154.10112-1-yanfei.xu@windriver.com Signed-off-by: Will Deacon --- arch/arm64/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 73f8b49d485c..d402da26cdca 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -448,7 +448,7 @@ static int __init dummy_numa_init(void) * arm64_numa_init() - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The - * last fallback is dummy single node config encomapssing whole memory. + * last fallback is dummy single node config encompassing whole memory. 
*/ void __init arm64_numa_init(void) { From 1ab64cf81489e871bcb94fb2dea35c7fac561fff Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 18 Aug 2020 14:36:24 +0800 Subject: [PATCH 037/148] ACPI/IORT: Drop the unused @ops of iort_add_device_replay() Since commit d2e1a003af56 ("ACPI/IORT: Don't call iommu_ops->add_device directly"), we use the IOMMU core API to replace a direct invoke of the specified callback. The parameter @ops has therefore became unused. Let's drop it. Signed-off-by: Zenghui Yu Acked-by: Lorenzo Pieralisi Acked-by: Hanjun Guo Link: https://lore.kernel.org/r/20200818063625.980-2-yuzenghui@huawei.com Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index ec782e4a0fe4..a0ece0e201b2 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -811,8 +811,7 @@ static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) return (fwspec && fwspec->ops) ? fwspec->ops : NULL; } -static inline int iort_add_device_replay(const struct iommu_ops *ops, - struct device *dev) +static inline int iort_add_device_replay(struct device *dev) { int err = 0; @@ -1072,7 +1071,7 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev, */ if (!err) { ops = iort_fwspec_iommu_ops(dev); - err = iort_add_device_replay(ops, dev); + err = iort_add_device_replay(dev); } /* Ignore all other errors apart from EPROBE_DEFER */ @@ -1089,8 +1088,7 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev, #else static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) { return NULL; } -static inline int iort_add_device_replay(const struct iommu_ops *ops, - struct device *dev) +static inline int iort_add_device_replay(struct device *dev) { return 0; } int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } From c2bea7a1a1c0c4c5c970696ff556a73f55f998eb Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Tue, 18 Aug 2020 14:36:25 +0800 Subject: [PATCH 038/148] ACPI/IORT: Remove the unused inline functions Since commit 8212688600ed ("ACPI/IORT: Fix build error when IOMMU_SUPPORT is disabled"), iort_fwspec_iommu_ops() and iort_add_device_replay() are not needed anymore when CONFIG_IOMMU_API is not selected. Let's remove them. Signed-off-by: Zenghui Yu Acked-by: Lorenzo Pieralisi Acked-by: Hanjun Guo Link: https://lore.kernel.org/r/20200818063625.980-3-yuzenghui@huawei.com Signed-off-by: Will Deacon --- drivers/acpi/arm64/iort.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index a0ece0e201b2..e670785a6201 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1086,10 +1086,6 @@ const struct iommu_ops *iort_iommu_configure_id(struct device *dev, } #else -static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) -{ return NULL; } -static inline int iort_add_device_replay(struct device *dev) -{ return 0; } int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } const struct iommu_ops *iort_iommu_configure_id(struct device *dev, From 120dc60d0bdbadcad7460222f74c9ed15cdeb73e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 25 Aug 2020 15:54:40 +0200 Subject: [PATCH 039/148] arm64: get rid of TEXT_OFFSET TEXT_OFFSET serves no purpose, and for this reason, it was redefined as 0x0 in the v5.8 timeframe. 
Since this does not appear to have caused any issues that require us to revisit that decision, let's get rid of the macro entirely, along with any references to it. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20200825135440.11288-1-ardb@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Makefile | 6 ------ arch/arm64/include/asm/boot.h | 3 +-- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/kernel/Makefile | 2 -- arch/arm64/kernel/head.S | 16 ++++++---------- arch/arm64/kernel/image.h | 1 - arch/arm64/kernel/vmlinux.lds.S | 4 ++-- drivers/firmware/efi/libstub/Makefile | 1 - drivers/firmware/efi/libstub/arm64-stub.c | 6 +++--- 10 files changed, 14 insertions(+), 29 deletions(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 130569f90c54..0fd4c1be4f64 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -11,7 +11,6 @@ # Copyright (C) 1995-2001 by Russell King LDFLAGS_vmlinux :=--no-undefined -X -CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour @@ -132,9 +131,6 @@ endif # Default value head-y := arch/arm64/kernel/head.o -# The byte offset of the kernel image in RAM from the start of RAM. -TEXT_OFFSET := 0x0 - ifeq ($(CONFIG_KASAN_SW_TAGS), y) KASAN_SHADOW_SCALE_SHIFT := 4 else @@ -145,8 +141,6 @@ KBUILD_CFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_CPPFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) KBUILD_AFLAGS += -DKASAN_SHADOW_SCALE_SHIFT=$(KASAN_SHADOW_SCALE_SHIFT) -export TEXT_OFFSET - core-y += arch/arm64/ libs-y := arch/arm64/lib/ $(libs-y) libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h index c7f67da13cd9..3e7943fd17a4 100644 --- a/arch/arm64/include/asm/boot.h +++ b/arch/arm64/include/asm/boot.h @@ -13,8 +13,7 @@ #define MAX_FDT_SIZE SZ_2M /* - * arm64 requires the kernel image to placed - * TEXT_OFFSET bytes beyond a 2 MB aligned base + * arm64 requires the kernel image to placed at a 2 MB aligned base address */ #define MIN_KIMG_ALIGN SZ_2M diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 329fb15f6bac..19ca76ea60d9 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -86,7 +86,7 @@ + EARLY_PGDS((vstart), (vend)) /* each PGDIR needs a next level page table */ \ + EARLY_PUDS((vstart), (vend)) /* each PUD needs a next level page table */ \ + EARLY_PMDS((vstart), (vend))) /* each PMD needs a next level page table */ -#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR + TEXT_OFFSET, _end)) +#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end)) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) #ifdef CONFIG_ARM64_SW_TTBR0_PAN diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index afa722504bfd..20a9b322d342 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -169,7 +169,7 @@ extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. 
*/ #define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; }) -/* the virtual base of the kernel image (minus TEXT_OFFSET) */ +/* the virtual base of the kernel image */ extern u64 kimage_vaddr; /* the offset between the kernel virtual and physical mappings */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index a561cbb91d4d..4197ef2fb22e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -3,8 +3,6 @@ # Makefile for the linux kernel. # -CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) -AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 037421c66b14..d8d9caf02834 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -36,14 +36,10 @@ #include "efi-header.S" -#define __PHYS_OFFSET (KERNEL_START - TEXT_OFFSET) +#define __PHYS_OFFSET KERNEL_START -#if (TEXT_OFFSET & 0xfff) != 0 -#error TEXT_OFFSET must be at least 4KB aligned -#elif (PAGE_OFFSET & 0x1fffff) != 0 +#if (PAGE_OFFSET & 0x1fffff) != 0 #error PAGE_OFFSET must be at least 2MB aligned -#elif TEXT_OFFSET > 0x1fffff -#error TEXT_OFFSET must be less than 2MB #endif /* @@ -55,7 +51,7 @@ * x0 = physical address to the FDT blob. * * This code is mostly position independent so you call this at - * __pa(PAGE_OFFSET + TEXT_OFFSET). + * __pa(PAGE_OFFSET). * * Note that the callee-saved registers are used for storing variables * that are useful before the MMU is enabled. The allocations are described @@ -77,7 +73,7 @@ _head: b primary_entry // branch to kernel start, magic .long 0 // reserved #endif - le64sym _kernel_offset_le // Image load offset from start of RAM, little-endian + .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian .quad 0 // reserved @@ -382,7 +378,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) * Map the kernel image (starting with PHYS_OFFSET). */ adrp x0, init_pg_dir - mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) + mov_q x5, KIMAGE_VADDR // compile time __va(_text) add x5, x5, x23 // add KASLR displacement mov x4, PTRS_PER_PGD adrp x6, _end // runtime __pa(_end) @@ -474,7 +470,7 @@ SYM_FUNC_END(__primary_switched) .pushsection ".rodata", "a" SYM_DATA_START(kimage_vaddr) - .quad _text - TEXT_OFFSET + .quad _text SYM_DATA_END(kimage_vaddr) EXPORT_SYMBOL(kimage_vaddr) .popsection diff --git a/arch/arm64/kernel/image.h b/arch/arm64/kernel/image.h index c7d38c660372..7bc3ba897901 100644 --- a/arch/arm64/kernel/image.h +++ b/arch/arm64/kernel/image.h @@ -62,7 +62,6 @@ */ #define HEAD_SYMBOLS \ DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \ - DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \ DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS); #endif /* __ARM64_KERNEL_IMAGE_H */ diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7cba7623fcec..82801d98a2b7 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -105,7 +105,7 @@ SECTIONS *(.eh_frame) } - . = KIMAGE_VADDR + TEXT_OFFSET; + . = KIMAGE_VADDR; .head.text : { _text = .; @@ -274,4 +274,4 @@ ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, /* * If padding is applied before .head.text, virt<->phys conversions will fail. 
*/ -ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned") +ASSERT(_text == KIMAGE_VADDR, "HEAD is misaligned") diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 296b18fbd7a2..5a80cf6bd606 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -64,7 +64,6 @@ lib-$(CONFIG_ARM) += arm32-stub.o lib-$(CONFIG_ARM64) += arm64-stub.o lib-$(CONFIG_X86) += x86-stub.o CFLAGS_arm32-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) -CFLAGS_arm64-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) # # For x86, bootloaders like systemd-boot or grub-efi do not zero-initialize the diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index e5bfac79e5ac..fc178398f1ac 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -77,7 +77,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, kernel_size = _edata - _text; kernel_memsize = kernel_size + (_end - _edata); - *reserve_size = kernel_memsize + TEXT_OFFSET % min_kimg_align(); + *reserve_size = kernel_memsize; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && phys_seed != 0) { /* @@ -91,7 +91,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } if (status != EFI_SUCCESS) { - if (IS_ALIGNED((u64)_text - TEXT_OFFSET, min_kimg_align())) { + if (IS_ALIGNED((u64)_text, min_kimg_align())) { /* * Just execute from wherever we were loaded by the * UEFI PE/COFF loader if the alignment is suitable. @@ -111,7 +111,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } } - *image_addr = *reserve_addr + TEXT_OFFSET % min_kimg_align(); + *image_addr = *reserve_addr; memcpy((void *)*image_addr, _text, kernel_size); return EFI_SUCCESS; From 44fdf4ed2693eb05dbfa83beaa6fe5fbd0c2f6d0 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Fri, 4 Sep 2020 17:57:38 +0800 Subject: [PATCH 040/148] arm64: perf: Remove unnecessary event_idx check event_idx is obtained from armv8pmu_get_event_idx(), and this idx must be between ARMV8_IDX_CYCLE_COUNTER and cpu_pmu->num_events. So it's unnecessary to do this check. Let's remove it. 
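For reference, a minimal sketch of the invariant being relied on (illustrative
only, not the actual armv8pmu_get_event_idx() implementation; the struct
fields shown are from the arm_pmu framework):

  static int example_get_event_idx(struct pmu_hw_events *cpuc,
				   struct arm_pmu *cpu_pmu)
  {
	int idx;

	/* indices are only ever handed out from this bounded range */
	for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < cpu_pmu->num_events; idx++)
		if (!test_and_set_bit(idx, cpuc->used_mask))
			return idx;

	return -EAGAIN;	/* an out-of-range index never escapes */
  }

Since every event that reaches armv8pmu_read_counter() or
armv8pmu_write_counter() went through this allocation step, re-validating the
index there is dead code.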
Signed-off-by: Qi Liu Link: https://lore.kernel.org/r/1599213458-28394-1-git-send-email-liuqi115@huawei.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 86e2328b0c2e..04d0ceb72a67 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -310,8 +310,6 @@ static struct attribute_group armv8_pmuv3_format_attr_group = { */ #define ARMV8_IDX_CYCLE_COUNTER 0 #define ARMV8_IDX_COUNTER0 1 -#define ARMV8_IDX_COUNTER_LAST(cpu_pmu) \ - (ARMV8_IDX_CYCLE_COUNTER + cpu_pmu->num_events - 1) /* @@ -368,12 +366,6 @@ static inline int armv8pmu_has_overflowed(u32 pmovsr) return pmovsr & ARMV8_PMU_OVERFLOWED_MASK; } -static inline int armv8pmu_counter_valid(struct arm_pmu *cpu_pmu, int idx) -{ - return idx >= ARMV8_IDX_CYCLE_COUNTER && - idx <= ARMV8_IDX_COUNTER_LAST(cpu_pmu); -} - static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) { return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); @@ -443,15 +435,11 @@ static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value) static u64 armv8pmu_read_counter(struct perf_event *event) { - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; u64 value = 0; - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u reading wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) + if (idx == ARMV8_IDX_CYCLE_COUNTER) value = read_sysreg(pmccntr_el0); else value = armv8pmu_read_hw_counter(event); @@ -480,16 +468,12 @@ static inline void armv8pmu_write_hw_counter(struct perf_event *event, static void armv8pmu_write_counter(struct perf_event *event, u64 value) { - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; value = armv8pmu_bias_long_counter(event, value); - if (!armv8pmu_counter_valid(cpu_pmu, idx)) - pr_err("CPU%u writing wrong counter %d\n", - smp_processor_id(), idx); - else if (idx == ARMV8_IDX_CYCLE_COUNTER) + if (idx == ARMV8_IDX_CYCLE_COUNTER) write_sysreg(value, pmccntr_el0); else armv8pmu_write_hw_counter(event, value); From c048ddf86cdd066db283ce838a0049d2bbefb9e5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 4 Sep 2020 14:00:59 +0530 Subject: [PATCH 041/148] arm64/mm/ptdump: Add address markers for BPF regions Kernel virtual region [BPF_JIT_REGION_START..BPF_JIT_REGION_END] is missing from address_markers[], hence relevant page table entries are not displayed with /sys/kernel/debug/kernel_page_tables. This adds those missing markers. While here, also rename arch/arm64/mm/dump.c which sounds bit ambiguous, as arch/arm64/mm/ptdump.c instead. 
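For context, a simplified sketch of how the dumper consumes address_markers[]
(illustrative only; the real logic is shared with the generic ptdump code, and
the field names below are assumed from the arm64 pg_state):

  static void example_cross_markers(struct pg_state *st, unsigned long addr)
  {
	/*
	 * Emit a "---[ name ]---" heading each time the walk crosses the
	 * start address of the next marker in the table.
	 */
	while (addr >= st->marker[1].start_address) {
		st->marker++;
		pt_dump_seq_printf(st->seq, "---[ %s ]---\n",
				   st->marker->name);
	}
  }

Without entries for the BPF JIT region, its page table entries were not
attributed to a named region of their own in the dump, which is what adding
the two markers corrects.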
Signed-off-by: Anshuman Khandual Reviewed-by: Steven Price Cc: Catalin Marinas Cc: Will Deacon Cc: Ard Biesheuvel Cc: Steven Price Cc: Andrew Morton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1599208259-11191-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/Makefile | 2 +- arch/arm64/mm/{dump.c => ptdump.c} | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) rename arch/arm64/mm/{dump.c => ptdump.c} (99%) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index d91030f0ffee..2a1d275cd4d7 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -4,7 +4,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump.o +obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/ptdump.c similarity index 99% rename from arch/arm64/mm/dump.c rename to arch/arm64/mm/ptdump.c index 0b8da1cc1c07..265284dc942d 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/ptdump.c @@ -41,6 +41,8 @@ static struct addr_marker address_markers[] = { { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif + { BPF_JIT_REGION_START, "BPF start" }, + { BPF_JIT_REGION_END, "BPF end" }, { MODULES_VADDR, "Modules start" }, { MODULES_END, "Modules end" }, { VMALLOC_START, "vmalloc() area" }, From 3102bc0e6ac752cc5df896acb557d779af4d82a1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 29 Aug 2020 14:00:16 +0100 Subject: [PATCH 042/148] arm64: topology: Stop using MPIDR for topology information In the absence of ACPI or DT topology data, we fallback to haphazardly decoding *something* out of MPIDR. Sadly, the contents of that register are mostly unusable due to the implementation leniancy and things like Aff0 having to be capped to 15 (despite being encoded on 8 bits). Consider a simple system with a single package of 32 cores, all under the same LLC. We ought to be shoving them in the same core_sibling mask, but MPIDR is going to look like: | CPU | 0 | ... | 15 | 16 | ... | 31 | |------+---+-----+----+----+-----+----+ | Aff0 | 0 | ... | 15 | 0 | ... | 15 | | Aff1 | 0 | ... | 0 | 1 | ... | 1 | | Aff2 | 0 | ... | 0 | 0 | ... | 0 | Which will eventually yield core_sibling(0-15) == 0-15 core_sibling(16-31) == 16-31 NUMA woes ========= If we try to play games with this and set up NUMA boundaries within those groups of 16 cores via e.g. QEMU: # Node0: 0-9; Node1: 10-19 $ qemu-system-aarch64 \ -smp 20 -numa node,cpus=0-9,nodeid=0 -numa node,cpus=10-19,nodeid=1 The scheduler's MC domain (all CPUs with same LLC) is going to be built via arch_topology.c::cpu_coregroup_mask() In there we try to figure out a sensible mask out of the topology information we have. In short, here we'll pick the smallest of NUMA or core sibling mask. node_mask(CPU9) == 0-9 core_sibling(CPU9) == 0-15 MC mask for CPU9 will thus be 0-9, not a problem. node_mask(CPU10) == 10-19 core_sibling(CPU10) == 0-15 MC mask for CPU10 will thus be 10-19, not a problem. node_mask(CPU16) == 10-19 core_sibling(CPU16) == 16-19 MC mask for CPU16 will thus be 16-19... Uh oh. CPUs 16-19 are in two different unique MC spans, and the scheduler has no idea what to make of that. 
That triggers the WARN_ON() added by commit ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") Fixing MPIDR-derived topology ============================= We could try to come up with some cleverer scheme to figure out which of the available masks to pick, but really if one of those masks resulted from MPIDR then it should be discarded because it's bound to be bogus. I was hoping to give MPIDR a chance for SMT, to figure out which threads are in the same core using Aff1-3 as core ID, but Sudeep and Robin pointed out to me that there are systems out there where *all* cores have non-zero values in their higher affinity fields (e.g. RK3288 has "5" in all of its cores' MPIDR.Aff1), which would expose a bogus core ID to userspace. Stop using MPIDR for topology information. When no other source of topology information is available, mark each CPU as its own core and its NUMA node as its LLC domain. Signed-off-by: Valentin Schneider Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/20200829130016.26106-1-valentin.schneider@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 0801a0f3c156..ff1dd1dbfe64 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -36,21 +36,23 @@ void store_cpu_topology(unsigned int cpuid) if (mpidr & MPIDR_UP_BITMASK) return; - /* Create cpu topology mapping based on MPIDR. */ - if (mpidr & MPIDR_MT_BITMASK) { - /* Multiprocessor system : Multi-threads per core */ - cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8; - } else { - /* Multiprocessor system : Single-thread per core */ - cpuid_topo->thread_id = -1; - cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0); - cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) | - MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 | - MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16; - } + /* + * This would be the place to create cpu topology based on MPIDR. + * + * However, it cannot be trusted to depict the actual topology; some + * pieces of the architecture enforce an artificial cap on Aff0 values + * (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an + * artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up + * having absolutely no relationship to the actual underlying system + * topology, and cannot be reasonably used as core / package ID. + * + * If the MT bit is set, Aff0 *could* be used to define a thread ID, but + * we still wouldn't be able to obtain a sane core ID. This means we + * need to entirely ignore MPIDR for any topology deduction. + */ + cpuid_topo->thread_id = -1; + cpuid_topo->core_id = cpuid; + cpuid_topo->package_id = cpu_to_node(cpuid); pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n", cpuid, cpuid_topo->package_id, cpuid_topo->core_id, From b65399f6111b03df870d8daa75b8d140c0de93f4 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 9 Sep 2020 10:23:02 +0530 Subject: [PATCH 043/148] arm64/mm: Change THP helpers to comply with generic MM semantics pmd_present() and pmd_trans_huge() are expected to behave in the following manner during various phases of a given PMD. It is derived from a previous detailed discussion on this topic [1] and present THP documentation [2]. 
pmd_present(pmd): - Returns true if pmd refers to system RAM with a valid pmd_page(pmd) - Returns false if pmd refers to a migration or swap entry pmd_trans_huge(pmd): - Returns true if pmd refers to system RAM and is a trans huge mapping ------------------------------------------------------------------------- | PMD states | pmd_present | pmd_trans_huge | ------------------------------------------------------------------------- | Mapped | Yes | Yes | ------------------------------------------------------------------------- | Splitting | Yes | Yes | ------------------------------------------------------------------------- | Migration/Swap | No | No | ------------------------------------------------------------------------- The problem: PMD is first invalidated with pmdp_invalidate() before it's splitting. This invalidation clears PMD_SECT_VALID as below. PMD Split -> pmdp_invalidate() -> pmd_mkinvalid -> Clears PMD_SECT_VALID Once PMD_SECT_VALID gets cleared, it results in pmd_present() return false on the PMD entry. It will need another bit apart from PMD_SECT_VALID to re- affirm pmd_present() as true during the THP split process. To comply with above mentioned semantics, pmd_trans_huge() should also check pmd_present() first before testing presence of an actual transparent huge mapping. The solution: Ideally PMD_TYPE_SECT should have been used here instead. But it shares the bit position with PMD_SECT_VALID which is used for THP invalidation. Hence it will not be there for pmd_present() check after pmdp_invalidate(). A new software defined PMD_PRESENT_INVALID (bit 59) can be set on the PMD entry during invalidation which can help pmd_present() return true and in recognizing the fact that it still points to memory. This bit is transient. During the split process it will be overridden by a page table page representing normal pages in place of erstwhile huge page. Other pmdp_invalidate() callers always write a fresh PMD value on the entry overriding this transient PMD_PRESENT_INVALID bit, which makes it safe. [1]: https://lkml.org/lkml/2018/10/17/231 [2]: https://www.kernel.org/doc/Documentation/vm/transhuge.txt Signed-off-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Marc Zyngier Cc: Suzuki Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1599627183-14453-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 7 ++++++ arch/arm64/include/asm/pgtable.h | 34 ++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 4d867c6446c4..2df4b75fce3c 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -19,6 +19,13 @@ #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) #define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */ +/* + * This bit indicates that the entry is present i.e. pmd_page() + * still points to a valid huge page in memory even if the pmd + * has been invalidated. 
+ */ +#define PMD_PRESENT_INVALID (_AT(pteval_t, 1) << 59) /* only when !PMD_SECT_VALID */ + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d5d3fbe73953..d8258ae8fce0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -145,6 +145,18 @@ static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) return pte; } +static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + pmd_val(pmd) &= ~pgprot_val(prot); + return pmd; +} + +static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) +{ + pmd_val(pmd) |= pgprot_val(prot); + return pmd; +} + static inline pte_t pte_wrprotect(pte_t pte) { pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); @@ -363,15 +375,24 @@ static inline int pmd_protnone(pmd_t pmd) } #endif +#define pmd_present_invalid(pmd) (!!(pmd_val(pmd) & PMD_PRESENT_INVALID)) + +static inline int pmd_present(pmd_t pmd) +{ + return pte_present(pmd_pte(pmd)) || pmd_present_invalid(pmd); +} + /* * THP definitions. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT)) +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) && pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define pmd_present(pmd) pte_present(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) @@ -381,7 +402,14 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) + +static inline pmd_t pmd_mkinvalid(pmd_t pmd) +{ + pmd = set_pmd_bit(pmd, __pgprot(PMD_PRESENT_INVALID)); + pmd = clear_pmd_bit(pmd, __pgprot(PMD_SECT_VALID)); + + return pmd; +} #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) From 53fa117bb33c9ebc4c0746b3830e273f5e9b97b7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 9 Sep 2020 10:23:03 +0530 Subject: [PATCH 044/148] arm64/mm: Enable THP migration In certain page migration situations, a THP page can be migrated without being split into it's constituent subpages. This saves time required to split a THP and put it back together when required. But it also saves an wider address range translation covered by a single TLB entry, reducing future page fault costs. A previous patch changed platform THP helpers per generic memory semantics, clearing the path for THP migration support. This adds two more THP helpers required to create PMD migration swap entries. Now enable THP migration via ARCH_ENABLE_THP_MIGRATION. 
Signed-off-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Marc Zyngier Cc: Suzuki Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1599627183-14453-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 4 ++++ arch/arm64/include/asm/pgtable.h | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d232837cbee..e21b94061780 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1876,6 +1876,10 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on HUGETLB_PAGE && MIGRATION +config ARCH_ENABLE_THP_MIGRATION + def_bool y + depends on TRANSPARENT_HUGEPAGE + menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d8258ae8fce0..bc68da9f5706 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -875,6 +875,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +#define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) +#define __swp_entry_to_pmd(swp) __pmd((swp).val) +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + /* * Ensure that there are not more swap files than can be encoded in the kernel * PTEs. From 4e56de82d4ec9d98ffdc9e0387d8fdecbf496226 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 9 Sep 2020 11:18:55 +0530 Subject: [PATCH 045/148] arm64/cpuinfo: Define HWCAP name arrays per their actual bit definitions HWCAP name arrays (hwcap_str, compat_hwcap_str, compat_hwcap2_str) that are scanned for /proc/cpuinfo are detached from their bit definitions making it vulnerable and difficult to correlate. It is also bit problematic because during /proc/cpuinfo dump these arrays get traversed sequentially assuming they reflect and match actual HWCAP bit sequence, to test various features for a given CPU. This redefines name arrays per their HWCAP bit definitions . It also warns after detecting any feature which is not expected on arm64. 
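For readers unfamiliar with the scheme used below, a small userspace analogue (all names and bit values here are made up; this is not kernel code): designated initializers pin each name to the index of its capability bit, so the printing loop can walk bit positions directly, and a capability that has no name on arm64 shows up as a NULL hole instead of silently shifting every later name.

  #include <stdio.h>

  #define DEMO_HWCAP_FP		(1UL << 0)
  #define DEMO_HWCAP_ASIMD	(1UL << 1)
  #define DEMO_HWCAP_CRC32	(1UL << 3)	/* bit 2 deliberately has no name */

  static const char *const demo_hwcap_str[] = {
  	[0] = "fp",
  	[1] = "asimd",
  	[3] = "crc32",		/* index 2 is implicitly NULL */
  };

  int main(void)
  {
  	unsigned long hwcaps = DEMO_HWCAP_FP | DEMO_HWCAP_CRC32;
  	unsigned int j;

  	for (j = 0; j < sizeof(demo_hwcap_str) / sizeof(demo_hwcap_str[0]); j++)
  		if ((hwcaps & (1UL << j)) && demo_hwcap_str[j])
  			printf(" %s", demo_hwcap_str[j]);
  	printf("\n");
  	return 0;
  }
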
Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Brown Cc: Dave Martin Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1599630535-29337-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hwcap.h | 9 ++ arch/arm64/kernel/cpuinfo.c | 179 +++++++++++++++++---------------- 2 files changed, 102 insertions(+), 86 deletions(-) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 22f73fe09030..6493a4c63a2f 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -8,18 +8,27 @@ #include #include +#define COMPAT_HWCAP_SWP (1 << 0) #define COMPAT_HWCAP_HALF (1 << 1) #define COMPAT_HWCAP_THUMB (1 << 2) +#define COMPAT_HWCAP_26BIT (1 << 3) #define COMPAT_HWCAP_FAST_MULT (1 << 4) +#define COMPAT_HWCAP_FPA (1 << 5) #define COMPAT_HWCAP_VFP (1 << 6) #define COMPAT_HWCAP_EDSP (1 << 7) +#define COMPAT_HWCAP_JAVA (1 << 8) +#define COMPAT_HWCAP_IWMMXT (1 << 9) +#define COMPAT_HWCAP_CRUNCH (1 << 10) +#define COMPAT_HWCAP_THUMBEE (1 << 11) #define COMPAT_HWCAP_NEON (1 << 12) #define COMPAT_HWCAP_VFPv3 (1 << 13) +#define COMPAT_HWCAP_VFPV3D16 (1 << 14) #define COMPAT_HWCAP_TLS (1 << 15) #define COMPAT_HWCAP_VFPv4 (1 << 16) #define COMPAT_HWCAP_IDIVA (1 << 17) #define COMPAT_HWCAP_IDIVT (1 << 18) #define COMPAT_HWCAP_IDIV (COMPAT_HWCAP_IDIVA|COMPAT_HWCAP_IDIVT) +#define COMPAT_HWCAP_VFPD32 (1 << 19) #define COMPAT_HWCAP_LPAE (1 << 20) #define COMPAT_HWCAP_EVTSTRM (1 << 21) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index d0076c2159e6..25113245825c 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -43,94 +43,92 @@ static const char *icache_policy_str[] = { unsigned long __icache_flags; static const char *const hwcap_str[] = { - "fp", - "asimd", - "evtstrm", - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - "atomics", - "fphp", - "asimdhp", - "cpuid", - "asimdrdm", - "jscvt", - "fcma", - "lrcpc", - "dcpop", - "sha3", - "sm3", - "sm4", - "asimddp", - "sha512", - "sve", - "asimdfhm", - "dit", - "uscat", - "ilrcpc", - "flagm", - "ssbs", - "sb", - "paca", - "pacg", - "dcpodp", - "sve2", - "sveaes", - "svepmull", - "svebitperm", - "svesha3", - "svesm4", - "flagm2", - "frint", - "svei8mm", - "svef32mm", - "svef64mm", - "svebf16", - "i8mm", - "bf16", - "dgh", - "rng", - "bti", - /* reserved for "mte" */ - NULL + [KERNEL_HWCAP_FP] = "fp", + [KERNEL_HWCAP_ASIMD] = "asimd", + [KERNEL_HWCAP_EVTSTRM] = "evtstrm", + [KERNEL_HWCAP_AES] = "aes", + [KERNEL_HWCAP_PMULL] = "pmull", + [KERNEL_HWCAP_SHA1] = "sha1", + [KERNEL_HWCAP_SHA2] = "sha2", + [KERNEL_HWCAP_CRC32] = "crc32", + [KERNEL_HWCAP_ATOMICS] = "atomics", + [KERNEL_HWCAP_FPHP] = "fphp", + [KERNEL_HWCAP_ASIMDHP] = "asimdhp", + [KERNEL_HWCAP_CPUID] = "cpuid", + [KERNEL_HWCAP_ASIMDRDM] = "asimdrdm", + [KERNEL_HWCAP_JSCVT] = "jscvt", + [KERNEL_HWCAP_FCMA] = "fcma", + [KERNEL_HWCAP_LRCPC] = "lrcpc", + [KERNEL_HWCAP_DCPOP] = "dcpop", + [KERNEL_HWCAP_SHA3] = "sha3", + [KERNEL_HWCAP_SM3] = "sm3", + [KERNEL_HWCAP_SM4] = "sm4", + [KERNEL_HWCAP_ASIMDDP] = "asimddp", + [KERNEL_HWCAP_SHA512] = "sha512", + [KERNEL_HWCAP_SVE] = "sve", + [KERNEL_HWCAP_ASIMDFHM] = "asimdfhm", + [KERNEL_HWCAP_DIT] = "dit", + [KERNEL_HWCAP_USCAT] = "uscat", + [KERNEL_HWCAP_ILRCPC] = "ilrcpc", + [KERNEL_HWCAP_FLAGM] = "flagm", + [KERNEL_HWCAP_SSBS] = "ssbs", + [KERNEL_HWCAP_SB] = 
"sb", + [KERNEL_HWCAP_PACA] = "paca", + [KERNEL_HWCAP_PACG] = "pacg", + [KERNEL_HWCAP_DCPODP] = "dcpodp", + [KERNEL_HWCAP_SVE2] = "sve2", + [KERNEL_HWCAP_SVEAES] = "sveaes", + [KERNEL_HWCAP_SVEPMULL] = "svepmull", + [KERNEL_HWCAP_SVEBITPERM] = "svebitperm", + [KERNEL_HWCAP_SVESHA3] = "svesha3", + [KERNEL_HWCAP_SVESM4] = "svesm4", + [KERNEL_HWCAP_FLAGM2] = "flagm2", + [KERNEL_HWCAP_FRINT] = "frint", + [KERNEL_HWCAP_SVEI8MM] = "svei8mm", + [KERNEL_HWCAP_SVEF32MM] = "svef32mm", + [KERNEL_HWCAP_SVEF64MM] = "svef64mm", + [KERNEL_HWCAP_SVEBF16] = "svebf16", + [KERNEL_HWCAP_I8MM] = "i8mm", + [KERNEL_HWCAP_BF16] = "bf16", + [KERNEL_HWCAP_DGH] = "dgh", + [KERNEL_HWCAP_RNG] = "rng", + [KERNEL_HWCAP_BTI] = "bti", }; #ifdef CONFIG_COMPAT +#define COMPAT_KERNEL_HWCAP(x) const_ilog2(COMPAT_HWCAP_ ## x) static const char *const compat_hwcap_str[] = { - "swp", - "half", - "thumb", - "26bit", - "fastmult", - "fpa", - "vfp", - "edsp", - "java", - "iwmmxt", - "crunch", - "thumbee", - "neon", - "vfpv3", - "vfpv3d16", - "tls", - "vfpv4", - "idiva", - "idivt", - "vfpd32", - "lpae", - "evtstrm", - NULL + [COMPAT_KERNEL_HWCAP(SWP)] = "swp", + [COMPAT_KERNEL_HWCAP(HALF)] = "half", + [COMPAT_KERNEL_HWCAP(THUMB)] = "thumb", + [COMPAT_KERNEL_HWCAP(26BIT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(FAST_MULT)] = "fastmult", + [COMPAT_KERNEL_HWCAP(FPA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(VFP)] = "vfp", + [COMPAT_KERNEL_HWCAP(EDSP)] = "edsp", + [COMPAT_KERNEL_HWCAP(JAVA)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(IWMMXT)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(CRUNCH)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(THUMBEE)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(NEON)] = "neon", + [COMPAT_KERNEL_HWCAP(VFPv3)] = "vfpv3", + [COMPAT_KERNEL_HWCAP(VFPV3D16)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(TLS)] = "tls", + [COMPAT_KERNEL_HWCAP(VFPv4)] = "vfpv4", + [COMPAT_KERNEL_HWCAP(IDIVA)] = "idiva", + [COMPAT_KERNEL_HWCAP(IDIVT)] = "idivt", + [COMPAT_KERNEL_HWCAP(VFPD32)] = NULL, /* Not possible on arm64 */ + [COMPAT_KERNEL_HWCAP(LPAE)] = "lpae", + [COMPAT_KERNEL_HWCAP(EVTSTRM)] = "evtstrm", }; +#define COMPAT_KERNEL_HWCAP2(x) const_ilog2(COMPAT_HWCAP2_ ## x) static const char *const compat_hwcap2_str[] = { - "aes", - "pmull", - "sha1", - "sha2", - "crc32", - NULL + [COMPAT_KERNEL_HWCAP2(AES)] = "aes", + [COMPAT_KERNEL_HWCAP2(PMULL)] = "pmull", + [COMPAT_KERNEL_HWCAP2(SHA1)] = "sha1", + [COMPAT_KERNEL_HWCAP2(SHA2)] = "sha2", + [COMPAT_KERNEL_HWCAP2(CRC32)] = "crc32", }; #endif /* CONFIG_COMPAT */ @@ -166,16 +164,25 @@ static int c_show(struct seq_file *m, void *v) seq_puts(m, "Features\t:"); if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; compat_hwcap_str[j]; j++) - if (compat_elf_hwcap & (1 << j)) - seq_printf(m, " %s", compat_hwcap_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. 
+ */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; - for (j = 0; compat_hwcap2_str[j]; j++) + seq_printf(m, " %s", compat_hwcap_str[j]); + } + } + + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) if (compat_elf_hwcap2 & (1 << j)) seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ } else { - for (j = 0; hwcap_str[j]; j++) + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) if (cpu_have_feature(j)) seq_printf(m, " %s", hwcap_str[j]); } From 11e339d53a739e4cf885c1e2a843a4004204d271 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Sep 2020 19:59:34 +1000 Subject: [PATCH 046/148] arm64/mm: Remove CONT_RANGE_OFFSET The macro was introduced by commit ("arm64: PTE/PMD contiguous bit definition") at the beginning. It's only used by commit <348a65cdcbbf> ("arm64: Mark kernel page ranges contiguous"), which was reverted later by commit <667c27597ca8>. This makes the macro unused. This removes the unused macro (CONT_RANGE_OFFSET). Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200910095936.20307-1-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-hwdef.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d400a4d9aee2..8a399e666837 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -98,8 +98,6 @@ #define CONT_PMDS (1 << (CONT_PMD_SHIFT - PMD_SHIFT)) #define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) #define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) -/* the numerical offset of the PTE within a range of CONT_PTES */ -#define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) /* * Hardware page table definitions. From c0d6de327f18d39ef759aa883bffc3c32bc69015 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Sep 2020 19:59:35 +1000 Subject: [PATCH 047/148] arm64/mm: Unify CONT_PTE_SHIFT CONT_PTE_SHIFT actually depends on CONFIG_ARM64_CONT_SHIFT. It's reasonable to reflect the dependency: * This renames CONFIG_ARM64_CONT_SHIFT to CONFIG_ARM64_CONT_PTE_SHIFT, so that we can introduce CONFIG_ARM64_CONT_PMD_SHIFT later. * CONT_{SHIFT, SIZE, MASK}, defined in page-def.h are removed as they are not used by anyone. * CONT_PTE_SHIFT is determined by CONFIG_ARM64_CONT_PTE_SHIFT. 
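A quick sanity check of the arithmetic, as an illustrative sketch rather than part of the patch, using the 64K-page configuration where CONFIG_ARM64_CONT_PTE_SHIFT defaults to 5:

  #define PAGE_SHIFT	16					/* 64K pages */
  #define CONT_PTE_SHIFT	(5 + PAGE_SHIFT)			/* 21 */
  #define CONT_PTES	(1 << (CONT_PTE_SHIFT - PAGE_SHIFT))	/* 32 contiguous PTEs */
  #define CONT_PTE_SIZE	(CONT_PTES * (1UL << PAGE_SHIFT))	/* 2 MiB per contiguous span */

The same arithmetic yields a 64K contiguous span for 4K pages (shift 4) and a 2M span for 16K pages (shift 7), matching the values previously hard-coded in pgtable-hwdef.h.
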
Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200910095936.20307-2-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/page-def.h | 5 ----- arch/arm64/include/asm/pgtable-hwdef.h | 4 +--- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e21b94061780..4f3b7b24b5e5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -211,7 +211,7 @@ config ARM64_PAGE_SHIFT default 14 if ARM64_16K_PAGES default 12 -config ARM64_CONT_SHIFT +config ARM64_CONT_PTE_SHIFT int default 5 if ARM64_64K_PAGES default 7 if ARM64_16K_PAGES diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h index f99d48ecbeef..2403f7b4cdbf 100644 --- a/arch/arm64/include/asm/page-def.h +++ b/arch/arm64/include/asm/page-def.h @@ -11,13 +11,8 @@ #include /* PAGE_SHIFT determines the page size */ -/* CONT_SHIFT determines the number of pages which can be tracked together */ #define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT -#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT #define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) -#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) -#define CONT_MASK (~(CONT_SIZE-1)) - #endif /* __ASM_PAGE_DEF_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 8a399e666837..6c9c67f62551 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -81,14 +81,12 @@ /* * Contiguous page definitions. */ +#define CONT_PTE_SHIFT (CONFIG_ARM64_CONT_PTE_SHIFT + PAGE_SHIFT) #ifdef CONFIG_ARM64_64K_PAGES -#define CONT_PTE_SHIFT (5 + PAGE_SHIFT) #define CONT_PMD_SHIFT (5 + PMD_SHIFT) #elif defined(CONFIG_ARM64_16K_PAGES) -#define CONT_PTE_SHIFT (7 + PAGE_SHIFT) #define CONT_PMD_SHIFT (5 + PMD_SHIFT) #else -#define CONT_PTE_SHIFT (4 + PAGE_SHIFT) #define CONT_PMD_SHIFT (4 + PMD_SHIFT) #endif From e676594115f0bcc12d4791f9ee919e20d8d750ee Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Sep 2020 19:59:36 +1000 Subject: [PATCH 048/148] arm64/mm: Unify CONT_PMD_SHIFT Similar to how CONT_PTE_SHIFT is determined, this introduces a new kernel option (CONFIG_CONT_PMD_SHIFT) to determine CONT_PMD_SHIFT. Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200910095936.20307-3-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 6 ++++++ arch/arm64/include/asm/pgtable-hwdef.h | 10 ++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4f3b7b24b5e5..609629e36779 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -217,6 +217,12 @@ config ARM64_CONT_PTE_SHIFT default 7 if ARM64_16K_PAGES default 4 +config ARM64_CONT_PMD_SHIFT + int + default 5 if ARM64_64K_PAGES + default 5 if ARM64_16K_PAGES + default 4 + config ARCH_MMAP_RND_BITS_MIN default 14 if ARM64_64K_PAGES default 16 if ARM64_16K_PAGES diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 6c9c67f62551..94b3f2ac2e9d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -82,17 +82,11 @@ * Contiguous page definitions. 
 */
 #define CONT_PTE_SHIFT		(CONFIG_ARM64_CONT_PTE_SHIFT + PAGE_SHIFT)
-#ifdef CONFIG_ARM64_64K_PAGES
-#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
-#elif defined(CONFIG_ARM64_16K_PAGES)
-#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
-#else
-#define CONT_PMD_SHIFT		(4 + PMD_SHIFT)
-#endif
-
 #define CONT_PTES		(1 << (CONT_PTE_SHIFT - PAGE_SHIFT))
 #define CONT_PTE_SIZE		(CONT_PTES * PAGE_SIZE)
 #define CONT_PTE_MASK		(~(CONT_PTE_SIZE - 1))
+
+#define CONT_PMD_SHIFT		(CONFIG_ARM64_CONT_PMD_SHIFT + PMD_SHIFT)
 #define CONT_PMDS		(1 << (CONT_PMD_SHIFT - PMD_SHIFT))
 #define CONT_PMD_SIZE		(CONT_PMDS * PMD_SIZE)
 #define CONT_PMD_MASK		(~(CONT_PMD_SIZE - 1))

From 2a493132146152c21587414d96ba9026e43acc3d Mon Sep 17 00:00:00 2001
From: Tian Tao
Date: Wed, 9 Sep 2020 17:28:02 +0800
Subject: [PATCH 049/148] arm64: Remove the unused include statements

linux/arm-smccc.h is included more than once; remove the duplicate,
unnecessary include.

Signed-off-by: Tian Tao
Reviewed-by: Steven Price
Link: https://lore.kernel.org/r/1599643682-10404-1-git-send-email-tiantao6@hisilicon.com
Signed-off-by: Will Deacon
---
 arch/arm64/kernel/cpu_errata.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c332d49780dc..9df3f9bb7950 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -169,8 +169,6 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn,
 }
 #endif	/* CONFIG_KVM_INDIRECT_VECTORS */
 
-#include <linux/arm-smccc.h>
-
 static void __maybe_unused call_smc_arch_workaround_1(void)
 {
 	arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);

From 72789a4a6a914601912a8992b0d43bd5abc05128 Mon Sep 17 00:00:00 2001
From: Pingfan Liu
Date: Fri, 28 Aug 2020 21:39:57 +0800
Subject: [PATCH 050/148] arm64/relocate_kernel: remove redundant code

The kernel startup entry point requires the MMU and D-cache to be
disabled. For kexec-reboot, take a close look at "msr sctlr_el1, x12"
in __cpu_soft_restart for the following cases:

1. Booted at EL1
   The instruction is enough to disable the MMU and I/D cache for the
   EL1 regime.

2. Booted at EL2, using VHE
   Accesses to SCTLR_EL1 are redirected to SCTLR_EL2 in EL2, so the
   instruction is enough to disable the MMU and clear the I+C bits for
   the EL2 regime.

3. Booted at EL2, not using VHE
   The instruction itself cannot affect the EL2 regime. However, the
   hyp-stub does not enable the MMU and I/D cache for the EL2 regime,
   and KVM also disables them for the EL2 regime when it is unloaded
   or when it handles a HVC_SOFT_RESTART call. So by the time of a
   kexec-reboot, the KVM code has already prepared the required state.
As a conclusion, disabling MMU and clearing I+C bits in SYM_CODE_START(arm64_relocate_new_kernel) is redundant, and can be removed Signed-off-by: Pingfan Liu Cc: James Morse Cc: Geoff Levand Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: Marc Zyngier Cc: Mark Brown Cc: Kees Cook Cc: Remi Denis-Courmont Cc: Ard Biesheuvel Cc: kvmarm@lists.cs.columbia.edu Link: https://lore.kernel.org/r/1598621998-20563-1-git-send-email-kernelfans@gmail.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu-reset.S | 4 ++++ arch/arm64/kernel/relocate_kernel.S | 12 ------------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S index 4a18055b2ff9..37721eb6f9a1 100644 --- a/arch/arm64/kernel/cpu-reset.S +++ b/arch/arm64/kernel/cpu-reset.S @@ -35,6 +35,10 @@ SYM_CODE_START(__cpu_soft_restart) mov_q x13, SCTLR_ELx_FLAGS bic x12, x12, x13 pre_disable_mmu_workaround + /* + * either disable EL1&0 translation regime or disable EL2&0 translation + * regime if HCR_EL2.E2H == 1 + */ msr sctlr_el1, x12 isb diff --git a/arch/arm64/kernel/relocate_kernel.S b/arch/arm64/kernel/relocate_kernel.S index 542d6edc6806..84eec95ec06c 100644 --- a/arch/arm64/kernel/relocate_kernel.S +++ b/arch/arm64/kernel/relocate_kernel.S @@ -36,18 +36,6 @@ SYM_CODE_START(arm64_relocate_new_kernel) mov x14, xzr /* x14 = entry ptr */ mov x13, xzr /* x13 = copy dest */ - /* Clear the sctlr_el2 flags. */ - mrs x0, CurrentEL - cmp x0, #CurrentEL_EL2 - b.ne 1f - mrs x0, sctlr_el2 - mov_q x1, SCTLR_ELx_FLAGS - bic x0, x0, x1 - pre_disable_mmu_workaround - msr sctlr_el2, x0 - isb -1: - /* Check if the new image needs relocation. */ tbnz x16, IND_DONE_BIT, .Ldone From 3a1793066fdfbbd657d9591cc03d6e1b6634a587 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Fri, 28 Aug 2020 21:39:58 +0800 Subject: [PATCH 051/148] Documentation/kvm/arm: improve description of HVC_SOFT_RESTART Besides disabling MMU, HVC_SOFT_RESTART also clears I+D bits. These behaviors are what kexec-reboot expects, so describe it more precisely. Signed-off-by: Pingfan Liu Cc: James Morse Cc: Marc Zyngier Cc: Geoff Levand Cc: Catalin Marinas Cc: Will Deacon Cc: Julien Thierry Cc: Suzuki K Poulose Cc: linux-doc@vger.kernel.org Cc: kvmarm@lists.cs.columbia.edu Link: https://lore.kernel.org/r/1598621998-20563-2-git-send-email-kernelfans@gmail.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Will Deacon --- Documentation/virt/kvm/arm/hyp-abi.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/arm/hyp-abi.rst b/Documentation/virt/kvm/arm/hyp-abi.rst index d9eba93aa364..83cadd8186fa 100644 --- a/Documentation/virt/kvm/arm/hyp-abi.rst +++ b/Documentation/virt/kvm/arm/hyp-abi.rst @@ -54,9 +54,9 @@ these functions (see arch/arm{,64}/include/asm/virt.h): x3 = x1's value when entering the next payload (arm64) x4 = x2's value when entering the next payload (arm64) - Mask all exceptions, disable the MMU, move the arguments into place - (arm64 only), and jump to the restart address while at HYP/EL2. This - hypercall is not expected to return to its caller. + Mask all exceptions, disable the MMU, clear I+D bits, move the arguments + into place (arm64 only), and jump to the restart address while at HYP/EL2. + This hypercall is not expected to return to its caller. Any other value of r0/x0 triggers a hypervisor-specific handling, which is not documented here. 
From 93396936ed0ce2c6f44140bd14728611d0bb065e Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:51 +0530 Subject: [PATCH 052/148] arm64: kprobe: add checks for ARMv8.3-PAuth combined instructions Currently the ARMv8.3-PAuth combined branch instructions (braa, retaa etc.) are not simulated for out-of-line execution with a handler. Hence the uprobe of such instructions leads to kernel warnings in a loop as they are not explicitly checked and fall into INSN_GOOD categories. Other combined instructions like LDRAA and LDRBB can be probed. The issue of the combined branch instructions is fixed by adding group definitions of all such instructions and rejecting their probes. The instruction groups added are br_auth(braa, brab, braaz and brabz), blr_auth(blraa, blrab, blraaz and blrabz), ret_auth(retaa and retab) and eret_auth(eretaa and eretab). Warning log: WARNING: CPU: 0 PID: 156 at arch/arm64/kernel/probes/uprobes.c:182 uprobe_single_step_handler+0x34/0x50 Modules linked in: CPU: 0 PID: 156 Comm: func Not tainted 5.9.0-rc3 #188 Hardware name: Foundation-v8A (DT) pstate: 804003c9 (Nzcv DAIF +PAN -UAO BTYPE=--) pc : uprobe_single_step_handler+0x34/0x50 lr : single_step_handler+0x70/0xf8 sp : ffff800012af3e30 x29: ffff800012af3e30 x28: ffff000878723b00 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 x23: 0000000060001000 x22: 00000000cb000022 x21: ffff800012065ce8 x20: ffff800012af3ec0 x19: ffff800012068d50 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : ffff800010085c90 x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff80001205a9c8 x5 : ffff80001205a000 x4 : ffff80001233db80 x3 : ffff8000100a7a60 x2 : 0020000000000003 x1 : 0000fffffffff008 x0 : ffff800012af3ec0 Call trace: uprobe_single_step_handler+0x34/0x50 single_step_handler+0x70/0xf8 do_debug_exception+0xb8/0x130 el0_sync_handler+0x138/0x1b8 el0_sync+0x158/0x180 Fixes: 74afda4016a7 ("arm64: compile the kernel with ptrauth return address signing") Fixes: 04ca3204fa09 ("arm64: enable pointer authentication") Signed-off-by: Amit Daniel Kachhap Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200914083656.21428-2-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/insn.h | 4 ++++ arch/arm64/kernel/insn.c | 5 ++++- arch/arm64/kernel/probes/decode-insn.c | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 0bc46149e491..4b39293d0f72 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -359,9 +359,13 @@ __AARCH64_INSN_FUNCS(brk, 0xFFE0001F, 0xD4200000) __AARCH64_INSN_FUNCS(exception, 0xFF000000, 0xD4000000) __AARCH64_INSN_FUNCS(hint, 0xFFFFF01F, 0xD503201F) __AARCH64_INSN_FUNCS(br, 0xFFFFFC1F, 0xD61F0000) +__AARCH64_INSN_FUNCS(br_auth, 0xFEFFF800, 0xD61F0800) __AARCH64_INSN_FUNCS(blr, 0xFFFFFC1F, 0xD63F0000) +__AARCH64_INSN_FUNCS(blr_auth, 0xFEFFF800, 0xD63F0800) __AARCH64_INSN_FUNCS(ret, 0xFFFFFC1F, 0xD65F0000) +__AARCH64_INSN_FUNCS(ret_auth, 0xFFFFFBFF, 0xD65F0BFF) __AARCH64_INSN_FUNCS(eret, 0xFFFFFFFF, 0xD69F03E0) +__AARCH64_INSN_FUNCS(eret_auth, 0xFFFFFBFF, 0xD69F0BFF) __AARCH64_INSN_FUNCS(mrs, 0xFFF00000, 0xD5300000) __AARCH64_INSN_FUNCS(msr_imm, 0xFFF8F01F, 0xD500401F) __AARCH64_INSN_FUNCS(msr_reg, 0xFFF00000, 0xD5100000) diff --git a/arch/arm64/kernel/insn.c 
b/arch/arm64/kernel/insn.c index a107375005bc..ccc8c9e22b25 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -176,7 +176,7 @@ bool __kprobes aarch64_insn_uses_literal(u32 insn) bool __kprobes aarch64_insn_is_branch(u32 insn) { - /* b, bl, cb*, tb*, b.cond, br, blr */ + /* b, bl, cb*, tb*, ret*, b.cond, br*, blr* */ return aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) || @@ -185,8 +185,11 @@ bool __kprobes aarch64_insn_is_branch(u32 insn) aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) || aarch64_insn_is_ret(insn) || + aarch64_insn_is_ret_auth(insn) || aarch64_insn_is_br(insn) || + aarch64_insn_is_br_auth(insn) || aarch64_insn_is_blr(insn) || + aarch64_insn_is_blr_auth(insn) || aarch64_insn_is_bcond(insn); } diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index 263d5fba4c8a..c541fb48886e 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -29,7 +29,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) aarch64_insn_is_msr_imm(insn) || aarch64_insn_is_msr_reg(insn) || aarch64_insn_is_exception(insn) || - aarch64_insn_is_eret(insn)) + aarch64_insn_is_eret(insn) || + aarch64_insn_is_eret_auth(insn)) return false; /* From 4ef333b2d10680b5d966a733ed7171f72164fcd5 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:52 +0530 Subject: [PATCH 053/148] arm64: traps: Allow force_signal_inject to pass esr error code Some error signal need to pass proper ARM esr error code to userspace to better identify the cause of the signal. So the function force_signal_inject is extended to pass this as a parameter. The existing code is not affected by this change. Signed-off-by: Amit Daniel Kachhap Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200914083656.21428-3-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/traps.h | 2 +- arch/arm64/kernel/fpsimd.c | 4 ++-- arch/arm64/kernel/traps.c | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index cee5928e1b7d..d96dc2c7c09d 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -24,7 +24,7 @@ struct undef_hook { void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -void force_signal_inject(int signal, int code, unsigned long address); +void force_signal_inject(int signal, int code, unsigned long address, unsigned int err); void arm64_notify_segfault(unsigned long addr); void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str); void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 55c8f3ec6705..77484359d44a 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -312,7 +312,7 @@ static void fpsimd_save(void) * re-enter user with corrupt state. 
* There's no way to recover, so kill it: */ - force_signal_inject(SIGKILL, SI_KERNEL, 0); + force_signal_inject(SIGKILL, SI_KERNEL, 0, 0); return; } @@ -936,7 +936,7 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs) { /* Even if we chose not to use SVE, the hardware could still trap: */ if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) { - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 13ebd5ca2070..29fd00fe94f2 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -412,7 +412,7 @@ exit: return fn ? fn(regs, instr) : 1; } -void force_signal_inject(int signal, int code, unsigned long address) +void force_signal_inject(int signal, int code, unsigned long address, unsigned int err) { const char *desc; struct pt_regs *regs = current_pt_regs(); @@ -438,7 +438,7 @@ void force_signal_inject(int signal, int code, unsigned long address) signal = SIGKILL; } - arm64_notify_die(desc, regs, signal, code, (void __user *)address, 0); + arm64_notify_die(desc, regs, signal, code, (void __user *)address, err); } /* @@ -455,7 +455,7 @@ void arm64_notify_segfault(unsigned long addr) code = SEGV_ACCERR; mmap_read_unlock(current->mm); - force_signal_inject(SIGSEGV, code, addr); + force_signal_inject(SIGSEGV, code, addr, 0); } void do_undefinstr(struct pt_regs *regs) @@ -468,14 +468,14 @@ void do_undefinstr(struct pt_regs *regs) return; BUG_ON(!user_mode(regs)); - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } NOKPROBE_SYMBOL(do_undefinstr); void do_bti(struct pt_regs *regs) { BUG_ON(!user_mode(regs)); - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } NOKPROBE_SYMBOL(do_bti); @@ -528,7 +528,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) __user_cache_maint("ic ivau", address, ret); break; default: - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -581,7 +581,7 @@ static void mrs_handler(unsigned int esr, struct pt_regs *regs) sysreg = esr_sys64_to_sysreg(esr); if (do_emulate_mrs(regs, sysreg, rt) != 0) - force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc); + force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } static void wfi_handler(unsigned int esr, struct pt_regs *regs) From e16aeb072682d3dcdbdad452c974baa0d2b0c6db Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:53 +0530 Subject: [PATCH 054/148] arm64: ptrauth: Introduce Armv8.3 pointer authentication enhancements Some Armv8.3 Pointer Authentication enhancements have been introduced which are mandatory for Armv8.6 and optional for Armv8.3. These features are, * ARMv8.3-PAuth2 - An enhanced PAC generation logic is added which hardens finding the correct PAC value of the authenticated pointer. * ARMv8.3-FPAC - Fault is generated now when the ptrauth authentication instruction fails in authenticating the PAC present in the address. This is different from earlier case when such failures just adds an error code in the top byte and waits for subsequent load/store to abort. The ptrauth instructions which may cause this fault are autiasp, retaa etc. The above features are now represented by additional configurations for the Address Authentication cpufeature and a new ESR exception class. 
The userspace fault received in the kernel due to ARMv8.3-FPAC is treated as Illegal instruction and hence signal SIGILL is injected with ILL_ILLOPN as the signal code. Note that this is different from earlier ARMv8.3 ptrauth where signal SIGSEGV is issued due to Pointer authentication failures. The in-kernel PAC fault causes kernel to crash. Signed-off-by: Amit Daniel Kachhap Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200914083656.21428-4-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/esr.h | 4 +++- arch/arm64/include/asm/exception.h | 1 + arch/arm64/include/asm/sysreg.h | 24 ++++++++++++++++-------- arch/arm64/kernel/entry-common.c | 21 +++++++++++++++++++++ arch/arm64/kernel/traps.c | 12 ++++++++++++ 5 files changed, 53 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 035003acfa87..22c81f1edda2 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -35,7 +35,9 @@ #define ESR_ELx_EC_SYS64 (0x18) #define ESR_ELx_EC_SVE (0x19) #define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ -/* Unallocated EC: 0x1b - 0x1E */ +/* Unallocated EC: 0x1B */ +#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ +/* Unallocated EC: 0x1D - 0x1E */ #define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW (0x20) #define ESR_ELx_EC_IABT_CUR (0x21) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 7577a754d443..99b9383cd036 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -47,4 +47,5 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr); void do_cp15instr(unsigned int esr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); +void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr); #endif /* __ASM_EXCEPTION_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 554a7e8ecb07..b738bc793369 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -636,14 +636,22 @@ #define ID_AA64ISAR1_APA_SHIFT 4 #define ID_AA64ISAR1_DPB_SHIFT 0 -#define ID_AA64ISAR1_APA_NI 0x0 -#define ID_AA64ISAR1_APA_ARCHITECTED 0x1 -#define ID_AA64ISAR1_API_NI 0x0 -#define ID_AA64ISAR1_API_IMP_DEF 0x1 -#define ID_AA64ISAR1_GPA_NI 0x0 -#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1 -#define ID_AA64ISAR1_GPI_NI 0x0 -#define ID_AA64ISAR1_GPI_IMP_DEF 0x1 +#define ID_AA64ISAR1_APA_NI 0x0 +#define ID_AA64ISAR1_APA_ARCHITECTED 0x1 +#define ID_AA64ISAR1_APA_ARCH_EPAC 0x2 +#define ID_AA64ISAR1_APA_ARCH_EPAC2 0x3 +#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC 0x4 +#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC_CMB 0x5 +#define ID_AA64ISAR1_API_NI 0x0 +#define ID_AA64ISAR1_API_IMP_DEF 0x1 +#define ID_AA64ISAR1_API_IMP_DEF_EPAC 0x2 +#define ID_AA64ISAR1_API_IMP_DEF_EPAC2 0x3 +#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC 0x4 +#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC_CMB 0x5 +#define ID_AA64ISAR1_GPA_NI 0x0 +#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1 +#define ID_AA64ISAR1_GPI_NI 0x0 +#define ID_AA64ISAR1_GPI_IMP_DEF 0x1 /* id_aa64pfr0 */ #define ID_AA64PFR0_CSV3_SHIFT 60 diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index d3be9dbf5490..43d4c329775f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -66,6 +66,13 @@ static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr) } NOKPROBE_SYMBOL(el1_dbg); +static void notrace 
el1_fpac(struct pt_regs *regs, unsigned long esr) +{ + local_daif_inherit(regs); + do_ptrauth_fault(regs, esr); +} +NOKPROBE_SYMBOL(el1_fpac); + asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -92,6 +99,9 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BRK64: el1_dbg(regs, esr); break; + case ESR_ELx_EC_FPAC: + el1_fpac(regs, esr); + break; default: el1_inv(regs, esr); } @@ -227,6 +237,14 @@ static void notrace el0_svc(struct pt_regs *regs) } NOKPROBE_SYMBOL(el0_svc); +static void notrace el0_fpac(struct pt_regs *regs, unsigned long esr) +{ + user_exit_irqoff(); + local_daif_restore(DAIF_PROCCTX); + do_ptrauth_fault(regs, esr); +} +NOKPROBE_SYMBOL(el0_fpac); + asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -272,6 +290,9 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) case ESR_ELx_EC_BRK64: el0_dbg(regs, esr); break; + case ESR_ELx_EC_FPAC: + el0_fpac(regs, esr); + break; default: el0_inv(regs, esr); } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 29fd00fe94f2..b24f81197a68 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -479,6 +479,17 @@ void do_bti(struct pt_regs *regs) } NOKPROBE_SYMBOL(do_bti); +void do_ptrauth_fault(struct pt_regs *regs, unsigned int esr) +{ + /* + * Unexpected FPAC exception or pointer authentication failure in + * the kernel: kill the task before it does any more harm. + */ + BUG_ON(!user_mode(regs)); + force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); +} +NOKPROBE_SYMBOL(do_ptrauth_fault); + #define __user_cache_maint(insn, address, res) \ if (address >= user_addr_max()) { \ res = -EFAULT; \ @@ -775,6 +786,7 @@ static const char *esr_class_str[] = { [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", + [ESR_ELx_EC_FPAC] = "FPAC", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", From ba9d1d3e3e7c34826b62498f7d6563b73c22ac13 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:54 +0530 Subject: [PATCH 055/148] arm64: cpufeature: Modify address authentication cpufeature to exact The current address authentication cpufeature levels are set as LOWER_SAFE which is not compatible with the different configurations added for Armv8.3 ptrauth enhancements as the different levels have different behaviour and there is no tunable to enable the lower safe versions. This is rectified by setting those cpufeature type as EXACT. The current cpufeature framework also does not interfere in the booting of non-exact secondary cpus but rather marks them as tainted. As a workaround this is fixed by replacing the generic match handler with a new handler specific to ptrauth. After this change, if there is any variation in ptrauth configurations in secondary cpus from boot cpu then those mismatched cpus are parked in an infinite loop. Following ptrauth crash log is observed in Arm fastmodel with simulated mismatched cpus without this fix, CPU features: SANITY CHECK: Unexpected variation in SYS_ID_AA64ISAR1_EL1. Boot CPU: 0x11111110211402, CPU4: 0x11111110211102 CPU features: Unsupported CPU feature variation detected. 
GICv3: CPU4: found redistributor 100 region 0:0x000000002f180000 CPU4: Booted secondary processor 0x0000000100 [0x410fd0f0] Unable to handle kernel paging request at virtual address bfff800010dadf3c Mem abort info: ESR = 0x86000004 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 [bfff800010dadf3c] address between user and kernel address ranges Internal error: Oops: 86000004 [#1] PREEMPT SMP Modules linked in: CPU: 4 PID: 29 Comm: migration/4 Tainted: G S 5.8.0-rc4-00005-ge658591d66d1-dirty #158 Hardware name: Foundation-v8A (DT) pstate: 60000089 (nZCv daIf -PAN -UAO BTYPE=--) pc : 0xbfff800010dadf3c lr : __schedule+0x2b4/0x5a8 sp : ffff800012043d70 x29: ffff800012043d70 x28: 0080000000000000 x27: ffff800011cbe000 x26: ffff00087ad37580 x25: ffff00087ad37000 x24: ffff800010de7d50 x23: ffff800011674018 x22: 0784800010dae2a8 x21: ffff00087ad37000 x20: ffff00087acb8000 x19: ffff00087f742100 x18: 0000000000000030 x17: 0000000000000000 x16: 0000000000000000 x15: ffff800011ac1000 x14: 00000000000001bd x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 71519a147ddfeb82 x9 : 825d5ec0fb246314 x8 : ffff00087ad37dd8 x7 : 0000000000000000 x6 : 00000000fffedb0e x5 : 00000000ffffffff x4 : 0000000000000000 x3 : 0000000000000028 x2 : ffff80086e11e000 x1 : ffff00087ad37000 x0 : ffff00087acdc600 Call trace: 0xbfff800010dadf3c schedule+0x78/0x110 schedule_preempt_disabled+0x24/0x40 __kthread_parkme+0x68/0xd0 kthread+0x138/0x160 ret_from_fork+0x10/0x34 Code: bad PC value After this fix, the mismatched CPU4 is parked as, CPU features: CPU4: Detected conflict for capability 39 (Address authentication (IMP DEF algorithm)), System: 1, CPU: 0 CPU4: will not boot CPU4: failed to come online CPU4: died during early boot [Suzuki: Introduce new matching function for address authentication] Suggested-by: Suzuki K Poulose Signed-off-by: Amit Daniel Kachhap Reviewed-by: Suzuki K Poulose Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20200914083656.21428-5-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 44 +++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6424584be01e..fdbc26c0d712 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -197,9 +197,9 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_API_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_API_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH), - FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_APA_SHIFT, 4, 0), + FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_APA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -1648,11 +1648,37 @@ static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused) #endif /* CONFIG_ARM64_RAS_EXTN */ #ifdef CONFIG_ARM64_PTR_AUTH -static bool has_address_auth(const struct arm64_cpu_capabilities *entry, - int __unused) +static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope) { - return __system_matches_cap(ARM64_HAS_ADDRESS_AUTH_ARCH) || - 
__system_matches_cap(ARM64_HAS_ADDRESS_AUTH_IMP_DEF); + int boot_val, sec_val; + + /* We don't expect to be called with SCOPE_SYSTEM */ + WARN_ON(scope == SCOPE_SYSTEM); + /* + * The ptr-auth feature levels are not intercompatible with lower + * levels. Hence we must match ptr-auth feature level of the secondary + * CPUs with that of the boot CPU. The level of boot cpu is fetched + * from the sanitised register whereas direct register read is done for + * the secondary CPUs. + * The sanitised feature state is guaranteed to match that of the + * boot CPU as a mismatched secondary CPU is parked before it gets + * a chance to update the state, with the capability. + */ + boot_val = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg), + entry->field_pos, entry->sign); + if (scope & SCOPE_BOOT_CPU) + return boot_val >= entry->min_field_value; + /* Now check for the secondary CPUs with SCOPE_LOCAL_CPU scope */ + sec_val = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg), + entry->field_pos, entry->sign); + return sec_val == boot_val; +} + +static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry, + int scope) +{ + return has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH], scope) || + has_address_auth_cpucap(cpu_hwcaps_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope); } static bool has_generic_auth(const struct arm64_cpu_capabilities *entry, @@ -2021,7 +2047,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .field_pos = ID_AA64ISAR1_APA_SHIFT, .min_field_value = ID_AA64ISAR1_APA_ARCHITECTED, - .matches = has_cpuid_feature, + .matches = has_address_auth_cpucap, }, { .desc = "Address authentication (IMP DEF algorithm)", @@ -2031,12 +2057,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sign = FTR_UNSIGNED, .field_pos = ID_AA64ISAR1_API_SHIFT, .min_field_value = ID_AA64ISAR1_API_IMP_DEF, - .matches = has_cpuid_feature, + .matches = has_address_auth_cpucap, }, { .capability = ARM64_HAS_ADDRESS_AUTH, .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, - .matches = has_address_auth, + .matches = has_address_auth_metacap, }, { .desc = "Generic authentication (architected algorithm)", From 6560edca515e53bb2e7c637ab324313680a133f4 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:55 +0530 Subject: [PATCH 056/148] arm64: kprobe: disable probe of fault prone ptrauth instruction With the addition of ARMv8.3-FPAC feature, the probe of authenticate ptrauth instructions (AUT*) may cause ptrauth fault exception in case of authenticate failure so they cannot be safely single stepped. Hence the probe of authenticate instructions is disallowed but the corresponding pac ptrauth instruction (PAC*) is not affected and they can still be probed. Also AUTH* instructions do not make sense at function entry points so most realistic probes would be unaffected by this change. 
Signed-off-by: Amit Daniel Kachhap Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200914083656.21428-6-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/insn.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index ccc8c9e22b25..6c0de2f60ea9 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -60,16 +60,10 @@ bool __kprobes aarch64_insn_is_steppable_hint(u32 insn) case AARCH64_INSN_HINT_XPACLRI: case AARCH64_INSN_HINT_PACIA_1716: case AARCH64_INSN_HINT_PACIB_1716: - case AARCH64_INSN_HINT_AUTIA_1716: - case AARCH64_INSN_HINT_AUTIB_1716: case AARCH64_INSN_HINT_PACIAZ: case AARCH64_INSN_HINT_PACIASP: case AARCH64_INSN_HINT_PACIBZ: case AARCH64_INSN_HINT_PACIBSP: - case AARCH64_INSN_HINT_AUTIAZ: - case AARCH64_INSN_HINT_AUTIASP: - case AARCH64_INSN_HINT_AUTIBZ: - case AARCH64_INSN_HINT_AUTIBSP: case AARCH64_INSN_HINT_BTI: case AARCH64_INSN_HINT_BTIC: case AARCH64_INSN_HINT_BTIJ: From 03c9c8fad6cb5e8fdfb40287fa1cdf8ee2db0b67 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 14 Sep 2020 14:06:56 +0530 Subject: [PATCH 057/148] arm64: kprobe: clarify the comment of steppable hint instructions The existing comment about steppable hint instruction is not complete and only describes NOP instructions as steppable. As the function aarch64_insn_is_steppable_hint allows all white-listed instruction to be probed so the comment is updated to reflect this. Signed-off-by: Amit Daniel Kachhap Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200914083656.21428-7-amit.kachhap@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/probes/decode-insn.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index c541fb48886e..104101f633b1 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -43,8 +43,10 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) != AARCH64_INSN_SPCLREG_DAIF; /* - * The HINT instruction is is problematic when single-stepping, - * except for the NOP case. + * The HINT instruction is steppable only if it is in whitelist + * and the rest of other such instructions are blocked for + * single stepping as they may cause exception or other + * unintended behaviour. */ if (aarch64_insn_is_hint(insn)) return aarch64_insn_is_steppable_hint(insn); From 2cf660eb81e93f58ae31215af67fee8499901dd9 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Mon, 14 Sep 2020 09:47:30 +1000 Subject: [PATCH 058/148] arm64/mm: Refactor {pgd, pud, pmd, pte}_ERROR() The function __{pgd, pud, pmd, pte}_error() are introduced so that they can be called by {pgd, pud, pmd, pte}_ERROR(). However, some of the functions could never be called when the corresponding page table level isn't enabled. For example, __{pud, pmd}_error() are unused when PUD and PMD are folded to PGD. This removes __{pgd, pud, pmd, pte}_error() and call pr_err() from {pgd, pud, pmd, pte}_ERROR() directly, similar to what x86/powerpc are doing. With this, the code looks a bit simplified either. 
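To show the shape of the reworked macros, a throwaway userspace mock-up (pr_err() is replaced by fprintf(); this is not kernel code): the report is produced directly at the use site, so no out-of-line __pte_error()-style helper is required and nothing is left unused when a page table level is folded away.

  #include <stdio.h>

  /* Userspace stand-in for the kernel's pr_err()-based pte_ERROR(). */
  #define pte_ERROR(e) \
  	fprintf(stderr, "%s:%d: bad pte %016llx.\n", \
  		__FILE__, __LINE__, (unsigned long long)(e))

  int main(void)
  {
  	pte_ERROR(0xdeadbeefULL);	/* e.g. "demo.c:11: bad pte 00000000deadbeef." */
  	return 0;
  }
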
Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20200913234730.23145-1-gshan@redhat.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 17 ++++++++--------- arch/arm64/kernel/traps.c | 20 -------------------- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d5d3fbe73953..e0ab81923c30 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -35,11 +35,6 @@ extern struct page *vmemmap; -extern void __pte_error(const char *file, int line, unsigned long val); -extern void __pmd_error(const char *file, int line, unsigned long val); -extern void __pud_error(const char *file, int line, unsigned long val); -extern void __pgd_error(const char *file, int line, unsigned long val); - #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE @@ -57,7 +52,8 @@ extern void __pgd_error(const char *file, int line, unsigned long val); extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) -#define pte_ERROR(pte) __pte_error(__FILE__, __LINE__, pte_val(pte)) +#define pte_ERROR(e) \ + pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) /* * Macros to convert between a physical address and its placement in a @@ -541,7 +537,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #if CONFIG_PGTABLE_LEVELS > 2 -#define pmd_ERROR(pmd) __pmd_error(__FILE__, __LINE__, pmd_val(pmd)) +#define pmd_ERROR(e) \ + pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) (!(pud_val(pud) & PUD_TABLE_BIT)) @@ -608,7 +605,8 @@ static inline unsigned long pud_page_vaddr(pud_t pud) #if CONFIG_PGTABLE_LEVELS > 3 -#define pud_ERROR(pud) __pud_error(__FILE__, __LINE__, pud_val(pud)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_none(p4d) (!p4d_val(p4d)) #define p4d_bad(p4d) (!(p4d_val(p4d) & 2)) @@ -667,7 +665,8 @@ static inline unsigned long p4d_page_vaddr(p4d_t p4d) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ -#define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) +#define pgd_ERROR(e) \ + pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e)) #define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) #define pgd_clear_fixmap() clear_fixmap(FIX_PGD) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index af25cedf54e9..1e48b869be21 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -935,26 +935,6 @@ asmlinkage void enter_from_user_mode(void) } NOKPROBE_SYMBOL(enter_from_user_mode); -void __pte_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pte %016lx.\n", file, line, val); -} - -void __pmd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pmd %016lx.\n", file, line, val); -} - -void __pud_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pud %016lx.\n", file, line, val); -} - -void __pgd_error(const char *file, int line, unsigned long val) -{ - pr_err("%s:%d: bad pgd %016lx.\n", file, line, val); -} - /* GENERIC_BUG traps */ int is_valid_bugaddr(unsigned long addr) From 118bb62f271a2b3e2f6ab058b6f43d2601f85480 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 14 Sep 2020 15:28:42 +0800 Subject: [PATCH 059/148] arm64: hibernate: Remove unused including 
Remove including that don't need it. Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1600068522-54499-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/kernel/hibernate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 68e14152d6e9..735cfd8a744f 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include From 2b694fc92a34e8c8b774a17266656d72b8cd4429 Mon Sep 17 00:00:00 2001 From: Tuan Phan Date: Mon, 14 Sep 2020 11:04:16 -0700 Subject: [PATCH 060/148] perf: arm_dsu: Support DSU ACPI devices Add support for probing device from ACPI node. Each DSU ACPI node and its associated cpus are inside a cluster node. Signed-off-by: Tuan Phan Reviewed-by: Suzuki K Poulose Link: https://lore.kernel.org/r/1600106656-9542-1-git-send-email-tuanphan@os.amperecomputing.com Signed-off-by: Will Deacon --- drivers/perf/arm_dsu_pmu.c | 63 ++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c index 96ed93cc78e6..98e68ed7db85 100644 --- a/drivers/perf/arm_dsu_pmu.c +++ b/drivers/perf/arm_dsu_pmu.c @@ -11,6 +11,7 @@ #define DRVNAME PMUNAME "_pmu" #define pr_fmt(fmt) DRVNAME ": " fmt +#include #include #include #include @@ -603,18 +604,19 @@ static struct dsu_pmu *dsu_pmu_alloc(struct platform_device *pdev) } /** - * dsu_pmu_dt_get_cpus: Get the list of CPUs in the cluster. + * dsu_pmu_dt_get_cpus: Get the list of CPUs in the cluster + * from device tree. */ -static int dsu_pmu_dt_get_cpus(struct device_node *dev, cpumask_t *mask) +static int dsu_pmu_dt_get_cpus(struct device *dev, cpumask_t *mask) { int i = 0, n, cpu; struct device_node *cpu_node; - n = of_count_phandle_with_args(dev, "cpus", NULL); + n = of_count_phandle_with_args(dev->of_node, "cpus", NULL); if (n <= 0) return -ENODEV; for (; i < n; i++) { - cpu_node = of_parse_phandle(dev, "cpus", i); + cpu_node = of_parse_phandle(dev->of_node, "cpus", i); if (!cpu_node) break; cpu = of_cpu_node_to_id(cpu_node); @@ -631,6 +633,36 @@ static int dsu_pmu_dt_get_cpus(struct device_node *dev, cpumask_t *mask) return 0; } +/** + * dsu_pmu_acpi_get_cpus: Get the list of CPUs in the cluster + * from ACPI. + */ +static int dsu_pmu_acpi_get_cpus(struct device *dev, cpumask_t *mask) +{ +#ifdef CONFIG_ACPI + int cpu; + + /* + * A dsu pmu node is inside a cluster parent node along with cpu nodes. + * We need to find out all cpus that have the same parent with this pmu. + */ + for_each_possible_cpu(cpu) { + struct acpi_device *acpi_dev; + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpu_dev) + continue; + + acpi_dev = ACPI_COMPANION(cpu_dev); + if (acpi_dev && + acpi_dev->parent == ACPI_COMPANION(dev)->parent) + cpumask_set_cpu(cpu, mask); + } +#endif + + return 0; +} + /* * dsu_pmu_probe_pmu: Probe the PMU details on a CPU in the cluster. 
*/ @@ -676,6 +708,7 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) { int irq, rc; struct dsu_pmu *dsu_pmu; + struct fwnode_handle *fwnode = dev_fwnode(&pdev->dev); char *name; static atomic_t pmu_idx = ATOMIC_INIT(-1); @@ -683,7 +716,16 @@ static int dsu_pmu_device_probe(struct platform_device *pdev) if (IS_ERR(dsu_pmu)) return PTR_ERR(dsu_pmu); - rc = dsu_pmu_dt_get_cpus(pdev->dev.of_node, &dsu_pmu->associated_cpus); + if (IS_ERR_OR_NULL(fwnode)) + return -ENOENT; + + if (is_of_node(fwnode)) + rc = dsu_pmu_dt_get_cpus(&pdev->dev, &dsu_pmu->associated_cpus); + else if (is_acpi_device_node(fwnode)) + rc = dsu_pmu_acpi_get_cpus(&pdev->dev, &dsu_pmu->associated_cpus); + else + return -ENOENT; + if (rc) { dev_warn(&pdev->dev, "Failed to parse the CPUs\n"); return rc; @@ -752,11 +794,21 @@ static const struct of_device_id dsu_pmu_of_match[] = { { .compatible = "arm,dsu-pmu", }, {}, }; +MODULE_DEVICE_TABLE(of, dsu_pmu_of_match); + +#ifdef CONFIG_ACPI +static const struct acpi_device_id dsu_pmu_acpi_match[] = { + { "ARMHD500", 0}, + {}, +}; +MODULE_DEVICE_TABLE(acpi, dsu_pmu_acpi_match); +#endif static struct platform_driver dsu_pmu_driver = { .driver = { .name = DRVNAME, .of_match_table = of_match_ptr(dsu_pmu_of_match), + .acpi_match_table = ACPI_PTR(dsu_pmu_acpi_match), .suppress_bind_attrs = true, }, .probe = dsu_pmu_device_probe, @@ -826,7 +878,6 @@ static void __exit dsu_pmu_exit(void) module_init(dsu_pmu_init); module_exit(dsu_pmu_exit); -MODULE_DEVICE_TABLE(of, dsu_pmu_of_match); MODULE_DESCRIPTION("Perf driver for ARM DynamIQ Shared Unit"); MODULE_AUTHOR("Suzuki K Poulose "); MODULE_LICENSE("GPL v2"); From 5fd39dc220279108131edbc85832a5fef821a826 Mon Sep 17 00:00:00 2001 From: Clint Sbisa Date: Fri, 18 Sep 2020 03:33:12 +0000 Subject: [PATCH 061/148] arm64: Enable PCI write-combine resources under sysfs This change exposes write-combine mappings under sysfs for prefetchable PCI resources on arm64. Originally, the usage of "write combine" here was driven by the x86 definition of write combine. This definition is specific to x86 and does not generalize to other architectures. However, the usage of WC has mutated to "write combine" semantics, which is implemented differently on each arch. Generally, prefetchable BARs are accepted to allow speculative accesses, write combining, and re-ordering-- from the PCI perspective, this means there are no read side effects. (This contradicts the PCI spec which allows prefetchable BARs to have read side effects, but this definition is ill-advised as it is impossible to meet.) On x86, prefetchable BARs are mapped as WC as originally defined (with some conditionals on arch features). On arm64, WC is taken to mean normal non-cacheable memory. In practice, write combine semantics are used to minimize write operations. A common usage of this is minimizing PCI TLPs which can significantly improve performance with PCI devices. In order to provide the same benefits to userspace, we need to allow userspace to map prefetchable BARs with write combine semantics. The resourceX_wc mapping is used today by userspace programs and libraries. While this model is flawed as "write combine" is very ill-defined, it is already used by multiple non-x86 archs to expose write combine semantics to user space. We enable this on arm64 to give userspace on arm64 an equivalent mechanism for utilizing write combining with PCI devices. 
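A hedged usage sketch, not part of the patch, with a purely illustrative device address: with this change a userspace driver on arm64 can map a prefetchable BAR with write-combine semantics through the resourceX_wc sysfs file, just as it already could on x86.

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
  	/* Example device; the real path depends on the system. */
  	const char *path = "/sys/bus/pci/devices/0000:01:00.0/resource0_wc";
  	size_t len = 4096;	/* must not exceed the BAR size */
  	int fd = open(path, O_RDWR);

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}

  	void *bar = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  	if (bar == MAP_FAILED) {
  		perror("mmap");
  		close(fd);
  		return 1;
  	}

  	/* Stores through 'bar' may now be merged into larger PCI write TLPs. */

  	munmap(bar, len);
  	close(fd);
  	return 0;
  }
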
Signed-off-by: Clint Sbisa Acked-by: Catalin Marinas Acked-by: Lorenzo Pieralisi Acked-by: Ard Biesheuvel Cc: Benjamin Herrenschmidt Cc: Bjorn Helgaas Cc: Catalin Marinas Cc: Jason Gunthorpe Cc: Lorenzo Pieralisi Cc: Will Deacon Link: https://lore.kernel.org/r/20200918033312.ddfpibgfylfjpex2@amazon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/pci.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h index 70b323cf8300..b33ca260e3c9 100644 --- a/arch/arm64/include/asm/pci.h +++ b/arch/arm64/include/asm/pci.h @@ -17,6 +17,7 @@ #define pcibios_assign_all_busses() \ (pci_has_flag(PCI_REASSIGN_ALL_BUS)) +#define arch_can_pci_mmap_wc() 1 #define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 extern int isa_dma_bridge_buggy; From e74e1d55728509b352e4eec4283dd5b2781b2070 Mon Sep 17 00:00:00 2001 From: Boyan Karatotev Date: Fri, 18 Sep 2020 11:47:12 +0100 Subject: [PATCH 062/148] kselftests/arm64: add a basic Pointer Authentication test PAuth signs and verifies return addresses on the stack. It does so by inserting a Pointer Authentication code (PAC) into some of the unused top bits of an address. This is achieved by adding paciasp/autiasp instructions at the beginning and end of a function. This feature is partially backwards compatible with earlier versions of the ARM architecture. To coerce the compiler into emitting fully backwards compatible code the main file is compiled to target an earlier ARM version. This allows the tests to check for the feature and print meaningful error messages instead of crashing. Add a test to verify that corrupting the return address results in a SIGSEGV on return. Signed-off-by: Boyan Karatotev Reviewed-by: Vincenzo Frascino Reviewed-by: Amit Daniel Kachhap Acked-by: Shuah Khan Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20200918104715.182310-2-boian4o1@gmail.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/Makefile | 2 +- .../testing/selftests/arm64/pauth/.gitignore | 1 + tools/testing/selftests/arm64/pauth/Makefile | 32 ++++++++++++++ tools/testing/selftests/arm64/pauth/helper.h | 9 ++++ tools/testing/selftests/arm64/pauth/pac.c | 44 +++++++++++++++++++ .../selftests/arm64/pauth/pac_corruptor.S | 19 ++++++++ 6 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/pauth/.gitignore create mode 100644 tools/testing/selftests/arm64/pauth/Makefile create mode 100644 tools/testing/selftests/arm64/pauth/helper.h create mode 100644 tools/testing/selftests/arm64/pauth/pac.c create mode 100644 tools/testing/selftests/arm64/pauth/pac_corruptor.S diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 93b567d23c8b..525506fd97b9 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal +ARM64_SUBTARGETS ?= tags signal pauth else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/pauth/.gitignore b/tools/testing/selftests/arm64/pauth/.gitignore new file mode 100644 index 000000000000..b557c916720a --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/.gitignore @@ -0,0 +1 @@ +pac diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile new file mode 100644 index 000000000000..01d35aaa610a --- /dev/null +++ 
b/tools/testing/selftests/arm64/pauth/Makefile @@ -0,0 +1,32 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020 ARM Limited + +# preserve CC value from top level Makefile +ifeq ($(CC),cc) +CC := $(CROSS_COMPILE)gcc +endif + +CFLAGS += -mbranch-protection=pac-ret +# check if the compiler supports ARMv8.3 and branch protection with PAuth +pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) + +ifeq ($(pauth_cc_support),1) +TEST_GEN_PROGS := pac +TEST_GEN_FILES := pac_corruptor.o +endif + +include ../../lib.mk + +ifeq ($(pauth_cc_support),1) +# pac* and aut* instructions are not available on architectures berfore +# ARMv8.3. Therefore target ARMv8.3 wherever they are used directly +$(OUTPUT)/pac_corruptor.o: pac_corruptor.S + $(CC) -c $^ -o $@ $(CFLAGS) -march=armv8.3-a + +# when -mbranch-protection is enabled and the target architecture is ARMv8.3 or +# greater, gcc emits pac* instructions which are not in HINT NOP space, +# preventing the tests from occurring at all. Compile for ARMv8.2 so tests can +# run on earlier targets and print a meaningful error messages +$(OUTPUT)/pac: pac.c $(OUTPUT)/pac_corruptor.o + $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a +endif diff --git a/tools/testing/selftests/arm64/pauth/helper.h b/tools/testing/selftests/arm64/pauth/helper.h new file mode 100644 index 000000000000..3e0a2a404bf4 --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/helper.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ARM Limited */ + +#ifndef _HELPER_H_ +#define _HELPER_H_ + +void pac_corruptor(void); + +#endif diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c new file mode 100644 index 000000000000..0293310ba70a --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#include +#include +#include + +#include "../../kselftest_harness.h" +#include "helper.h" + +#define ASSERT_PAUTH_ENABLED() \ +do { \ + unsigned long hwcaps = getauxval(AT_HWCAP); \ + /* data key instructions are not in NOP space. This prevents a SIGILL */ \ + ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); \ +} while (0) + +sigjmp_buf jmpbuf; +void pac_signal_handler(int signum, siginfo_t *si, void *uc) +{ + if (signum == SIGSEGV || signum == SIGILL) + siglongjmp(jmpbuf, 1); +} + +/* check that a corrupted PAC results in SIGSEGV or SIGILL */ +TEST(corrupt_pac) +{ + struct sigaction sa; + + ASSERT_PAUTH_ENABLED(); + if (sigsetjmp(jmpbuf, 1) == 0) { + sa.sa_sigaction = pac_signal_handler; + sa.sa_flags = SA_SIGINFO | SA_RESETHAND; + sigemptyset(&sa.sa_mask); + + sigaction(SIGSEGV, &sa, NULL); + sigaction(SIGILL, &sa, NULL); + + pac_corruptor(); + ASSERT_TRUE(0) TH_LOG("SIGSEGV/SIGILL signal did not occur"); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/arm64/pauth/pac_corruptor.S b/tools/testing/selftests/arm64/pauth/pac_corruptor.S new file mode 100644 index 000000000000..aa6588050752 --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/pac_corruptor.S @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ARM Limited */ + +.global pac_corruptor + +.text +/* + * Corrupting a single bit of the PAC ensures the authentication will fail. It + * also guarantees no possible collision. 
TCR_EL1.TBI0 is set by default so no + * top byte PAC is tested + */ + pac_corruptor: + paciasp + + /* corrupt the top bit of the PAC */ + eor lr, lr, #1 << 53 + + autiasp + ret From 766d95b1ed93ebdd07ac87490e60e442342f5dc4 Mon Sep 17 00:00:00 2001 From: Boyan Karatotev Date: Fri, 18 Sep 2020 11:47:13 +0100 Subject: [PATCH 063/148] kselftests/arm64: add nop checks for PAuth tests PAuth adds sign/verify controls to enable and disable groups of instructions in hardware for compatibility with libraries that do not implement PAuth. The kernel always enables them if it detects PAuth. Add a test that checks that each group of instructions is enabled, if the kernel reports PAuth as detected. Note: For groups, for the purpose of this patch, we intend instructions that use a certain key. Signed-off-by: Boyan Karatotev Reviewed-by: Vincenzo Frascino Reviewed-by: Amit Daniel Kachhap Acked-by: Shuah Khan Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20200918104715.182310-3-boian4o1@gmail.com Signed-off-by: Will Deacon --- .../testing/selftests/arm64/pauth/.gitignore | 1 + tools/testing/selftests/arm64/pauth/Makefile | 7 ++- tools/testing/selftests/arm64/pauth/helper.c | 39 ++++++++++++++ tools/testing/selftests/arm64/pauth/helper.h | 9 ++++ tools/testing/selftests/arm64/pauth/pac.c | 51 +++++++++++++++++++ 5 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/arm64/pauth/helper.c diff --git a/tools/testing/selftests/arm64/pauth/.gitignore b/tools/testing/selftests/arm64/pauth/.gitignore index b557c916720a..155137d92722 100644 --- a/tools/testing/selftests/arm64/pauth/.gitignore +++ b/tools/testing/selftests/arm64/pauth/.gitignore @@ -1 +1,2 @@ +exec_target pac diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile index 01d35aaa610a..5c0dd129562f 100644 --- a/tools/testing/selftests/arm64/pauth/Makefile +++ b/tools/testing/selftests/arm64/pauth/Makefile @@ -12,7 +12,7 @@ pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/nu ifeq ($(pauth_cc_support),1) TEST_GEN_PROGS := pac -TEST_GEN_FILES := pac_corruptor.o +TEST_GEN_FILES := pac_corruptor.o helper.o endif include ../../lib.mk @@ -23,10 +23,13 @@ ifeq ($(pauth_cc_support),1) $(OUTPUT)/pac_corruptor.o: pac_corruptor.S $(CC) -c $^ -o $@ $(CFLAGS) -march=armv8.3-a +$(OUTPUT)/helper.o: helper.c + $(CC) -c $^ -o $@ $(CFLAGS) -march=armv8.3-a + # when -mbranch-protection is enabled and the target architecture is ARMv8.3 or # greater, gcc emits pac* instructions which are not in HINT NOP space, # preventing the tests from occurring at all. 
Compile for ARMv8.2 so tests can # run on earlier targets and print a meaningful error messages -$(OUTPUT)/pac: pac.c $(OUTPUT)/pac_corruptor.o +$(OUTPUT)/pac: pac.c $(OUTPUT)/pac_corruptor.o $(OUTPUT)/helper.o $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a endif diff --git a/tools/testing/selftests/arm64/pauth/helper.c b/tools/testing/selftests/arm64/pauth/helper.c new file mode 100644 index 000000000000..2c201e7d0d50 --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/helper.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#include "helper.h" + +size_t keyia_sign(size_t ptr) +{ + asm volatile("paciza %0" : "+r" (ptr)); + return ptr; +} + +size_t keyib_sign(size_t ptr) +{ + asm volatile("pacizb %0" : "+r" (ptr)); + return ptr; +} + +size_t keyda_sign(size_t ptr) +{ + asm volatile("pacdza %0" : "+r" (ptr)); + return ptr; +} + +size_t keydb_sign(size_t ptr) +{ + asm volatile("pacdzb %0" : "+r" (ptr)); + return ptr; +} + +size_t keyg_sign(size_t ptr) +{ + /* output is encoded in the upper 32 bits */ + size_t dest = 0; + size_t modifier = 0; + + asm volatile("pacga %0, %1, %2" : "=r" (dest) : "r" (ptr), "r" (modifier)); + + return dest; +} diff --git a/tools/testing/selftests/arm64/pauth/helper.h b/tools/testing/selftests/arm64/pauth/helper.h index 3e0a2a404bf4..35c4f3357ae3 100644 --- a/tools/testing/selftests/arm64/pauth/helper.h +++ b/tools/testing/selftests/arm64/pauth/helper.h @@ -4,6 +4,15 @@ #ifndef _HELPER_H_ #define _HELPER_H_ +#include + void pac_corruptor(void); +/* PAuth sign a value with key ia and modifier value 0 */ +size_t keyia_sign(size_t val); +size_t keyib_sign(size_t val); +size_t keyda_sign(size_t val); +size_t keydb_sign(size_t val); +size_t keyg_sign(size_t val); + #endif diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index 0293310ba70a..bd3d4c0eca9d 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -8,12 +8,25 @@ #include "../../kselftest_harness.h" #include "helper.h" +#define PAC_COLLISION_ATTEMPTS 10 +/* + * The kernel sets TBID by default. So bits 55 and above should remain + * untouched no matter what. + * The VA space size is 48 bits. Bigger is opt-in. + */ +#define PAC_MASK (~0xff80ffffffffffff) #define ASSERT_PAUTH_ENABLED() \ do { \ unsigned long hwcaps = getauxval(AT_HWCAP); \ /* data key instructions are not in NOP space. This prevents a SIGILL */ \ ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); \ } while (0) +#define ASSERT_GENERIC_PAUTH_ENABLED() \ +do { \ + unsigned long hwcaps = getauxval(AT_HWCAP); \ + /* generic key instructions are not in NOP space. 
This prevents a SIGILL */ \ + ASSERT_NE(0, hwcaps & HWCAP_PACG) TH_LOG("Generic PAUTH not enabled"); \ +} while (0) sigjmp_buf jmpbuf; void pac_signal_handler(int signum, siginfo_t *si, void *uc) @@ -41,4 +54,42 @@ TEST(corrupt_pac) } } +/* + * There are no separate pac* and aut* controls so checking only the pac* + * instructions is sufficient + */ +TEST(pac_instructions_not_nop) +{ + size_t keyia = 0; + size_t keyib = 0; + size_t keyda = 0; + size_t keydb = 0; + + ASSERT_PAUTH_ENABLED(); + + for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) { + keyia |= keyia_sign(i) & PAC_MASK; + keyib |= keyib_sign(i) & PAC_MASK; + keyda |= keyda_sign(i) & PAC_MASK; + keydb |= keydb_sign(i) & PAC_MASK; + } + + ASSERT_NE(0, keyia) TH_LOG("keyia instructions did nothing"); + ASSERT_NE(0, keyib) TH_LOG("keyib instructions did nothing"); + ASSERT_NE(0, keyda) TH_LOG("keyda instructions did nothing"); + ASSERT_NE(0, keydb) TH_LOG("keydb instructions did nothing"); +} + +TEST(pac_instructions_not_nop_generic) +{ + size_t keyg = 0; + + ASSERT_GENERIC_PAUTH_ENABLED(); + + for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) + keyg |= keyg_sign(i) & PAC_MASK; + + ASSERT_NE(0, keyg) TH_LOG("keyg instructions did nothing"); +} + TEST_HARNESS_MAIN From 806a15b2545e6b5949f14ea8401ce295bd38a018 Mon Sep 17 00:00:00 2001 From: Boyan Karatotev Date: Fri, 18 Sep 2020 11:47:14 +0100 Subject: [PATCH 064/148] kselftests/arm64: add PAuth test for whether exec() changes keys Kernel documentation states that it will change PAuth keys on exec() calls. Verify that all keys are correctly switched to new ones. Signed-off-by: Boyan Karatotev Reviewed-by: Vincenzo Frascino Reviewed-by: Amit Daniel Kachhap Acked-by: Shuah Khan Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20200918104715.182310-4-boian4o1@gmail.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/pauth/Makefile | 4 + .../selftests/arm64/pauth/exec_target.c | 34 ++++ tools/testing/selftests/arm64/pauth/helper.h | 10 ++ tools/testing/selftests/arm64/pauth/pac.c | 150 ++++++++++++++++++ 4 files changed, 198 insertions(+) create mode 100644 tools/testing/selftests/arm64/pauth/exec_target.c diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile index 5c0dd129562f..72e290b0b10c 100644 --- a/tools/testing/selftests/arm64/pauth/Makefile +++ b/tools/testing/selftests/arm64/pauth/Makefile @@ -13,6 +13,7 @@ pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/nu ifeq ($(pauth_cc_support),1) TEST_GEN_PROGS := pac TEST_GEN_FILES := pac_corruptor.o helper.o +TEST_GEN_PROGS_EXTENDED := exec_target endif include ../../lib.mk @@ -30,6 +31,9 @@ $(OUTPUT)/helper.o: helper.c # greater, gcc emits pac* instructions which are not in HINT NOP space, # preventing the tests from occurring at all. 
Compile for ARMv8.2 so tests can # run on earlier targets and print a meaningful error messages +$(OUTPUT)/exec_target: exec_target.c $(OUTPUT)/helper.o + $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a + $(OUTPUT)/pac: pac.c $(OUTPUT)/pac_corruptor.o $(OUTPUT)/helper.o $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a endif diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c new file mode 100644 index 000000000000..4435600ca400 --- /dev/null +++ b/tools/testing/selftests/arm64/pauth/exec_target.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#include +#include +#include + +#include "helper.h" + +int main(void) +{ + struct signatures signed_vals; + unsigned long hwcaps; + size_t val; + + fread(&val, sizeof(size_t), 1, stdin); + + /* don't try to execute illegal (unimplemented) instructions) caller + * should have checked this and keep worker simple + */ + hwcaps = getauxval(AT_HWCAP); + + if (hwcaps & HWCAP_PACA) { + signed_vals.keyia = keyia_sign(val); + signed_vals.keyib = keyib_sign(val); + signed_vals.keyda = keyda_sign(val); + signed_vals.keydb = keydb_sign(val); + } + signed_vals.keyg = (hwcaps & HWCAP_PACG) ? keyg_sign(val) : 0; + + fwrite(&signed_vals, sizeof(struct signatures), 1, stdout); + + return 0; +} diff --git a/tools/testing/selftests/arm64/pauth/helper.h b/tools/testing/selftests/arm64/pauth/helper.h index 35c4f3357ae3..652496c7b411 100644 --- a/tools/testing/selftests/arm64/pauth/helper.h +++ b/tools/testing/selftests/arm64/pauth/helper.h @@ -6,6 +6,16 @@ #include +#define NKEYS 5 + +struct signatures { + size_t keyia; + size_t keyib; + size_t keyda; + size_t keydb; + size_t keyg; +}; + void pac_corruptor(void); /* PAuth sign a value with key ia and modifier value 0 */ diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index bd3d4c0eca9d..b363ad6a0b50 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -2,6 +2,8 @@ // Copyright (C) 2020 ARM Limited #include +#include +#include #include #include @@ -28,6 +30,117 @@ do { \ ASSERT_NE(0, hwcaps & HWCAP_PACG) TH_LOG("Generic PAUTH not enabled"); \ } while (0) +void sign_specific(struct signatures *sign, size_t val) +{ + sign->keyia = keyia_sign(val); + sign->keyib = keyib_sign(val); + sign->keyda = keyda_sign(val); + sign->keydb = keydb_sign(val); +} + +void sign_all(struct signatures *sign, size_t val) +{ + sign->keyia = keyia_sign(val); + sign->keyib = keyib_sign(val); + sign->keyda = keyda_sign(val); + sign->keydb = keydb_sign(val); + sign->keyg = keyg_sign(val); +} + +int n_same(struct signatures *old, struct signatures *new, int nkeys) +{ + int res = 0; + + res += old->keyia == new->keyia; + res += old->keyib == new->keyib; + res += old->keyda == new->keyda; + res += old->keydb == new->keydb; + if (nkeys == NKEYS) + res += old->keyg == new->keyg; + + return res; +} + +int exec_sign_all(struct signatures *signed_vals, size_t val) +{ + int new_stdin[2]; + int new_stdout[2]; + int status; + ssize_t ret; + pid_t pid; + + ret = pipe(new_stdin); + if (ret == -1) { + perror("pipe returned error"); + return -1; + } + + ret = pipe(new_stdout); + if (ret == -1) { + perror("pipe returned error"); + return -1; + } + + pid = fork(); + // child + if (pid == 0) { + dup2(new_stdin[0], STDIN_FILENO); + if (ret == -1) { + perror("dup2 returned error"); + exit(1); + } + + dup2(new_stdout[1], STDOUT_FILENO); + if (ret == -1) { + perror("dup2 
returned error"); + exit(1); + } + + close(new_stdin[0]); + close(new_stdin[1]); + close(new_stdout[0]); + close(new_stdout[1]); + + ret = execl("exec_target", "exec_target", (char *)NULL); + if (ret == -1) { + perror("exec returned error"); + exit(1); + } + } + + close(new_stdin[0]); + close(new_stdout[1]); + + ret = write(new_stdin[1], &val, sizeof(size_t)); + if (ret == -1) { + perror("write returned error"); + return -1; + } + + /* + * wait for the worker to finish, so that read() reads all data + * will also context switch with worker so that this function can be used + * for context switch tests + */ + waitpid(pid, &status, 0); + if (WIFEXITED(status) == 0) { + fprintf(stderr, "worker exited unexpectedly\n"); + return -1; + } + if (WEXITSTATUS(status) != 0) { + fprintf(stderr, "worker exited with error\n"); + return -1; + } + + ret = read(new_stdout[0], signed_vals, sizeof(struct signatures)); + if (ret == -1) { + perror("read returned error"); + return -1; + } + + return 0; +} + sigjmp_buf jmpbuf; void pac_signal_handler(int signum, siginfo_t *si, void *uc) { @@ -92,4 +205,41 @@ TEST(pac_instructions_not_nop_generic) ASSERT_NE(0, keyg) TH_LOG("keyg instructions did nothing"); } +/* + * fork() does not change keys. Only exec() does so call a worker program. + * Its only job is to sign a value and report back the resutls + */ +TEST(exec_changed_keys) +{ + struct signatures new_keys; + struct signatures old_keys; + int ret; + int same = 10; + int nkeys = NKEYS; + unsigned long hwcaps = getauxval(AT_HWCAP); + + /* generic and data key instructions are not in NOP space. This prevents a SIGILL */ + ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); + if (!(hwcaps & HWCAP_PACG)) { + TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks"); + nkeys = NKEYS - 1; + } + + for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) { + ret = exec_sign_all(&new_keys, i); + ASSERT_EQ(0, ret) TH_LOG("failed to run worker"); + + if (nkeys == NKEYS) + sign_all(&old_keys, i); + else + sign_specific(&old_keys, i); + + ret = n_same(&old_keys, &new_keys, nkeys); + if (ret < same) + same = ret; + } + + ASSERT_EQ(0, same) TH_LOG("exec() did not change %d keys", same); +} + TEST_HARNESS_MAIN From d21435e9670b11a02bcb1b5683dddd50da61966d Mon Sep 17 00:00:00 2001 From: Boyan Karatotev Date: Fri, 18 Sep 2020 11:47:15 +0100 Subject: [PATCH 065/148] kselftests/arm64: add PAuth tests for single threaded consistency and differently initialized keys PAuth adds 5 different keys that can be used to sign addresses. Add a test that verifies that the kernel initializes them to different values and preserves them across context switches. 
Signed-off-by: Boyan Karatotev Reviewed-by: Vincenzo Frascino Reviewed-by: Amit Daniel Kachhap Acked-by: Shuah Khan Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20200918104715.182310-5-boian4o1@gmail.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/pauth/pac.c | 123 ++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c index b363ad6a0b50..592fe538506e 100644 --- a/tools/testing/selftests/arm64/pauth/pac.c +++ b/tools/testing/selftests/arm64/pauth/pac.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2020 ARM Limited +#define _GNU_SOURCE + #include #include #include #include #include +#include #include "../../kselftest_harness.h" #include "helper.h" @@ -17,6 +20,7 @@ * The VA space size is 48 bits. Bigger is opt-in. */ #define PAC_MASK (~0xff80ffffffffffff) +#define ARBITRARY_VALUE (0x1234) #define ASSERT_PAUTH_ENABLED() \ do { \ unsigned long hwcaps = getauxval(AT_HWCAP); \ @@ -61,13 +65,37 @@ int n_same(struct signatures *old, struct signatures *new, int nkeys) return res; } +int n_same_single_set(struct signatures *sign, int nkeys) +{ + size_t vals[nkeys]; + int same = 0; + + vals[0] = sign->keyia & PAC_MASK; + vals[1] = sign->keyib & PAC_MASK; + vals[2] = sign->keyda & PAC_MASK; + vals[3] = sign->keydb & PAC_MASK; + + if (nkeys >= 4) + vals[4] = sign->keyg & PAC_MASK; + + for (int i = 0; i < nkeys - 1; i++) { + for (int j = i + 1; j < nkeys; j++) { + if (vals[i] == vals[j]) + same += 1; + } + } + return same; +} + int exec_sign_all(struct signatures *signed_vals, size_t val) { int new_stdin[2]; int new_stdout[2]; int status; + int i; ssize_t ret; pid_t pid; + cpu_set_t mask; ret = pipe(new_stdin); if (ret == -1) { @@ -81,6 +109,20 @@ int exec_sign_all(struct signatures *signed_vals, size_t val) return -1; } + /* + * pin this process and all its children to a single CPU, so it can also + * guarantee a context switch with its child + */ + sched_getaffinity(0, sizeof(mask), &mask); + + for (i = 0; i < sizeof(cpu_set_t); i++) + if (CPU_ISSET(i, &mask)) + break; + + CPU_ZERO(&mask); + CPU_SET(i, &mask); + sched_setaffinity(0, sizeof(mask), &mask); + pid = fork(); // child if (pid == 0) { @@ -205,6 +247,44 @@ TEST(pac_instructions_not_nop_generic) ASSERT_NE(0, keyg) TH_LOG("keyg instructions did nothing"); } +TEST(single_thread_different_keys) +{ + int same = 10; + int nkeys = NKEYS; + int tmp; + struct signatures signed_vals; + unsigned long hwcaps = getauxval(AT_HWCAP); + + /* generic and data key instructions are not in NOP space. This prevents a SIGILL */ + ASSERT_NE(0, hwcaps & HWCAP_PACA) TH_LOG("PAUTH not enabled"); + if (!(hwcaps & HWCAP_PACG)) { + TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks"); + nkeys = NKEYS - 1; + } + + /* + * In Linux the PAC field can be up to 7 bits wide. Even if keys are + * different, there is about 5% chance for PACs to collide with + * different addresses. This chance rapidly increases with fewer bits + * allocated for the PAC (e.g. wider address). A comparison of the keys + * directly will be more reliable. 
+ * All signed values need to be different at least once out of n + * attempts to be certain that the keys are different + */ + for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) { + if (nkeys == NKEYS) + sign_all(&signed_vals, i); + else + sign_specific(&signed_vals, i); + + tmp = n_same_single_set(&signed_vals, nkeys); + if (tmp < same) + same = tmp; + } + + ASSERT_EQ(0, same) TH_LOG("%d keys clashed every time", same); +} + /* * fork() does not change keys. Only exec() does so call a worker program. * Its only job is to sign a value and report back the resutls @@ -242,4 +322,47 @@ TEST(exec_changed_keys) ASSERT_EQ(0, same) TH_LOG("exec() did not change %d keys", same); } +TEST(context_switch_keep_keys) +{ + int ret; + struct signatures trash; + struct signatures before; + struct signatures after; + + ASSERT_PAUTH_ENABLED(); + + sign_specific(&before, ARBITRARY_VALUE); + + /* will context switch with a process with different keys at least once */ + ret = exec_sign_all(&trash, ARBITRARY_VALUE); + ASSERT_EQ(0, ret) TH_LOG("failed to run worker"); + + sign_specific(&after, ARBITRARY_VALUE); + + ASSERT_EQ(before.keyia, after.keyia) TH_LOG("keyia changed after context switching"); + ASSERT_EQ(before.keyib, after.keyib) TH_LOG("keyib changed after context switching"); + ASSERT_EQ(before.keyda, after.keyda) TH_LOG("keyda changed after context switching"); + ASSERT_EQ(before.keydb, after.keydb) TH_LOG("keydb changed after context switching"); +} + +TEST(context_switch_keep_keys_generic) +{ + int ret; + struct signatures trash; + size_t before; + size_t after; + + ASSERT_GENERIC_PAUTH_ENABLED(); + + before = keyg_sign(ARBITRARY_VALUE); + + /* will context switch with a process with different keys at least once */ + ret = exec_sign_all(&trash, ARBITRARY_VALUE); + ASSERT_EQ(0, ret) TH_LOG("failed to run worker"); + + after = keyg_sign(ARBITRARY_VALUE); + + ASSERT_EQ(before, after) TH_LOG("keyg changed after context switching"); +} + TEST_HARNESS_MAIN From ca765153eb90577e5fda281485048427b80a9a77 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:32 +0100 Subject: [PATCH 066/148] selftests: arm64: Test case for enumeration of SVE vector lengths Add a test case that verifies that we can enumerate the SVE vector lengths on systems where we detect SVE, and that those SVE vector lengths are valid. This program was written by Dave Martin and adapted to kselftest by me. Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-2-broonie@kernel.org Signed-off-by: Will Deacon --- .../selftests/arm64/fp/sve-probe-vls.c | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tools/testing/selftests/arm64/fp/sve-probe-vls.c diff --git a/tools/testing/selftests/arm64/fp/sve-probe-vls.c b/tools/testing/selftests/arm64/fp/sve-probe-vls.c new file mode 100644 index 000000000000..b29cbc642c57 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/sve-probe-vls.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015-2020 ARM Limited. 
+ * Original author: Dave Martin + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest.h" + +int main(int argc, char **argv) +{ + unsigned int vq; + int vl; + static unsigned int vqs[SVE_VQ_MAX]; + unsigned int nvqs = 0; + + ksft_print_header(); + ksft_set_plan(2); + + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + ksft_exit_skip("SVE not available"); + + /* + * Enumerate up to SVE_VQ_MAX vector lengths + */ + for (vq = SVE_VQ_MAX; vq > 0; --vq) { + vl = prctl(PR_SVE_SET_VL, vq * 16); + if (vl == -1) + ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n", + strerror(errno), errno); + + vl &= PR_SVE_VL_LEN_MASK; + + if (!sve_vl_valid(vl)) + ksft_exit_fail_msg("VL %d invalid\n", vl); + vq = sve_vq_from_vl(vl); + + if (!(nvqs < SVE_VQ_MAX)) + ksft_exit_fail_msg("Too many VLs %u >= SVE_VQ_MAX\n", + nvqs); + vqs[nvqs++] = vq; + } + ksft_test_result_pass("Enumerated %d vector lengths\n", nvqs); + ksft_test_result_pass("All vector lengths valid\n"); + + /* Print out the vector lengths in ascending order: */ + while (nvqs--) + ksft_print_msg("%u\n", 16 * vqs[nvqs]); + + ksft_exit_pass(); +} From 0dca276ac4d20d4071b3d3095b1ad33269ea5272 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:33 +0100 Subject: [PATCH 067/148] selftests: arm64: Add test for the SVE ptrace interface Add a test case that does some basic verification of the SVE ptrace interface, forking off a child with known values in the registers and then using ptrace to inspect and manipulate the SVE registers of the child, including in FPSIMD mode to account for sharing between the SVE and FPSIMD registers. This program was written by Dave Martin and modified for kselftest by me. Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-3-broonie@kernel.org Signed-off-by: Will Deacon --- .../selftests/arm64/fp/sve-ptrace-asm.S | 33 ++ tools/testing/selftests/arm64/fp/sve-ptrace.c | 336 ++++++++++++++++++ 2 files changed, 369 insertions(+) create mode 100644 tools/testing/selftests/arm64/fp/sve-ptrace-asm.S create mode 100644 tools/testing/selftests/arm64/fp/sve-ptrace.c diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S b/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S new file mode 100644 index 000000000000..3e81f9fab574 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2015-2019 ARM Limited. +// Original author: Dave Martin +#include + +.arch_extension sve + +.globl sve_store_patterns + +sve_store_patterns: + mov x1, x0 + + index z0.b, #0, #1 + str q0, [x1] + + mov w8, #__NR_getpid + svc #0 + str q0, [x1, #0x10] + + mov z1.d, z0.d + str q0, [x1, #0x20] + + mov w8, #__NR_getpid + svc #0 + str q0, [x1, #0x30] + + mov z1.d, z0.d + str q0, [x1, #0x40] + + ret + +.size sve_store_patterns, . - sve_store_patterns +.type sve_store_patterns, @function diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c new file mode 100644 index 000000000000..b2282be6f938 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015-2020 ARM Limited. 
+ * Original author: Dave Martin + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest.h" + +/* and don't like each other, so: */ +#ifndef NT_ARM_SVE +#define NT_ARM_SVE 0x405 +#endif + +/* Number of registers filled in by sve_store_patterns */ +#define NR_VREGS 5 + +void sve_store_patterns(__uint128_t v[NR_VREGS]); + +static void dump(const void *buf, size_t size) +{ + size_t i; + const unsigned char *p = buf; + + for (i = 0; i < size; ++i) + printf(" %.2x", *p++); +} + +static int check_vregs(const __uint128_t vregs[NR_VREGS]) +{ + int i; + int ok = 1; + + for (i = 0; i < NR_VREGS; ++i) { + printf("# v[%d]:", i); + dump(&vregs[i], sizeof vregs[i]); + putchar('\n'); + + if (vregs[i] != vregs[0]) + ok = 0; + } + + return ok; +} + +static int do_child(void) +{ + if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno)); + + if (raise(SIGSTOP)) + ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno)); + + return EXIT_SUCCESS; +} + +static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size) +{ + struct user_sve_header *sve; + void *p; + size_t sz = sizeof *sve; + struct iovec iov; + + while (1) { + if (*size < sz) { + p = realloc(*buf, sz); + if (!p) { + errno = ENOMEM; + goto error; + } + + *buf = p; + *size = sz; + } + + iov.iov_base = *buf; + iov.iov_len = sz; + if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov)) + goto error; + + sve = *buf; + if (sve->size <= sz) + break; + + sz = sve->size; + } + + return sve; + +error: + return NULL; +} + +static int set_sve(pid_t pid, const struct user_sve_header *sve) +{ + struct iovec iov; + + iov.iov_base = (void *)sve; + iov.iov_len = sve->size; + return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov); +} + +static void dump_sve_regs(const struct user_sve_header *sve, unsigned int num, + unsigned int vlmax) +{ + unsigned int vq; + unsigned int i; + + if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_SVE) + ksft_exit_fail_msg("Dumping non-SVE register\n"); + + if (vlmax > sve->vl) + vlmax = sve->vl; + + vq = sve_vq_from_vl(sve->vl); + for (i = 0; i < num; ++i) { + printf("# z%u:", i); + dump((const char *)sve + SVE_PT_SVE_ZREG_OFFSET(vq, i), + vlmax); + printf("%s\n", vlmax == sve->vl ? "" : " ..."); + } +} + +static int do_parent(pid_t child) +{ + int ret = EXIT_FAILURE; + pid_t pid; + int status; + siginfo_t si; + void *svebuf = NULL, *newsvebuf; + size_t svebufsz = 0, newsvebufsz; + struct user_sve_header *sve, *new_sve; + struct user_fpsimd_state *fpsimd; + unsigned int i, j; + unsigned char *p; + unsigned int vq; + + /* Attach to the child */ + while (1) { + int sig; + + pid = wait(&status); + if (pid == -1) { + perror("wait"); + goto error; + } + + /* + * This should never happen but it's hard to flag in + * the framework. 
+ */ + if (pid != child) + continue; + + if (WIFEXITED(status) || WIFSIGNALED(status)) + ksft_exit_fail_msg("Child died unexpectedly\n"); + + ksft_test_result(WIFSTOPPED(status), "WIFSTOPPED(%d)\n", + status); + if (!WIFSTOPPED(status)) + goto error; + + sig = WSTOPSIG(status); + + if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) { + if (errno == ESRCH) + goto disappeared; + + if (errno == EINVAL) { + sig = 0; /* bust group-stop */ + goto cont; + } + + ksft_test_result_fail("PTRACE_GETSIGINFO: %s\n", + strerror(errno)); + goto error; + } + + if (sig == SIGSTOP && si.si_code == SI_TKILL && + si.si_pid == pid) + break; + + cont: + if (ptrace(PTRACE_CONT, pid, NULL, sig)) { + if (errno == ESRCH) + goto disappeared; + + ksft_test_result_fail("PTRACE_CONT: %s\n", + strerror(errno)); + goto error; + } + } + + sve = get_sve(pid, &svebuf, &svebufsz); + if (!sve) { + int e = errno; + + ksft_test_result_fail("get_sve: %s\n", strerror(errno)); + if (e == ESRCH) + goto disappeared; + + goto error; + } else { + ksft_test_result_pass("get_sve\n"); + } + + ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD, + "FPSIMD registers\n"); + if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD) + goto error; + + fpsimd = (struct user_fpsimd_state *)((char *)sve + + SVE_PT_FPSIMD_OFFSET); + for (i = 0; i < 32; ++i) { + p = (unsigned char *)&fpsimd->vregs[i]; + + for (j = 0; j < sizeof fpsimd->vregs[i]; ++j) + p[j] = j; + } + + if (set_sve(pid, sve)) { + int e = errno; + + ksft_test_result_fail("set_sve(FPSIMD): %s\n", + strerror(errno)); + if (e == ESRCH) + goto disappeared; + + goto error; + } + + vq = sve_vq_from_vl(sve->vl); + + newsvebufsz = SVE_PT_SVE_ZREG_OFFSET(vq, 1); + new_sve = newsvebuf = malloc(newsvebufsz); + if (!new_sve) { + errno = ENOMEM; + perror(NULL); + goto error; + } + + *new_sve = *sve; + new_sve->flags &= ~SVE_PT_REGS_MASK; + new_sve->flags |= SVE_PT_REGS_SVE; + memset((char *)new_sve + SVE_PT_SVE_ZREG_OFFSET(vq, 0), + 0, SVE_PT_SVE_ZREG_SIZE(vq)); + new_sve->size = SVE_PT_SVE_ZREG_OFFSET(vq, 1); + if (set_sve(pid, new_sve)) { + int e = errno; + + ksft_test_result_fail("set_sve(ZREG): %s\n", strerror(errno)); + if (e == ESRCH) + goto disappeared; + + goto error; + } + + new_sve = get_sve(pid, &newsvebuf, &newsvebufsz); + if (!new_sve) { + int e = errno; + + ksft_test_result_fail("get_sve(ZREG): %s\n", strerror(errno)); + if (e == ESRCH) + goto disappeared; + + goto error; + } + + ksft_test_result((new_sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE, + "SVE registers\n"); + if ((new_sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_SVE) + goto error; + + dump_sve_regs(new_sve, 3, sizeof fpsimd->vregs[0]); + + p = (unsigned char *)new_sve + SVE_PT_SVE_ZREG_OFFSET(vq, 1); + for (i = 0; i < sizeof fpsimd->vregs[0]; ++i) { + unsigned char expected = i; + + if (__BYTE_ORDER == __BIG_ENDIAN) + expected = sizeof fpsimd->vregs[0] - 1 - expected; + + ksft_test_result(p[i] == expected, "p[%d] == expected\n", i); + if (p[i] != expected) + goto error; + } + + ret = EXIT_SUCCESS; + +error: + kill(child, SIGKILL); + +disappeared: + return ret; +} + +int main(void) +{ + int ret = EXIT_SUCCESS; + __uint128_t v[NR_VREGS]; + pid_t child; + + ksft_print_header(); + ksft_set_plan(20); + + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) + ksft_exit_skip("SVE not available\n"); + + sve_store_patterns(v); + + if (!check_vregs(v)) + ksft_exit_fail_msg("Initial check_vregs() failed\n"); + + child = fork(); + if (!child) + return do_child(); + + if (do_parent(child)) + ret = EXIT_FAILURE; + + 
ksft_print_cnts(); + + return 0; +} From 5e992c638ea55d928e43d3c1aab1bd8e13835e6e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:34 +0100 Subject: [PATCH 068/148] selftests: arm64: Add stress tests for FPSMID and SVE context switching Add programs sve-test and fpsimd-test which spin reading and writing to the SVE and FPSIMD registers, verifying the operations they perform. The intended use is to leave them running to stress the context switch code's handling of these registers which isn't compatible with what kselftest does so they're not integrated into the framework but there's no other obvious testsuite where they fit so let's store them here. These tests were written by Dave Martin and lightly adapted by me. Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-4-broonie@kernel.org Signed-off-by: Will Deacon --- .../testing/selftests/arm64/fp/asm-offsets.h | 11 + tools/testing/selftests/arm64/fp/assembler.h | 57 ++ .../testing/selftests/arm64/fp/fpsimd-test.S | 482 +++++++++++++ tools/testing/selftests/arm64/fp/sve-test.S | 672 ++++++++++++++++++ 4 files changed, 1222 insertions(+) create mode 100644 tools/testing/selftests/arm64/fp/asm-offsets.h create mode 100644 tools/testing/selftests/arm64/fp/assembler.h create mode 100644 tools/testing/selftests/arm64/fp/fpsimd-test.S create mode 100644 tools/testing/selftests/arm64/fp/sve-test.S diff --git a/tools/testing/selftests/arm64/fp/asm-offsets.h b/tools/testing/selftests/arm64/fp/asm-offsets.h new file mode 100644 index 000000000000..a180851496ec --- /dev/null +++ b/tools/testing/selftests/arm64/fp/asm-offsets.h @@ -0,0 +1,11 @@ +#define sa_sz 32 +#define sa_flags 8 +#define sa_handler 0 +#define sa_mask_sz 8 +#define SIGUSR1 10 +#define SIGTERM 15 +#define SIGINT 2 +#define SIGABRT 6 +#define SA_NODEFER 1073741824 +#define SA_SIGINFO 4 +#define ucontext_regs 184 diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h new file mode 100644 index 000000000000..8944f2189206 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/assembler.h @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2015-2019 ARM Limited. +// Original author: Dave Martin + +#ifndef ASSEMBLER_H +#define ASSEMBLER_H + +.macro __for from:req, to:req + .if (\from) == (\to) + _for__body %\from + .else + __for \from, %(\from) + ((\to) - (\from)) / 2 + __for %(\from) + ((\to) - (\from)) / 2 + 1, \to + .endif +.endm + +.macro _for var:req, from:req, to:req, insn:vararg + .macro _for__body \var:req + .noaltmacro + \insn + .altmacro + .endm + + .altmacro + __for \from, \to + .noaltmacro + + .purgem _for__body +.endm + +.macro function name + .macro endfunction + .type \name, @function + .purgem endfunction + .endm +\name: +.endm + +.macro define_accessor name, num, insn + .macro \name\()_entry n + \insn \n, 1 + ret + .endm + +function \name + adr x2, .L__accessor_tbl\@ + add x2, x2, x0, lsl #3 + br x2 + +.L__accessor_tbl\@: + _for x, 0, (\num) - 1, \name\()_entry \x +endfunction + + .purgem \name\()_entry +.endm + +#endif /* ! ASSEMBLER_H */ diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S new file mode 100644 index 000000000000..1c5556bdd11d --- /dev/null +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2015-2019 ARM Limited. 
+// Original author: Dave Martin +// +// Simple FPSIMD context switch test +// Repeatedly writes unique test patterns into each FPSIMD register +// and reads them back to verify integrity. +// +// for x in `seq 1 NR_CPUS`; do fpsimd-test & pids=$pids\ $! ; done +// (leave it running for as long as you want...) +// kill $pids + +#include +#include "assembler.h" +#include "asm-offsets.h" + +#define NVR 32 +#define MAXVL_B (128 / 8) + +.macro _vldr Vn:req, Xt:req + ld1 {v\Vn\().2d}, [x\Xt] +.endm + +.macro _vstr Vn:req, Xt:req + st1 {v\Vn\().2d}, [x\Xt] +.endm + +// Generate accessor functions to read/write programmatically selected +// FPSIMD registers. +// x0 is the register index to access +// x1 is the memory address to read from (getv,setp) or store to (setv,setp) +// All clobber x0-x2 +define_accessor setv, NVR, _vldr +define_accessor getv, NVR, _vstr + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +// Print an unsigned decimal number x0 to stdout +// Clobbers x0-x4,x8 +function putdec + mov x1, sp + str x30, [sp, #-32]! // Result can't be > 20 digits + + mov x2, #0 + strb w2, [x1, #-1]! // Write the NUL terminator + + mov x2, #10 +0: udiv x3, x0, x2 // div-mod loop to generate the digits + msub x0, x3, x2, x0 + add w0, w0, #'0' + strb w0, [x1, #-1]! + mov x0, x3 + cbnz x3, 0b + + ldrb w0, [x1] + cbnz w0, 1f + mov w0, #'0' // Print "0" for 0, not "" + strb w0, [x1, #-1]! + +1: mov x0, x1 + bl puts + + ldr x30, [sp], #32 + ret +endfunction + +// Print an unsigned decimal number x0 to stdout, followed by a newline +// Clobbers x0-x5,x8 +function putdecn + mov x5, x30 + + bl putdec + mov x0, #'\n' + bl putc + + ret x5 +endfunction + + +// Clobbers x0-x3,x8 +function puthexb + str x30, [sp, #-0x10]! + + mov w3, w0 + lsr w0, w0, #4 + bl puthexnibble + mov w0, w3 + + ldr x30, [sp], #0x10 + // fall through to puthexnibble +endfunction +// Clobbers x0-x2,x8 +function puthexnibble + and w0, w0, #0xf + cmp w0, #10 + blo 1f + add w0, w0, #'a' - ('9' + 1) +1: add w0, w0, #'0' + b putc +endfunction + +// x0=data in, x1=size in, clobbers x0-x5,x8 +function dumphex + str x30, [sp, #-0x10]! + + mov x4, x0 + mov x5, x1 + +0: subs x5, x5, #1 + b.lo 1f + ldrb w0, [x4], #1 + bl puthexb + b 0b + +1: ldr x30, [sp], #0x10 + ret +endfunction + +// Declare some storate space to shadow the SVE register contents: +.pushsection .text +.data +.align 4 +vref: + .space MAXVL_B * NVR +scratch: + .space MAXVL_B +.popsection + +// Trivial memory copy: copy x2 bytes, starting at address x1, to address x0. 
+// Clobbers x0-x3 +function memcpy + cmp x2, #0 + b.eq 1f +0: ldrb w3, [x1], #1 + strb w3, [x0], #1 + subs x2, x2, #1 + b.ne 0b +1: ret +endfunction + +// Generate a test pattern for storage in SVE registers +// x0: pid (16 bits) +// x1: register number (6 bits) +// x2: generation (4 bits) +function pattern + orr w1, w0, w1, lsl #16 + orr w2, w1, w2, lsl #28 + + ldr x0, =scratch + mov w1, #MAXVL_B / 4 + +0: str w2, [x0], #4 + add w2, w2, #(1 << 22) + subs w1, w1, #1 + bne 0b + + ret +endfunction + +// Get the address of shadow data for FPSIMD V-register V +.macro _adrv xd, xn, nrtmp + ldr \xd, =vref + mov x\nrtmp, #16 + madd \xd, x\nrtmp, \xn, \xd +.endm + +// Set up test pattern in a FPSIMD V-register +// x0: pid +// x1: register number +// x2: generation +function setup_vreg + mov x4, x30 + + mov x6, x1 + bl pattern + _adrv x0, x6, 2 + mov x5, x0 + ldr x1, =scratch + bl memcpy + + mov x0, x6 + mov x1, x5 + bl setv + + ret x4 +endfunction + +// Fill x1 bytes starting at x0 with 0xae (for canary purposes) +// Clobbers x1, x2. +function memfill_ae + mov w2, #0xae + b memfill +endfunction + +// Fill x1 bytes starting at x0 with 0. +// Clobbers x1, x2. +function memclr + mov w2, #0 +endfunction + // fall through to memfill + +// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2 +// Clobbers x1 +function memfill + cmp x1, #0 + b.eq 1f + +0: strb w2, [x0], #1 + subs x1, x1, #1 + b.ne 0b + +1: ret +endfunction + +// Trivial memory compare: compare x2 bytes starting at address x0 with +// bytes starting at address x1. +// Returns only if all bytes match; otherwise, the program is aborted. +// Clobbers x0-x5. +function memcmp + cbz x2, 1f + + mov x5, #0 +0: ldrb w3, [x0, x5] + ldrb w4, [x1, x5] + add x5, x5, #1 + cmp w3, w4 + b.ne barf + subs x2, x2, #1 + b.ne 0b + +1: ret +endfunction + +// Verify that a FPSIMD V-register matches its shadow in memory, else abort +// x0: reg number +// Clobbers x0-x5. +function check_vreg + mov x3, x30 + + _adrv x5, x0, 6 + mov x4, x0 + ldr x7, =scratch + + mov x0, x7 + mov x1, x6 + bl memfill_ae + + mov x0, x4 + mov x1, x7 + bl getv + + mov x0, x5 + mov x1, x7 + mov x2, x6 + mov x30, x3 + b memcmp +endfunction + +// Any SVE register modified here can cause corruption in the main +// thread -- but *only* the registers modified here. +function irritator_handler + // Increment the irritation signal count (x23): + ldr x0, [x2, #ucontext_regs + 8 * 23] + add x0, x0, #1 + str x0, [x2, #ucontext_regs + 8 * 23] + + // Corrupt some random V-regs + adr x0, .text + (irritator_handler - .text) / 16 * 16 + movi v0.8b, #7 + movi v9.16b, #9 + movi v31.8b, #31 + + ret +endfunction + +function terminate_handler + mov w21, w0 + mov x20, x2 + + puts "Terminated by signal " + mov w0, w21 + bl putdec + puts ", no error, iterations=" + ldr x0, [x20, #ucontext_regs + 8 * 22] + bl putdec + puts ", signals=" + ldr x0, [x20, #ucontext_regs + 8 * 23] + bl putdecn + + mov x0, #0 + mov x8, #__NR_exit + svc #0 +endfunction + +// w0: signal number +// x1: sa_action +// w2: sa_flags +// Clobbers x0-x6,x8 +function setsignal + str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]! 
+ + mov w4, w0 + mov x5, x1 + mov w6, w2 + + add x0, sp, #16 + mov x1, #sa_sz + bl memclr + + mov w0, w4 + add x1, sp, #16 + str w6, [x1, #sa_flags] + str x5, [x1, #sa_handler] + mov x2, #0 + mov x3, #sa_mask_sz + mov x8, #__NR_rt_sigaction + svc #0 + + cbz w0, 1f + + puts "sigaction failure\n" + b .Labort + +1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16) + ret +endfunction + +// Main program entry point +.globl _start +function _start +_start: + // Sanity-check and report the vector length + + mov x19, #128 + cmp x19, #128 + b.lo 1f + cmp x19, #2048 + b.hi 1f + tst x19, #(8 - 1) + b.eq 2f + +1: puts "Bad vector length: " + mov x0, x19 + bl putdecn + b .Labort + +2: puts "Vector length:\t" + mov x0, x19 + bl putdec + puts " bits\n" + + // Obtain our PID, to ensure test pattern uniqueness between processes + + mov x8, #__NR_getpid + svc #0 + mov x20, x0 + + puts "PID:\t" + mov x0, x20 + bl putdecn + + mov x23, #0 // Irritation signal count + + mov w0, #SIGINT + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGTERM + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGUSR1 + adr x1, irritator_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + mov x22, #0 // generation number, increments per iteration +.Ltest_loop: + + mov x21, #0 // Set up V-regs & shadow with test pattern +0: mov x0, x20 + mov x1, x21 + and x2, x22, #0xf + bl setup_vreg + add x21, x21, #1 + cmp x21, #NVR + b.lo 0b + +// Can't do this when SVE state is volatile across SVC: + mov x8, #__NR_sched_yield // Encourage preemption + svc #0 + + mov x21, #0 +0: mov x0, x21 + bl check_vreg + add x21, x21, #1 + cmp x21, #NVR + b.lo 0b + + add x22, x22, #1 + b .Ltest_loop + +.Labort: + mov x0, #0 + mov x1, #SIGABRT + mov x8, #__NR_kill + svc #0 +endfunction + +function barf + mov x10, x0 // expected data + mov x11, x1 // actual data + mov x12, x2 // data size + + puts "Mistatch: PID=" + mov x0, x20 + bl putdec + puts ", iteration=" + mov x0, x22 + bl putdec + puts ", reg=" + mov x0, x21 + bl putdecn + puts "\tExpected [" + mov x0, x10 + mov x1, x12 + bl dumphex + puts "]\n\tGot [" + mov x0, x11 + mov x1, x12 + bl dumphex + puts "]\n" + + mov x8, #__NR_exit + mov x1, #1 + svc #0 +endfunction diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S new file mode 100644 index 000000000000..f95074c9b48b --- /dev/null +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright (C) 2015-2019 ARM Limited. +// Original author: Dave Martin +// +// Simple Scalable Vector Extension context switch test +// Repeatedly writes unique test patterns into each SVE register +// and reads them back to verify integrity. +// +// for x in `seq 1 NR_CPUS`; do sve-test & pids=$pids\ $! ; done +// (leave it running for as long as you want...) +// kill $pids + +#include +#include "assembler.h" +#include "asm-offsets.h" + +#define NZR 32 +#define NPR 16 +#define MAXVL_B (2048 / 8) + +.arch_extension sve + +.macro _sve_ldr_v zt, xn + ldr z\zt, [x\xn] +.endm + +.macro _sve_str_v zt, xn + str z\zt, [x\xn] +.endm + +.macro _sve_ldr_p pt, xn + ldr p\pt, [x\xn] +.endm + +.macro _sve_str_p pt, xn + str p\pt, [x\xn] +.endm + +// Generate accessor functions to read/write programmatically selected +// SVE registers. 
+// x0 is the register index to access +// x1 is the memory address to read from (getz,setp) or store to (setz,setp) +// All clobber x0-x2 +define_accessor setz, NZR, _sve_ldr_v +define_accessor getz, NZR, _sve_str_v +define_accessor setp, NPR, _sve_ldr_p +define_accessor getp, NPR, _sve_str_p + +// Print a single character x0 to stdout +// Clobbers x0-x2,x8 +function putc + str x0, [sp, #-16]! + + mov x0, #1 // STDOUT_FILENO + mov x1, sp + mov x2, #1 + mov x8, #__NR_write + svc #0 + + add sp, sp, #16 + ret +endfunction + +// Print a NUL-terminated string starting at address x0 to stdout +// Clobbers x0-x3,x8 +function puts + mov x1, x0 + + mov x2, #0 +0: ldrb w3, [x0], #1 + cbz w3, 1f + add x2, x2, #1 + b 0b + +1: mov w0, #1 // STDOUT_FILENO + mov x8, #__NR_write + svc #0 + + ret +endfunction + +// Utility macro to print a literal string +// Clobbers x0-x4,x8 +.macro puts string + .pushsection .rodata.str1.1, "aMS", 1 +.L__puts_literal\@: .string "\string" + .popsection + + ldr x0, =.L__puts_literal\@ + bl puts +.endm + +// Print an unsigned decimal number x0 to stdout +// Clobbers x0-x4,x8 +function putdec + mov x1, sp + str x30, [sp, #-32]! // Result can't be > 20 digits + + mov x2, #0 + strb w2, [x1, #-1]! // Write the NUL terminator + + mov x2, #10 +0: udiv x3, x0, x2 // div-mod loop to generate the digits + msub x0, x3, x2, x0 + add w0, w0, #'0' + strb w0, [x1, #-1]! + mov x0, x3 + cbnz x3, 0b + + ldrb w0, [x1] + cbnz w0, 1f + mov w0, #'0' // Print "0" for 0, not "" + strb w0, [x1, #-1]! + +1: mov x0, x1 + bl puts + + ldr x30, [sp], #32 + ret +endfunction + +// Print an unsigned decimal number x0 to stdout, followed by a newline +// Clobbers x0-x5,x8 +function putdecn + mov x5, x30 + + bl putdec + mov x0, #'\n' + bl putc + + ret x5 +endfunction + +// Clobbers x0-x3,x8 +function puthexb + str x30, [sp, #-0x10]! + + mov w3, w0 + lsr w0, w0, #4 + bl puthexnibble + mov w0, w3 + + ldr x30, [sp], #0x10 + // fall through to puthexnibble +endfunction +// Clobbers x0-x2,x8 +function puthexnibble + and w0, w0, #0xf + cmp w0, #10 + blo 1f + add w0, w0, #'a' - ('9' + 1) +1: add w0, w0, #'0' + b putc +endfunction + +// x0=data in, x1=size in, clobbers x0-x5,x8 +function dumphex + str x30, [sp, #-0x10]! + + mov x4, x0 + mov x5, x1 + +0: subs x5, x5, #1 + b.lo 1f + ldrb w0, [x4], #1 + bl puthexb + b 0b + +1: ldr x30, [sp], #0x10 + ret +endfunction + +// Declare some storate space to shadow the SVE register contents: +.pushsection .text +.data +.align 4 +zref: + .space MAXVL_B * NZR +pref: + .space MAXVL_B / 8 * NPR +ffrref: + .space MAXVL_B / 8 +scratch: + .space MAXVL_B +.popsection + +// Trivial memory copy: copy x2 bytes, starting at address x1, to address x0. 
+// Clobbers x0-x3 +function memcpy + cmp x2, #0 + b.eq 1f +0: ldrb w3, [x1], #1 + strb w3, [x0], #1 + subs x2, x2, #1 + b.ne 0b +1: ret +endfunction + +// Generate a test pattern for storage in SVE registers +// x0: pid (16 bits) +// x1: register number (6 bits) +// x2: generation (4 bits) + +// These values are used to constuct a 32-bit pattern that is repeated in the +// scratch buffer as many times as will fit: +// bits 31:28 generation number (increments once per test_loop) +// bits 27:22 32-bit lane index +// bits 21:16 register number +// bits 15: 0 pid + +function pattern + orr w1, w0, w1, lsl #16 + orr w2, w1, w2, lsl #28 + + ldr x0, =scratch + mov w1, #MAXVL_B / 4 + +0: str w2, [x0], #4 + add w2, w2, #(1 << 22) + subs w1, w1, #1 + bne 0b + + ret +endfunction + +// Get the address of shadow data for SVE Z-register Z +.macro _adrz xd, xn, nrtmp + ldr \xd, =zref + rdvl x\nrtmp, #1 + madd \xd, x\nrtmp, \xn, \xd +.endm + +// Get the address of shadow data for SVE P-register P +.macro _adrp xd, xn, nrtmp + ldr \xd, =pref + rdvl x\nrtmp, #1 + lsr x\nrtmp, x\nrtmp, #3 + sub \xn, \xn, #NZR + madd \xd, x\nrtmp, \xn, \xd +.endm + +// Set up test pattern in a SVE Z-register +// x0: pid +// x1: register number +// x2: generation +function setup_zreg + mov x4, x30 + + mov x6, x1 + bl pattern + _adrz x0, x6, 2 + mov x5, x0 + ldr x1, =scratch + bl memcpy + + mov x0, x6 + mov x1, x5 + bl setz + + ret x4 +endfunction + +// Set up test pattern in a SVE P-register +// x0: pid +// x1: register number +// x2: generation +function setup_preg + mov x4, x30 + + mov x6, x1 + bl pattern + _adrp x0, x6, 2 + mov x5, x0 + ldr x1, =scratch + bl memcpy + + mov x0, x6 + mov x1, x5 + bl setp + + ret x4 +endfunction + +// Set up test pattern in the FFR +// x0: pid +// x2: generation +// Beware: corrupts P0. +function setup_ffr + mov x4, x30 + + bl pattern + ldr x0, =ffrref + ldr x1, =scratch + rdvl x2, #1 + lsr x2, x2, #3 + bl memcpy + + mov x0, #0 + ldr x1, =ffrref + bl setp + + wrffr p0.b + + ret x4 +endfunction + +// Fill x1 bytes starting at x0 with 0xae (for canary purposes) +// Clobbers x1, x2. +function memfill_ae + mov w2, #0xae + b memfill +endfunction + +// Fill x1 bytes starting at x0 with 0. +// Clobbers x1, x2. +function memclr + mov w2, #0 +endfunction + // fall through to memfill + +// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2 +// Clobbers x1 +function memfill + cmp x1, #0 + b.eq 1f + +0: strb w2, [x0], #1 + subs x1, x1, #1 + b.ne 0b + +1: ret +endfunction + +// Trivial memory compare: compare x2 bytes starting at address x0 with +// bytes starting at address x1. +// Returns only if all bytes match; otherwise, the program is aborted. +// Clobbers x0-x5. +function memcmp + cbz x2, 2f + + stp x0, x1, [sp, #-0x20]! + str x2, [sp, #0x10] + + mov x5, #0 +0: ldrb w3, [x0, x5] + ldrb w4, [x1, x5] + add x5, x5, #1 + cmp w3, w4 + b.ne 1f + subs x2, x2, #1 + b.ne 0b + +1: ldr x2, [sp, #0x10] + ldp x0, x1, [sp], #0x20 + b.ne barf + +2: ret +endfunction + +// Verify that a SVE Z-register matches its shadow in memory, else abort +// x0: reg number +// Clobbers x0-x7. +function check_zreg + mov x3, x30 + + _adrz x5, x0, 6 + mov x4, x0 + ldr x7, =scratch + + mov x0, x7 + mov x1, x6 + bl memfill_ae + + mov x0, x4 + mov x1, x7 + bl getz + + mov x0, x5 + mov x1, x7 + mov x2, x6 + mov x30, x3 + b memcmp +endfunction + +// Verify that a SVE P-register matches its shadow in memory, else abort +// x0: reg number +// Clobbers x0-x7. 
+function check_preg + mov x3, x30 + + _adrp x5, x0, 6 + mov x4, x0 + ldr x7, =scratch + + mov x0, x7 + mov x1, x6 + bl memfill_ae + + mov x0, x4 + mov x1, x7 + bl getp + + mov x0, x5 + mov x1, x7 + mov x2, x6 + mov x30, x3 + b memcmp +endfunction + +// Verify that the FFR matches its shadow in memory, else abort +// Beware -- corrupts P0. +// Clobbers x0-x5. +function check_ffr + mov x3, x30 + + ldr x4, =scratch + rdvl x5, #1 + lsr x5, x5, #3 + + mov x0, x4 + mov x1, x5 + bl memfill_ae + + rdffr p0.b + mov x0, #0 + mov x1, x4 + bl getp + + ldr x0, =ffrref + mov x1, x4 + mov x2, x5 + mov x30, x3 + b memcmp +endfunction + +// Any SVE register modified here can cause corruption in the main +// thread -- but *only* the registers modified here. +function irritator_handler + // Increment the irritation signal count (x23): + ldr x0, [x2, #ucontext_regs + 8 * 23] + add x0, x0, #1 + str x0, [x2, #ucontext_regs + 8 * 23] + + // Corrupt some random Z-regs + adr x0, .text + (irritator_handler - .text) / 16 * 16 + movi v0.8b, #1 + movi v9.16b, #2 + movi v31.8b, #3 + // And P0 + rdffr p0.b + // And FFR + wrffr p15.b + + ret +endfunction + +function terminate_handler + mov w21, w0 + mov x20, x2 + + puts "Terminated by signal " + mov w0, w21 + bl putdec + puts ", no error, iterations=" + ldr x0, [x20, #ucontext_regs + 8 * 22] + bl putdec + puts ", signals=" + ldr x0, [x20, #ucontext_regs + 8 * 23] + bl putdecn + + mov x0, #0 + mov x8, #__NR_exit + svc #0 +endfunction + +// w0: signal number +// x1: sa_action +// w2: sa_flags +// Clobbers x0-x6,x8 +function setsignal + str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]! + + mov w4, w0 + mov x5, x1 + mov w6, w2 + + add x0, sp, #16 + mov x1, #sa_sz + bl memclr + + mov w0, w4 + add x1, sp, #16 + str w6, [x1, #sa_flags] + str x5, [x1, #sa_handler] + mov x2, #0 + mov x3, #sa_mask_sz + mov x8, #__NR_rt_sigaction + svc #0 + + cbz w0, 1f + + puts "sigaction failure\n" + b .Labort + +1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16) + ret +endfunction + +// Main program entry point +.globl _start +function _start +_start: + // Sanity-check and report the vector length + + rdvl x19, #8 + cmp x19, #128 + b.lo 1f + cmp x19, #2048 + b.hi 1f + tst x19, #(8 - 1) + b.eq 2f + +1: puts "Bad vector length: " + mov x0, x19 + bl putdecn + b .Labort + +2: puts "Vector length:\t" + mov x0, x19 + bl putdec + puts " bits\n" + + // Obtain our PID, to ensure test pattern uniqueness between processes + + mov x8, #__NR_getpid + svc #0 + mov x20, x0 + + puts "PID:\t" + mov x0, x20 + bl putdecn + + mov x23, #0 // Irritation signal count + + mov w0, #SIGINT + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGTERM + adr x1, terminate_handler + mov w2, #SA_SIGINFO + bl setsignal + + mov w0, #SIGUSR1 + adr x1, irritator_handler + mov w2, #SA_SIGINFO + orr w2, w2, #SA_NODEFER + bl setsignal + + mov x22, #0 // generation number, increments per iteration +.Ltest_loop: + rdvl x0, #8 + cmp x0, x19 + b.ne vl_barf + + mov x21, #0 // Set up Z-regs & shadow with test pattern +0: mov x0, x20 + mov x1, x21 + and x2, x22, #0xf + bl setup_zreg + add x21, x21, #1 + cmp x21, #NZR + b.lo 0b + + mov x0, x20 // Set up FFR & shadow with test pattern + mov x1, #NZR + NPR + and x2, x22, #0xf + bl setup_ffr + +0: mov x0, x20 // Set up P-regs & shadow with test pattern + mov x1, x21 + and x2, x22, #0xf + bl setup_preg + add x21, x21, #1 + cmp x21, #NZR + NPR + b.lo 0b + +// Can't do this when SVE state is volatile across SVC: +// mov x8, #__NR_sched_yield // Encourage preemption +// svc 
#0 + + mov x21, #0 +0: mov x0, x21 + bl check_zreg + add x21, x21, #1 + cmp x21, #NZR + b.lo 0b + +0: mov x0, x21 + bl check_preg + add x21, x21, #1 + cmp x21, #NZR + NPR + b.lo 0b + + bl check_ffr + + add x22, x22, #1 + b .Ltest_loop + +.Labort: + mov x0, #0 + mov x1, #SIGABRT + mov x8, #__NR_kill + svc #0 +endfunction + +function barf +// fpsimd.c acitivty log dump hack +// ldr w0, =0xdeadc0de +// mov w8, #__NR_exit +// svc #0 +// end hack + mov x10, x0 // expected data + mov x11, x1 // actual data + mov x12, x2 // data size + + puts "Mistatch: PID=" + mov x0, x20 + bl putdec + puts ", iteration=" + mov x0, x22 + bl putdec + puts ", reg=" + mov x0, x21 + bl putdecn + puts "\tExpected [" + mov x0, x10 + mov x1, x12 + bl dumphex + puts "]\n\tGot [" + mov x0, x11 + mov x1, x12 + bl dumphex + puts "]\n" + + mov x8, #__NR_getpid + svc #0 +// fpsimd.c acitivty log dump hack +// ldr w0, =0xdeadc0de +// mov w8, #__NR_exit +// svc #0 +// ^ end of hack + mov x1, #SIGABRT + mov x8, #__NR_kill + svc #0 +// mov x8, #__NR_exit +// mov x1, #1 +// svc #0 +endfunction + +function vl_barf + mov x10, x0 + + puts "Bad active VL: " + mov x0, x10 + bl putdecn + + mov x8, #__NR_exit + mov x1, #1 + svc #0 +endfunction From fc7e611f9f38de7166c5f8951df93f7351542448 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:35 +0100 Subject: [PATCH 069/148] selftests: arm64: Add utility to set SVE vector lengths vlset is a small utility for use in conjunction with tests like the sve-test stress test which allows another executable to be invoked with a configured SVE vector length. Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-5-broonie@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/fp/vlset.c | 155 +++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 tools/testing/selftests/arm64/fp/vlset.c diff --git a/tools/testing/selftests/arm64/fp/vlset.c b/tools/testing/selftests/arm64/fp/vlset.c new file mode 100644 index 000000000000..308d27a68226 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/vlset.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015-2019 ARM Limited. + * Original author: Dave Martin + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int inherit = 0; +static int no_inherit = 0; +static int force = 0; +static unsigned long vl; + +static const struct option options[] = { + { "force", no_argument, NULL, 'f' }, + { "inherit", no_argument, NULL, 'i' }, + { "max", no_argument, NULL, 'M' }, + { "no-inherit", no_argument, &no_inherit, 1 }, + { "help", no_argument, NULL, '?' 
}, + {} +}; + +static char const *program_name; + +static int parse_options(int argc, char **argv) +{ + int c; + char *rest; + + program_name = strrchr(argv[0], '/'); + if (program_name) + ++program_name; + else + program_name = argv[0]; + + while ((c = getopt_long(argc, argv, "Mfhi", options, NULL)) != -1) + switch (c) { + case 'M': vl = SVE_VL_MAX; break; + case 'f': force = 1; break; + case 'i': inherit = 1; break; + case 0: break; + default: goto error; + } + + if (inherit && no_inherit) + goto error; + + if (!vl) { + /* vector length */ + if (optind >= argc) + goto error; + + errno = 0; + vl = strtoul(argv[optind], &rest, 0); + if (*rest) { + vl = ULONG_MAX; + errno = EINVAL; + } + if (vl == ULONG_MAX && errno) { + fprintf(stderr, "%s: %s: %s\n", + program_name, argv[optind], strerror(errno)); + goto error; + } + + ++optind; + } + + /* command */ + if (optind >= argc) + goto error; + + return 0; + +error: + fprintf(stderr, + "Usage: %s [-f | --force] " + "[-i | --inherit | --no-inherit] " + "{-M | --max | } " + " [ ...]\n", + program_name); + return -1; +} + +int main(int argc, char **argv) +{ + int ret = 126; /* same as sh(1) command-not-executable error */ + long flags; + char *path; + int t, e; + + if (parse_options(argc, argv)) + return 2; /* same as sh(1) builtin incorrect-usage */ + + if (vl & ~(vl & PR_SVE_VL_LEN_MASK)) { + fprintf(stderr, "%s: Invalid vector length %lu\n", + program_name, vl); + return 2; /* same as sh(1) builtin incorrect-usage */ + } + + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + fprintf(stderr, "%s: Scalable Vector Extension not present\n", + program_name); + + if (!force) + goto error; + + fputs("Going ahead anyway (--force): " + "This is a debug option. Don't rely on it.\n", + stderr); + } + + flags = PR_SVE_SET_VL_ONEXEC; + if (inherit) + flags |= PR_SVE_VL_INHERIT; + + t = prctl(PR_SVE_SET_VL, vl | flags); + if (t < 0) { + fprintf(stderr, "%s: PR_SVE_SET_VL: %s\n", + program_name, strerror(errno)); + goto error; + } + + t = prctl(PR_SVE_GET_VL); + if (t == -1) { + fprintf(stderr, "%s: PR_SVE_GET_VL: %s\n", + program_name, strerror(errno)); + goto error; + } + flags = PR_SVE_VL_LEN_MASK; + flags = t & ~flags; + + assert(optind < argc); + path = argv[optind]; + + execvp(path, &argv[optind]); + e = errno; + if (errno == ENOENT) + ret = 127; /* same as sh(1) not-found error */ + fprintf(stderr, "%s: %s: %s\n", program_name, path, strerror(e)); + +error: + return ret; /* same as sh(1) not-executable error */ +} From 25f47e3eb66e6ee31b3ed3f8c0b7bdce098106fe Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:36 +0100 Subject: [PATCH 070/148] selftests: arm64: Add wrapper scripts for stress tests Add wrapper scripts which invoke fpsimd-test and sve-test with several copies per CPU such that the context switch code will be appropriately exercised. 
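For readers who prefer C to shell, the spawning strategy can be sketched as below. This is an illustrative stand-alone program, not part of the patch; the "./fpsimd-test" path and the four-instances-per-CPU factor are simply taken from the scripts that follow. It starts several copies of the test binary per CPU and then keeps sending SIGUSR1 so the irritator handlers run concurrently with the main test loops.

  /* Illustrative sketch only: mimic the stress wrappers below in C.
   * Assumes a "./fpsimd-test" binary in the current directory.
   */
  #include <signal.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/types.h>
  #include <unistd.h>

  int main(void)
  {
          long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
          long nchildren = ncpus * 4;
          pid_t *pids = calloc(nchildren, sizeof(*pids));
          long i;

          if (!pids)
                  return 1;

          for (i = 0; i < nchildren; i++) {
                  pids[i] = fork();
                  if (pids[i] < 0) {
                          perror("fork");
                          return 1;
                  }
                  if (pids[i] == 0) {
                          execl("./fpsimd-test", "fpsimd-test", (char *)NULL);
                          _exit(127);     /* exec failed */
                  }
          }

          sleep(10);      /* as in the scripts: give the children time to start */

          for (;;)        /* keep irritating the children until interrupted */
                  for (i = 0; i < nchildren; i++)
                          kill(pids[i], SIGUSR1);
  }

The in-tree wrappers additionally collect each child's output in a temporary log and dump it on exit, which the sketch omits.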
Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-6-broonie@kernel.org Signed-off-by: Will Deacon --- .../testing/selftests/arm64/fp/fpsimd-stress | 60 +++++++++++++++++++ tools/testing/selftests/arm64/fp/sve-stress | 59 ++++++++++++++++++ 2 files changed, 119 insertions(+) create mode 100755 tools/testing/selftests/arm64/fp/fpsimd-stress create mode 100755 tools/testing/selftests/arm64/fp/sve-stress diff --git a/tools/testing/selftests/arm64/fp/fpsimd-stress b/tools/testing/selftests/arm64/fp/fpsimd-stress new file mode 100755 index 000000000000..781b5b022eaf --- /dev/null +++ b/tools/testing/selftests/arm64/fp/fpsimd-stress @@ -0,0 +1,60 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2015-2019 ARM Limited. +# Original author: Dave Martin + +set -ue + +NR_CPUS=`nproc` + +pids= +logs= + +cleanup () { + trap - INT TERM CHLD + set +e + + if [ -n "$pids" ]; then + kill $pids + wait $pids + pids= + fi + + if [ -n "$logs" ]; then + cat $logs + rm $logs + logs= + fi +} + +interrupt () { + cleanup + exit 0 +} + +child_died () { + cleanup + exit 1 +} + +trap interrupt INT TERM EXIT +trap child_died CHLD + +for x in `seq 0 $((NR_CPUS * 4))`; do + log=`mktemp` + logs=$logs\ $log + ./fpsimd-test >$log & + pids=$pids\ $! +done + +# Wait for all child processes to be created: +sleep 10 + +while :; do + kill -USR1 $pids +done & +pids=$pids\ $! + +wait + +exit 1 diff --git a/tools/testing/selftests/arm64/fp/sve-stress b/tools/testing/selftests/arm64/fp/sve-stress new file mode 100755 index 000000000000..24dd0922cc02 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/sve-stress @@ -0,0 +1,59 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only +# Copyright (C) 2015-2019 ARM Limited. +# Original author: Dave Martin + +set -ue + +NR_CPUS=`nproc` + +pids= +logs= + +cleanup () { + trap - INT TERM CHLD + set +e + + if [ -n "$pids" ]; then + kill $pids + wait $pids + pids= + fi + + if [ -n "$logs" ]; then + cat $logs + rm $logs + logs= + fi +} + +interrupt () { + cleanup + exit 0 +} + +child_died () { + cleanup + exit 1 +} + +trap interrupt INT TERM EXIT + +for x in `seq 0 $((NR_CPUS * 4))`; do + log=`mktemp` + logs=$logs\ $log + ./sve-test >$log & + pids=$pids\ $! +done + +# Wait for all child processes to be created: +sleep 10 + +while :; do + kill -USR1 $pids +done & +pids=$pids\ $! + +wait + +exit 1 From e093256d14fbf9e8aaf02ed0ac69087eef6792bd Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 19 Aug 2020 12:48:37 +0100 Subject: [PATCH 071/148] selftests: arm64: Add build and documentation for FP tests Integrate the FP tests with the build system and add some documentation for the ones run outside the kselftest infrastructure. The content in the README was largely written by Dave Martin with edits by me. 
Signed-off-by: Mark Brown Acked-by: Dave Martin Acked-by: Shuah Khan Link: https://lore.kernel.org/r/20200819114837.51466-7-broonie@kernel.org Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/Makefile | 2 +- tools/testing/selftests/arm64/fp/.gitignore | 5 + tools/testing/selftests/arm64/fp/Makefile | 17 ++++ tools/testing/selftests/arm64/fp/README | 100 ++++++++++++++++++++ 4 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/fp/.gitignore create mode 100644 tools/testing/selftests/arm64/fp/Makefile create mode 100644 tools/testing/selftests/arm64/fp/README diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 525506fd97b9..463d56278f8f 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal pauth +ARM64_SUBTARGETS ?= tags signal pauth fp else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore new file mode 100644 index 000000000000..d66f76d2a650 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/.gitignore @@ -0,0 +1,5 @@ +fpsimd-test +sve-probe-vls +sve-ptrace +sve-test +vlset diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile new file mode 100644 index 000000000000..a57009d3a0dc --- /dev/null +++ b/tools/testing/selftests/arm64/fp/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -I../../../../../usr/include/ +TEST_GEN_PROGS := sve-ptrace sve-probe-vls +TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress sve-test sve-stress vlset + +all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED) + +fpsimd-test: fpsimd-test.o + $(CC) -nostdlib $^ -o $@ +sve-ptrace: sve-ptrace.o sve-ptrace-asm.o +sve-probe-vls: sve-probe-vls.o +sve-test: sve-test.o + $(CC) -nostdlib $^ -o $@ +vlset: vlset.o + +include ../../lib.mk diff --git a/tools/testing/selftests/arm64/fp/README b/tools/testing/selftests/arm64/fp/README new file mode 100644 index 000000000000..03e3dad865d8 --- /dev/null +++ b/tools/testing/selftests/arm64/fp/README @@ -0,0 +1,100 @@ +This directory contains a mix of tests integrated with kselftest and +standalone stress tests. + +kselftest tests +=============== + +sve-probe-vls - Checks the SVE vector length enumeration interface +sve-ptrace - Checks the SVE ptrace interface + +Running the non-kselftest tests +=============================== + +sve-stress performs an SVE context switch stress test, as described +below. + +(The fpsimd-stress test works the same way; just substitute "fpsimd" for +"sve" in the following commands.) + + +The test runs until killed by the user. 
+ +If no context switch error was detected, you will see output such as +the following: + +$ ./sve-stress +(wait for some time) +^C +Vector length: 512 bits +PID: 1573 +Terminated by signal 15, no error, iterations=9467, signals=1014 +Vector length: 512 bits +PID: 1575 +Terminated by signal 15, no error, iterations=9448, signals=1028 +Vector length: 512 bits +PID: 1577 +Terminated by signal 15, no error, iterations=9436, signals=1039 +Vector length: 512 bits +PID: 1579 +Terminated by signal 15, no error, iterations=9421, signals=1039 +Vector length: 512 bits +PID: 1581 +Terminated by signal 15, no error, iterations=9403, signals=1039 +Vector length: 512 bits +PID: 1583 +Terminated by signal 15, no error, iterations=9385, signals=1036 +Vector length: 512 bits +PID: 1585 +Terminated by signal 15, no error, iterations=9376, signals=1039 +Vector length: 512 bits +PID: 1587 +Terminated by signal 15, no error, iterations=9361, signals=1039 +Vector length: 512 bits +PID: 1589 +Terminated by signal 15, no error, iterations=9350, signals=1039 + + +If an error was detected, details of the mismatch will be printed +instead of "no error". + +Ideally, the test should be allowed to run for many minutes or hours +to maximise test coverage. + + +KVM stress testing +================== + +To try to reproduce the bugs that we have been observing, sve-stress +should be run in parallel in two KVM guests, while simultaneously +running on the host. + +1) Start 2 guests, using the following command for each: + +$ lkvm run --console=virtio -pconsole=hvc0 --sve Image + +(Depending on the hardware GIC implementation, you may also need +--irqchip=gicv3. New kvmtool defaults to that if appropriate, but I +can't remember whether my branch is new enough for that. Try without +the option first.) + +Kvmtool occupies the terminal until you kill it (Ctrl+A x), +or until the guest terminates. It is therefore recommended to run +each instance in separate terminal (use screen or ssh etc.) This +allows multiple guests to be run in parallel while running other +commands on the host. + +Within the guest, the host filesystem is accessible, mounted on /host. + +2) Run the sve-stress on *each* guest with the Vector-Length set to 32: +guest$ ./vlset --inherit 32 ./sve-stress + +3) Run the sve-stress on the host with the maximum Vector-Length: +host$ ./vlset --inherit --max ./sve-stress + + +Again, the test should be allowed to run for many minutes or hours to +maximise test coverage. + +If no error is detected, you will see output from each sve-stress +instance similar to that illustrated above; otherwise details of the +observed mismatches will be printed. From 264c03a245de7c5b1cc3836db45de6b991f877ca Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 14 Sep 2020 16:34:07 +0100 Subject: [PATCH 072/148] stacktrace: Remove reliable argument from arch_stack_walk() callback Currently the callback passed to arch_stack_walk() has an argument called reliable passed to it to indicate if the stack entry is reliable, a comment says that this is used by some printk() consumers. However in the current kernel none of the arch_stack_walk() implementations ever set this flag to true and the only callback implementation we have is in the generic stacktrace code which ignores the flag. It therefore appears that this flag is redundant so we can simplify and clarify things by removing it. 
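To make the simplified shape concrete, a consumer of arch_stack_walk() now only deals with the cookie and the entry address. The sketch below is illustrative (the struct and function names are made up) and simply mirrors what the generic stacktrace code does after this change: return true to keep walking, false to stop.

  #include <linux/kernel.h>
  #include <linux/stacktrace.h>

  /* Hypothetical consumer: collect up to 16 return addresses. */
  struct my_trace {
          unsigned long entries[16];
          unsigned int nr;
  };

  static bool my_consume_entry(void *cookie, unsigned long addr)
  {
          struct my_trace *t = cookie;

          if (t->nr >= ARRAY_SIZE(t->entries))
                  return false;           /* no space left: stop the walk */

          t->entries[t->nr++] = addr;
          return true;                    /* keep walking */
  }

  /* ... later: arch_stack_walk(my_consume_entry, &t, current, NULL); ... */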
Signed-off-by: Mark Brown Reviewed-by: Miroslav Benes Link: https://lore.kernel.org/r/20200914153409.25097-2-broonie@kernel.org Signed-off-by: Will Deacon --- arch/s390/kernel/stacktrace.c | 4 ++-- arch/x86/kernel/stacktrace.c | 10 +++++----- include/linux/stacktrace.h | 5 +---- kernel/stacktrace.c | 8 +++----- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index fc5419ac64c8..7f1266c24f6b 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -19,7 +19,7 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, unwind_for_each_frame(&state, task, regs, 0) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } @@ -56,7 +56,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, return -EINVAL; #endif - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 2fd698e28e4d..8627fda8d993 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -18,13 +18,13 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct unwind_state state; unsigned long addr; - if (regs && !consume_entry(cookie, regs->ip, false)) + if (regs && !consume_entry(cookie, regs->ip)) return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || !consume_entry(cookie, addr, false)) + if (!addr || !consume_entry(cookie, addr)) break; } } @@ -72,7 +72,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, if (!addr) return -EINVAL; - if (!consume_entry(cookie, addr, false)) + if (!consume_entry(cookie, addr)) return -EINVAL; } @@ -114,7 +114,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, { const void __user *fp = (const void __user *)regs->bp; - if (!consume_entry(cookie, regs->ip, false)) + if (!consume_entry(cookie, regs->ip)) return; while (1) { @@ -128,7 +128,7 @@ void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, break; if (!frame.ret_addr) break; - if (!consume_entry(cookie, frame.ret_addr, false)) + if (!consume_entry(cookie, frame.ret_addr)) break; fp = frame.next_fp; } diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index b7af8cc13eda..50e2df30b0aa 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -29,14 +29,11 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); * stack_trace_consume_fn - Callback for arch_stack_walk() * @cookie: Caller supplied pointer handed back by arch_stack_walk() * @addr: The stack entry address to consume - * @reliable: True when the stack entry is reliable. Required by - * some printk based consumers. 
* * Return: True, if the entry was consumed or skipped * False, if there is no space left to store */ -typedef bool (*stack_trace_consume_fn)(void *cookie, unsigned long addr, - bool reliable); +typedef bool (*stack_trace_consume_fn)(void *cookie, unsigned long addr); /** * arch_stack_walk - Architecture specific function to walk the stack * @consume_entry: Callback which is invoked by the architecture code for diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 946f44a9e86a..9f8117c7cfdd 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -78,8 +78,7 @@ struct stacktrace_cookie { unsigned int len; }; -static bool stack_trace_consume_entry(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry(void *cookie, unsigned long addr) { struct stacktrace_cookie *c = cookie; @@ -94,12 +93,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr, return c->len < c->size; } -static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, - bool reliable) +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr) { if (in_sched_functions(addr)) return true; - return stack_trace_consume_entry(cookie, addr, reliable); + return stack_trace_consume_entry(cookie, addr); } /** From baa2cd417053cb674deb90ee66b88afea0517335 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 14 Sep 2020 16:34:08 +0100 Subject: [PATCH 073/148] arm64: stacktrace: Make stack walk callback consistent with generic code As with the generic arch_stack_walk() code the arm64 stack walk code takes a callback that is called per stack frame. Currently the arm64 code always passes a struct stackframe to the callback and the generic code just passes the pc, however none of the users ever reference anything in the struct other than the pc value. The arm64 code also uses a return type of int while the generic code uses a return type of bool though in both cases the return value is a boolean value and the sense is inverted between the two. In order to reduce code duplication when arm64 is converted to use arch_stack_walk() change the signature and return sense of the arm64 specific callback to match that of the generic code. 
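The part that is easiest to get wrong when converting a caller is the inverted return sense, so here is a before/after sketch of a hypothetical callback (do_something() is made up; the real conversions are in the diff below). The old style returned non-zero to stop the walk, the new style returns true to continue it, and the callback now receives the pc directly rather than the whole stackframe.

  /* Before: int callback taking a stackframe; non-zero return stops the walk. */
  static int old_cb(struct stackframe *frame, void *data)
  {
          return do_something(data, frame->pc) ? 0 : 1;
  }

  /* After: bool callback taking the pc; true means "keep walking". */
  static bool new_cb(void *data, unsigned long pc)
  {
          return do_something(data, pc);
  }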
Signed-off-by: Mark Brown Reviewed-by: Miroslav Benes Link: https://lore.kernel.org/r/20200914153409.25097-3-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/stacktrace.h | 2 +- arch/arm64/kernel/perf_callchain.c | 6 +++--- arch/arm64/kernel/return_address.c | 8 ++++---- arch/arm64/kernel/stacktrace.c | 11 +++++------ 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index fc7613023c19..eb29b1fe8255 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -63,7 +63,7 @@ struct stackframe { extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data); + bool (*fn)(void *, unsigned long), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index b0e03e052dd1..88ff471b0bce 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -137,11 +137,11 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, * whist unwinding the stackframe and is like a subroutine return so we use * the PC. */ -static int callchain_trace(struct stackframe *frame, void *data) +static bool callchain_trace(void *data, unsigned long pc) { struct perf_callchain_entry_ctx *entry = data; - perf_callchain_store(entry, frame->pc); - return 0; + perf_callchain_store(entry, pc); + return true; } void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index a5e8b3b9d798..a6d18755652f 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -18,16 +18,16 @@ struct return_address_data { void *addr; }; -static int save_return_addr(struct stackframe *frame, void *d) +static bool save_return_addr(void *d, unsigned long pc) { struct return_address_data *data = d; if (!data->level) { - data->addr = (void *)frame->pc; - return 1; + data->addr = (void *)pc; + return false; } else { --data->level; - return 0; + return true; } } NOKPROBE_SYMBOL(save_return_addr); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2dd8e3b8b94b..05eaba21fd46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -118,12 +118,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) NOKPROBE_SYMBOL(unwind_frame); void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, - int (*fn)(struct stackframe *, void *), void *data) + bool (*fn)(void *, unsigned long), void *data) { while (1) { int ret; - if (fn(frame, data)) + if (!fn(data, frame->pc)) break; ret = unwind_frame(tsk, frame); if (ret < 0) @@ -139,17 +139,16 @@ struct stack_trace_data { unsigned int skip; }; -static int save_trace(struct stackframe *frame, void *d) +static bool save_trace(void *d, unsigned long addr) { struct stack_trace_data *data = d; struct stack_trace *trace = data->trace; - unsigned long addr = frame->pc; if (data->no_sched_functions && in_sched_functions(addr)) - return 0; + return false; if (data->skip) { data->skip--; - return 0; + return false; } trace->entries[trace->nr_entries++] = addr; From 5fc57df2f6fdc18b9f9273f98318ff7919968c2c Mon Sep 17 00:00:00 2001 From: Mark 
Brown Date: Mon, 14 Sep 2020 16:34:09 +0100 Subject: [PATCH 074/148] arm64: stacktrace: Convert to ARCH_STACKWALK Historically architectures have had duplicated code in their stack trace implementations for filtering what gets traced. In order to avoid this duplication some generic code has been provided using a new interface arch_stack_walk(), enabled by selecting ARCH_STACKWALK in Kconfig, which factors all this out into the generic stack trace code. Convert arm64 to use this common infrastructure. Signed-off-by: Mark Brown Reviewed-by: Miroslav Benes Link: https://lore.kernel.org/r/20200914153409.25097-4-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/stacktrace.c | 79 +++++----------------------------- 2 files changed, 11 insertions(+), 69 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d232837cbee..d1ba52e4b976 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -29,6 +29,7 @@ config ARM64 select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY + select ARCH_STACKWALK select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_DMA_FOR_DEVICE diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 05eaba21fd46..804d076b02cb 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -133,82 +133,23 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, NOKPROBE_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE -struct stack_trace_data { - struct stack_trace *trace; - unsigned int no_sched_functions; - unsigned int skip; -}; -static bool save_trace(void *d, unsigned long addr) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { - struct stack_trace_data *data = d; - struct stack_trace *trace = data->trace; - - if (data->no_sched_functions && in_sched_functions(addr)) - return false; - if (data->skip) { - data->skip--; - return false; - } - - trace->entries[trace->nr_entries++] = addr; - - return trace->nr_entries >= trace->max_entries; -} - -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - struct stack_trace_data data; struct stackframe frame; - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = 0; - - start_backtrace(&frame, regs->regs[29], regs->pc); - walk_stackframe(current, &frame, save_trace, &data); -} -EXPORT_SYMBOL_GPL(save_stack_trace_regs); - -static noinline void __save_stack_trace(struct task_struct *tsk, - struct stack_trace *trace, unsigned int nosched) -{ - struct stack_trace_data data; - struct stackframe frame; - - if (!try_get_task_stack(tsk)) - return; - - data.trace = trace; - data.skip = trace->skip; - data.no_sched_functions = nosched; - - if (tsk != current) { - start_backtrace(&frame, thread_saved_fp(tsk), - thread_saved_pc(tsk)); - } else { - /* We don't want this function nor the caller */ - data.skip += 2; + if (regs) + start_backtrace(&frame, regs->regs[29], regs->pc); + else if (task == current) start_backtrace(&frame, (unsigned long)__builtin_frame_address(0), - (unsigned long)__save_stack_trace); - } + (unsigned long)arch_stack_walk); + else + start_backtrace(&frame, thread_saved_fp(task), + thread_saved_pc(task)); - walk_stackframe(tsk, &frame, save_trace, &data); - - put_task_stack(tsk); + walk_stackframe(task, &frame, consume_entry, cookie); } -void save_stack_trace_tsk(struct task_struct *tsk, struct 
stack_trace *trace) -{ - __save_stack_trace(tsk, trace, 1); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -void save_stack_trace(struct stack_trace *trace) -{ - __save_stack_trace(current, trace, 0); -} - -EXPORT_SYMBOL_GPL(save_stack_trace); #endif From c6b90d5cf637ac1186ca25ec1f9e410455a0ca7d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Tue, 15 Sep 2020 16:19:59 +0800 Subject: [PATCH 075/148] arm64/fpsimd: Fix missing-prototypes in fpsimd.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings. arch/arm64/kernel/fpsimd.c:935:6: warning: no previous prototype for ‘do_sve_acc’ [-Wmissing-prototypes] arch/arm64/kernel/fpsimd.c:962:6: warning: no previous prototype for ‘do_fpsimd_acc’ [-Wmissing-prototypes] arch/arm64/kernel/fpsimd.c:971:6: warning: no previous prototype for ‘do_fpsimd_exc’ [-Wmissing-prototypes] arch/arm64/kernel/fpsimd.c:1266:6: warning: no previous prototype for ‘kernel_neon_begin’ [-Wmissing-prototypes] arch/arm64/kernel/fpsimd.c:1292:6: warning: no previous prototype for ‘kernel_neon_end’ [-Wmissing-prototypes] Signed-off-by: Tian Tao Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/1600157999-14802-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 55c8f3ec6705..0a637e373226 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -32,9 +32,11 @@ #include #include +#include #include #include #include +#include #include #include #include From 152d75d66428afc636ec34bcef8c90a4aa2a7848 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 16 Sep 2020 10:20:47 +0800 Subject: [PATCH 076/148] arm64: mm: Fix missing-prototypes in pageattr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following warnings. ‘set_memory_valid’ [-Wmissing-prototypes] int set_memory_valid(unsigned long addr, int numpages, int enable) ^ ‘set_direct_map_invalid_noflush’ [-Wmissing-prototypes] int set_direct_map_invalid_noflush(struct page *page) ^ ‘set_direct_map_default_noflush’ [-Wmissing-prototypes] int set_direct_map_default_noflush(struct page *page) ^ Signed-off-by: Tian Tao Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/1600222847-56792-1-git-send-email-tiantao6@hisilicon.com arch/arm64/mm/pageattr.c:138:5: warning: no previous prototype for arch/arm64/mm/pageattr.c:150:5: warning: no previous prototype for arch/arm64/mm/pageattr.c:165:5: warning: no previous prototype for Signed-off-by: Will Deacon --- arch/arm64/mm/pageattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 23f648c2a199..1b94f5b82654 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include +#include #include #include From a76b8236edcf5b785d044b930f9e14ad02b4a484 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Tue, 15 Sep 2020 16:41:09 -0400 Subject: [PATCH 077/148] drivers/perf: xgene_pmu: Fix uninitialized resource struct This splat was reported on newer Fedora kernels booting on certain X-gene based machines: xgene-pmu APMC0D83:00: X-Gene PMU version 3 Unable to handle kernel read from unreadable memory at virtual \ address 0000000000004006 ... 
Call trace: string+0x50/0x100 vsnprintf+0x160/0x750 devm_kvasprintf+0x5c/0xb4 devm_kasprintf+0x54/0x60 __devm_ioremap_resource+0xdc/0x1a0 devm_ioremap_resource+0x14/0x20 acpi_get_pmu_hw_inf.isra.0+0x84/0x15c acpi_pmu_dev_add+0xbc/0x21c acpi_ns_walk_namespace+0x16c/0x1e4 acpi_walk_namespace+0xb4/0xfc xgene_pmu_probe_pmu_dev+0x7c/0xe0 xgene_pmu_probe.part.0+0x2c0/0x310 xgene_pmu_probe+0x54/0x64 platform_drv_probe+0x60/0xb4 really_probe+0xe8/0x4a0 driver_probe_device+0xe4/0x100 device_driver_attach+0xcc/0xd4 __driver_attach+0xb0/0x17c bus_for_each_dev+0x6c/0xb0 driver_attach+0x30/0x40 bus_add_driver+0x154/0x250 driver_register+0x84/0x140 __platform_driver_register+0x54/0x60 xgene_pmu_driver_init+0x28/0x34 do_one_initcall+0x40/0x204 do_initcalls+0x104/0x144 kernel_init_freeable+0x198/0x210 kernel_init+0x20/0x12c ret_from_fork+0x10/0x18 Code: 91000400 110004e1 eb08009f 540000c0 (38646846) ---[ end trace f08c10566496a703 ]--- This is due to use of an uninitialized local resource struct in the xgene pmu driver. The thunderx2_pmu driver avoids this by using the resource list constructed by acpi_dev_get_resources() rather than using a callback from that function. The callback in the xgene driver didn't fully initialize the resource. So get rid of the callback and search the resource list as done by thunderx2. Fixes: 832c927d119b ("perf: xgene: Add APM X-Gene SoC Performance Monitoring Unit driver") Signed-off-by: Mark Salter Link: https://lore.kernel.org/r/20200915204110.326138-1-msalter@redhat.com Signed-off-by: Will Deacon --- drivers/perf/xgene_pmu.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c index edac28cd25dd..633cf07ba672 100644 --- a/drivers/perf/xgene_pmu.c +++ b/drivers/perf/xgene_pmu.c @@ -1453,17 +1453,6 @@ static char *xgene_pmu_dev_name(struct device *dev, u32 type, int id) } #if defined(CONFIG_ACPI) -static int acpi_pmu_dev_add_resource(struct acpi_resource *ares, void *data) -{ - struct resource *res = data; - - if (ares->type == ACPI_RESOURCE_TYPE_FIXED_MEMORY32) - acpi_dev_resource_memory(ares, res); - - /* Always tell the ACPI core to skip this resource */ - return 1; -} - static struct xgene_pmu_dev_ctx *acpi_get_pmu_hw_inf(struct xgene_pmu *xgene_pmu, struct acpi_device *adev, u32 type) @@ -1475,6 +1464,7 @@ xgene_pmu_dev_ctx *acpi_get_pmu_hw_inf(struct xgene_pmu *xgene_pmu, struct hw_pmu_info *inf; void __iomem *dev_csr; struct resource res; + struct resource_entry *rentry; int enable_bit; int rc; @@ -1483,11 +1473,23 @@ xgene_pmu_dev_ctx *acpi_get_pmu_hw_inf(struct xgene_pmu *xgene_pmu, return NULL; INIT_LIST_HEAD(&resource_list); - rc = acpi_dev_get_resources(adev, &resource_list, - acpi_pmu_dev_add_resource, &res); + rc = acpi_dev_get_resources(adev, &resource_list, NULL, NULL); + if (rc <= 0) { + dev_err(dev, "PMU type %d: No resources found\n", type); + return NULL; + } + + list_for_each_entry(rentry, &resource_list, node) { + if (resource_type(rentry->res) == IORESOURCE_MEM) { + res = *rentry->res; + rentry = NULL; + break; + } + } acpi_dev_free_resource_list(&resource_list); - if (rc < 0) { - dev_err(dev, "PMU type %d: No resource address found\n", type); + + if (rentry) { + dev_err(dev, "PMU type %d: No memory resource found\n", type); return NULL; } From 688494a407d1419a6b158c644b262c61cde39f48 Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Tue, 15 Sep 2020 16:41:10 -0400 Subject: [PATCH 078/148] drivers/perf: thunderx2_pmu: Fix memory resource error handling 
In tx2_uncore_pmu_init_dev(), a call to acpi_dev_get_resources() is used to create a list of _CRS resources which is searched for the device base address. There is an error check following this: "if (!rentry->res) return NULL". In no case will rentry->res be NULL, so the test is useless. Even if the test worked, it comes before the resource list memory is freed. None of this really matters as long as the ACPI table has the memory resource. Let's clean it up so that it makes sense and will give a meaningful error should firmware leave out the memory resource. Fixes: 69c32972d593 ("drivers/perf: Add Cavium ThunderX2 SoC UNCORE PMU driver") Signed-off-by: Mark Salter Link: https://lore.kernel.org/r/20200915204110.326138-2-msalter@redhat.com Signed-off-by: Will Deacon --- drivers/perf/thunderx2_pmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c index aac9823b0c6b..e116815fa809 100644 --- a/drivers/perf/thunderx2_pmu.c +++ b/drivers/perf/thunderx2_pmu.c @@ -805,14 +805,17 @@ static struct tx2_uncore_pmu *tx2_uncore_pmu_init_dev(struct device *dev, list_for_each_entry(rentry, &list, node) { if (resource_type(rentry->res) == IORESOURCE_MEM) { res = *rentry->res; + rentry = NULL; break; } } - - if (!rentry->res) - return NULL; - acpi_dev_free_resource_list(&list); + acpi_dev_free_resource_list(&list); + + if (rentry) { + dev_err(dev, "PMU type %d: Fail to find resource\n", type); + return NULL; + } + base = devm_ioremap_resource(dev, &res); if (IS_ERR(base)) { dev_err(dev, "PMU type %d: Fail to map resource\n", type); From 0fdb64c2a303e4d2562245501920241c0e507951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 15:48:09 +0100 Subject: [PATCH 079/148] arm64: Improve diagnostics when trapping BRK with FAULT_BRK_IMM When generating instructions at runtime, for example due to kernel text patching or the BPF JIT, we can emit a trapping BRK instruction if we are asked to encode an invalid instruction such as an out-of-range branch. This is indicative of a bug in the caller, and will result in a crash on executing the generated code. Unfortunately, the message from the crash is really unhelpful, and mumbles something about ptrace: | Unexpected kernel BRK exception at EL1 | Internal error: ptrace BRK handler: f2000100 [#1] SMP We can do better than this. Install a break handler for FAULT_BRK_IMM, which is the immediate used to encode the "I've been asked to generate an invalid instruction" error, and triage the faulting PC to determine whether or not the failure occurred in the BPF JIT.
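The fix builds on the existing kernel break-hook interface; registering a handler for a BRK immediate is, in sketch form, just the following (the 0x42 immediate and the names here are made-up examples — the patch below does the same thing for FAULT_BRK_IMM):

  #include <asm/debug-monitors.h>
  #include <linux/printk.h>
  #include <linux/ptrace.h>

  /* Hypothetical example: claim BRK #0x42 and report where it fired. */
  static int example_brk_handler(struct pt_regs *regs, unsigned int esr)
  {
          pr_err("unexpected BRK hit at %pS\n",
                 (void *)instruction_pointer(regs));

          return DBG_HOOK_ERROR;  /* not handled: fall back to the normal BRK path */
  }

  static struct break_hook example_brk_hook = {
          .fn     = example_brk_handler,
          .imm    = 0x42,
  };

  /* ... register_kernel_break_hook(&example_brk_hook); from an init path ... */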
Link: https://lore.kernel.org/r/20200915141707.GB26439@willie-the-truck Reported-by: Ilias Apalodimas Signed-off-by: Will Deacon --- arch/arm64/include/asm/extable.h | 9 +++++++++ arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/traps.c | 17 +++++++++++++++++ arch/arm64/mm/extable.c | 4 +--- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 840a35ed92ec..b15eb4a3e6b2 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -22,6 +22,15 @@ struct exception_table_entry #define ARCH_HAS_RELATIVE_EXTABLE +static inline bool in_bpf_jit(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_BPF_JIT)) + return false; + + return regs->pc >= BPF_JIT_REGION_START && + regs->pc < BPF_JIT_REGION_END; +} + #ifdef CONFIG_BPF_JIT int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, struct pt_regs *regs); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 7310a4f7f993..fa76151de6ff 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -384,7 +384,7 @@ void __init debug_traps_init(void) hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP, TRAP_TRACE, "single-step handler"); hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP, - TRAP_BRKPT, "ptrace BRK handler"); + TRAP_BRKPT, "BRK handler"); } /* Re-enable single step for syscall restarting. */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 13ebd5ca2070..00e92c9f338c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -994,6 +995,21 @@ static struct break_hook bug_break_hook = { .imm = BUG_BRK_IMM, }; +static int reserved_fault_handler(struct pt_regs *regs, unsigned int esr) +{ + pr_err("%s generated an invalid instruction at %pS!\n", + in_bpf_jit(regs) ? "BPF JIT" : "Kernel text patching", + (void *)instruction_pointer(regs)); + + /* We cannot handle this */ + return DBG_HOOK_ERROR; +} + +static struct break_hook fault_break_hook = { + .fn = reserved_fault_handler, + .imm = FAULT_BRK_IMM, +}; + #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 @@ -1059,6 +1075,7 @@ int __init early_brk64(unsigned long addr, unsigned int esr, void __init trap_init(void) { register_kernel_break_hook(&bug_break_hook); + register_kernel_break_hook(&fault_break_hook); #ifdef CONFIG_KASAN_SW_TAGS register_kernel_break_hook(&kasan_break_hook); #endif diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index eee1732ab6cd..aa0060178343 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -14,9 +14,7 @@ int fixup_exception(struct pt_regs *regs) if (!fixup) return 0; - if (IS_ENABLED(CONFIG_BPF_JIT) && - regs->pc >= BPF_JIT_REGION_START && - regs->pc < BPF_JIT_REGION_END) + if (in_bpf_jit(regs)) return arm64_bpf_fixup_exception(fixup, regs); regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; From f186a84d8abecc7dfbd5bbd0a8ad0358078e4767 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:48 +0100 Subject: [PATCH 080/148] arm64/fpsimd: Update documentation of do_sve_acc fpsimd_restore_current_state() enables and disables the SVE access trap based on TIF_SVE, not task_fpsimd_load(). Update the documentation of do_sve_acc to reflect this behavior. 
Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-2-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/fpsimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 55c8f3ec6705..1c6a82083d5c 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -928,7 +928,7 @@ void fpsimd_release_task(struct task_struct *dead_task) * the SVE access trap will be disabled the next time this task * reaches ret_to_user. * - * TIF_SVE should be clear on entry: otherwise, task_fpsimd_load() + * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state() * would have disabled the SVE access trap for userspace during * ret_to_user, making an SVE access trap impossible in that case. */ From 68a4c52e55e024ad946779f20d50ab31860eb0cc Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:49 +0100 Subject: [PATCH 081/148] arm64/signal: Update the comment in preserve_sve_context The SVE state is saved by fpsimd_signal_preserve_current_state() and not preserve_fpsimd_context(). Update the comment in preserve_sve_context to reflect the current behavior. Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-3-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 3b4f31f35e45..f3af68dc1cf8 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -244,7 +244,8 @@ static int preserve_sve_context(struct sve_context __user *ctx) if (vq) { /* * This assumes that the SVE state has already been saved to - * the task struct by calling preserve_fpsimd_context(). + * the task struct by calling the function + * fpsimd_signal_preserve_current_state(). */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, From 315cf047d230a363515bedce177cfbf460cbb2d6 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:51 +0100 Subject: [PATCH 082/148] arm64/fpsimdmacros: Introduce a macro to update ZCR_EL1.LEN A follow-up patch will need to update ZCR_EL1.LEN. Add a macro that could be re-used in the current and new places to avoid code duplication. 
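In C-flavoured terms the new macro performs a read-modify-write of ZCR_EL1.LEN and skips the (self-synchronising, hence comparatively expensive) system register write when the field already holds the requested value. The sketch below only illustrates that logic; the real implementation is the assembly macro added in the diff.

  /* Sketch of the sve_load_vq logic; illustration only, not the real code. */
  static inline void sve_load_vq_sketch(unsigned long vq_minus_1)
  {
          unsigned long zcr = read_sysreg_s(SYS_ZCR_EL1);
          unsigned long new_zcr = (zcr & ~ZCR_ELx_LEN_MASK) | vq_minus_1;

          if (new_zcr != zcr)
                  write_sysreg_s(new_zcr, SYS_ZCR_EL1);  /* self-synchronising */
  }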
Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-5-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimdmacros.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index 636e9d9c7929..60e29a3e41c5 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -183,6 +183,17 @@ .purgem _for__body .endm +/* Update ZCR_EL1.LEN with the new VQ */ +.macro sve_load_vq xvqminus1, xtmp, xtmp2 + mrs_s \xtmp, SYS_ZCR_EL1 + bic \xtmp2, \xtmp, ZCR_ELx_LEN_MASK + orr \xtmp2, \xtmp2, \xvqminus1 + cmp \xtmp2, \xtmp + b.eq 921f + msr_s SYS_ZCR_EL1, \xtmp2 //self-synchronising +921: +.endm + .macro sve_save nxbase, xpfpsr, nxtmp _for n, 0, 31, _sve_str_v \n, \nxbase, \n - 34 _for n, 0, 15, _sve_str_p \n, \nxbase, \n - 16 @@ -197,13 +208,7 @@ .endm .macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2 - mrs_s x\nxtmp, SYS_ZCR_EL1 - bic \xtmp2, x\nxtmp, ZCR_ELx_LEN_MASK - orr \xtmp2, \xtmp2, \xvqminus1 - cmp \xtmp2, x\nxtmp - b.eq 921f - msr_s SYS_ZCR_EL1, \xtmp2 // self-synchronising -921: + sve_load_vq \xvqminus1, x\nxtmp, \xtmp2 _for n, 0, 31, _sve_ldr_v \n, \nxbase, \n - 34 _sve_ldr_p 0, \nxbase _sve_wrffr 0 From 6d40f05fad0babfb3e991edb2d0bd28a30a2f10c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:50 +0100 Subject: [PATCH 083/148] arm64/fpsimdmacros: Allow the macro "for" to be used in more cases The current version of the macro "for" is not able to work when the counter is used to generate registers using mnemonics. This is because gas is not able to evaluate the expression generated if used in register's name (i.e x\n). Gas offers a way to evaluate macro arguments by using % in front of them under the alternate macro mode. The implementation of "for" is updated to use the alternate macro mode and %, so we can use the macro in more cases. As the alternate macro mode may have side-effects, this is disabled when expanding the body. While it is enough to prefix the argument of the macro "__for_body" with %, the arguments of "__for" are also prefixed to get a more bearable value in case of compilation error. 
Suggested-by: Dave Martin Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-4-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimdmacros.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index 60e29a3e41c5..feef5b371fba 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -166,19 +166,23 @@ .macro __for from:req, to:req .if (\from) == (\to) - _for__body \from + _for__body %\from .else - __for \from, (\from) + ((\to) - (\from)) / 2 - __for (\from) + ((\to) - (\from)) / 2 + 1, \to + __for %\from, %((\from) + ((\to) - (\from)) / 2) + __for %((\from) + ((\to) - (\from)) / 2 + 1), %\to .endif .endm .macro _for var:req, from:req, to:req, insn:vararg .macro _for__body \var:req + .noaltmacro \insn + .altmacro .endm + .altmacro __for \from, \to + .noaltmacro .purgem _for__body .endm From 1e530f1352a2709d7bcacdb8b6d42c8900ba2c80 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:52 +0100 Subject: [PATCH 084/148] arm64/sve: Implement a helper to flush SVE registers Introduce a new helper that will zero all SVE registers but the first 128-bits of each vector. This will be used by subsequent patches to avoid costly store/maipulate/reload sequences in places like do_sve_acc(). Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-6-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimd.h | 1 + arch/arm64/include/asm/fpsimdmacros.h | 19 +++++++++++++++++++ arch/arm64/kernel/entry-fpsimd.S | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 59f10dd13f12..958f642e930d 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -69,6 +69,7 @@ static inline void *sve_pffr(struct thread_struct *thread) extern void sve_save_state(void *state, u32 *pfpsr); extern void sve_load_state(void const *state, u32 const *pfpsr, unsigned long vq_minus_1); +extern void sve_flush_live(void); extern unsigned int sve_get_vl(void); struct arm64_cpu_capabilities; diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index feef5b371fba..af43367534c7 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -164,6 +164,13 @@ | ((\np) << 5) .endm +/* PFALSE P\np.B */ +.macro _sve_pfalse np + _sve_check_preg \np + .inst 0x2518e400 \ + | (\np) +.endm + .macro __for from:req, to:req .if (\from) == (\to) _for__body %\from @@ -198,6 +205,18 @@ 921: .endm +/* Preserve the first 128-bits of Znz and zero the rest. 
*/ +.macro _sve_flush_z nz + _sve_check_zreg \nz + mov v\nz\().16b, v\nz\().16b +.endm + +.macro sve_flush + _for n, 0, 31, _sve_flush_z \n + _for n, 0, 15, _sve_pfalse \n + _sve_wrffr 0 +.endm + .macro sve_save nxbase, xpfpsr, nxtmp _for n, 0, 31, _sve_str_v \n, \nxbase, \n - 34 _for n, 0, 15, _sve_str_p \n, \nxbase, \n - 16 diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index f880dd63ddc3..fdf7a5a35c63 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -32,6 +32,7 @@ SYM_FUNC_START(fpsimd_load_state) SYM_FUNC_END(fpsimd_load_state) #ifdef CONFIG_ARM64_SVE + SYM_FUNC_START(sve_save_state) sve_save 0, x1, 2 ret @@ -46,4 +47,11 @@ SYM_FUNC_START(sve_get_vl) _sve_rdvl 0, 1 ret SYM_FUNC_END(sve_get_vl) + +/* Zero all SVE registers but the first 128-bits of each vector */ +SYM_FUNC_START(sve_flush_live) + sve_flush + ret +SYM_FUNC_END(sve_flush_live) + #endif /* CONFIG_ARM64_SVE */ From 9c4b4c701e53183df94f6b9802b63762f0c0e1e3 Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Fri, 28 Aug 2020 19:11:53 +0100 Subject: [PATCH 085/148] arm64/sve: Implement a helper to load SVE registers from FPSIMD state In a follow-up patch, we may save the FPSIMD rather than the full SVE state when the state has to be zeroed on return to userspace (e.g during a syscall). Introduce an helper to load SVE vectors from FPSIMD state and zero the rest of SVE registers. Signed-off-by: Julien Grall Signed-off-by: Mark Brown Reviewed-by: Dave Martin Link: https://lore.kernel.org/r/20200828181155.17745-7-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/fpsimd.h | 2 ++ arch/arm64/kernel/entry-fpsimd.S | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 958f642e930d..bec5f14b622a 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -70,6 +70,8 @@ extern void sve_save_state(void *state, u32 *pfpsr); extern void sve_load_state(void const *state, u32 const *pfpsr, unsigned long vq_minus_1); extern void sve_flush_live(void); +extern void sve_load_from_fpsimd_state(struct user_fpsimd_state const *state, + unsigned long vq_minus_1); extern unsigned int sve_get_vl(void); struct arm64_cpu_capabilities; diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index fdf7a5a35c63..2ca395c25448 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -48,6 +48,23 @@ SYM_FUNC_START(sve_get_vl) ret SYM_FUNC_END(sve_get_vl) +/* + * Load SVE state from FPSIMD state. + * + * x0 = pointer to struct fpsimd_state + * x1 = VQ - 1 + * + * Each SVE vector will be loaded with the first 128-bits taken from FPSIMD + * and the rest zeroed. All the other SVE registers will be zeroed. + */ +SYM_FUNC_START(sve_load_from_fpsimd_state) + sve_load_vq x1, x2, x3 + fpsimd_restore x0, 8 + _for n, 0, 15, _sve_pfalse \n + _sve_wrffr 0 + ret +SYM_FUNC_END(sve_load_from_fpsimd_state) + /* Zero all SVE registers but the first 128-bits of each vector */ SYM_FUNC_START(sve_flush_live) sve_flush From b11483ef5a502663732c6ca1b58d14ff9eedd6f7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 16 Jul 2020 17:11:08 +0100 Subject: [PATCH 086/148] arm64: Make use of ARCH_WORKAROUND_1 even when KVM is not enabled We seem to be pretending that we don't have any firmware mitigation when KVM is not compiled in, which is not quite expected. Bring back the mitigation in this case. 
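For context, the firmware call behind this workaround is issued through the SMCCC 1.1 helpers, and the only part that genuinely depends on KVM is the hypervisor vector text, not the call itself. A rough sketch of invoking the mitigation on either conduit (based on the include/linux/arm-smccc.h helpers; the function name is made up) looks like this:

  #include <linux/arm-smccc.h>

  /* Hypothetical helper: issue ARCH_WORKAROUND_1 on whichever conduit is available. */
  static void example_call_arch_workaround_1(void)
  {
          switch (arm_smccc_1_1_get_conduit()) {
          case SMCCC_CONDUIT_HVC:
                  arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
                  break;
          case SMCCC_CONDUIT_SMC:
                  arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
                  break;
          default:
                  break;  /* no firmware mitigation available */
          }
  }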
Fixes: 4db61fef16a1 ("arm64: kvm: Modernize __smccc_workaround_1_smc_start annotations") Cc: Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index c332d49780dc..88966496806a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -234,14 +234,17 @@ static int detect_harden_bp_fw(void) smccc_end = NULL; break; -#if IS_ENABLED(CONFIG_KVM) case SMCCC_CONDUIT_SMC: cb = call_smc_arch_workaround_1; +#if IS_ENABLED(CONFIG_KVM) smccc_start = __smccc_workaround_1_smc; smccc_end = __smccc_workaround_1_smc + __SMCCC_WORKAROUND_1_SMC_SZ; - break; +#else + smccc_start = NULL; + smccc_end = NULL; #endif + break; default: return -1; From 18fce56134c987e5b4eceddafdbe4b00c07e2ae1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 16 Jul 2020 17:11:09 +0100 Subject: [PATCH 087/148] arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs Commit 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof") changed the way we deal with ARCH_WORKAROUND_1, by moving most of the enabling code to the .matches() callback. This has the unfortunate effect that the workaround gets only enabled on the first affected CPU, and no other. In order to address this, forcefully call the .matches() callback from a .cpu_enable() callback, which brings us back to the original behaviour. Fixes: 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof") Cc: Reviewed-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 88966496806a..3fe64bf5a58d 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -599,6 +599,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) return (need_wa > 0); } +static void +cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap) +{ + cap->matches(cap, SCOPE_LOCAL_CPU); +} + static const __maybe_unused struct midr_range tx2_family_cpus[] = { MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), @@ -890,9 +896,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { }, #endif { + .desc = "Branch predictor hardening", .capability = ARM64_HARDEN_BRANCH_PREDICTOR, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = check_branch_predictor, + .cpu_enable = cpu_enable_branch_predictor_hardening, }, #ifdef CONFIG_RANDOMIZE_BASE { From 9e0f085c2b33ebe13bcec53cbacce505fe78fde7 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 21 Sep 2020 13:23:41 +0100 Subject: [PATCH 088/148] arm64: Move console stack display code to stacktrace.c Currently the code for displaying a stack trace on the console is located in traps.c rather than stacktrace.c, using the unwinding code that is in stacktrace.c. This can be confusing and make the code hard to find since such output is often referred to as a stack trace which might mislead the unwary. Due to this and since traps.c doesn't interact with this code except for via the public interfaces move the code to stacktrace.c to make it easier to find. 
Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20200921122341.11280-1-broonie@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 65 ++++++++++++++++++++++++++++++++++ arch/arm64/kernel/traps.c | 65 ---------------------------------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 804d076b02cb..fa56af1a59c3 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -132,6 +132,71 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, } NOKPROBE_SYMBOL(walk_stackframe); +static void dump_backtrace_entry(unsigned long where, const char *loglvl) +{ + printk("%s %pS\n", loglvl, (void *)where); +} + +void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, + const char *loglvl) +{ + struct stackframe frame; + int skip = 0; + + pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + + if (regs) { + if (user_mode(regs)) + return; + skip = 1; + } + + if (!tsk) + tsk = current; + + if (!try_get_task_stack(tsk)) + return; + + if (tsk == current) { + start_backtrace(&frame, + (unsigned long)__builtin_frame_address(0), + (unsigned long)dump_backtrace); + } else { + /* + * task blocked in __switch_to + */ + start_backtrace(&frame, + thread_saved_fp(tsk), + thread_saved_pc(tsk)); + } + + printk("%sCall trace:\n", loglvl); + do { + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(frame.pc, loglvl); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. 
+ */ + dump_backtrace_entry(regs->pc, loglvl); + } + } while (!unwind_frame(tsk, &frame)); + + put_task_stack(tsk); +} + +void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) +{ + dump_backtrace(NULL, tsk, loglvl); + barrier(); +} + #ifdef CONFIG_STACKTRACE void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 13ebd5ca2070..77dc8e7701ed 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -53,11 +53,6 @@ static const char *handler[]= { int show_unhandled_signals = 0; -static void dump_backtrace_entry(unsigned long where, const char *loglvl) -{ - printk("%s %pS\n", loglvl, (void *)where); -} - static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); @@ -83,66 +78,6 @@ static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) printk("%sCode: %s\n", lvl, str); } -void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, - const char *loglvl) -{ - struct stackframe frame; - int skip = 0; - - pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); - - if (regs) { - if (user_mode(regs)) - return; - skip = 1; - } - - if (!tsk) - tsk = current; - - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) { - start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)dump_backtrace); - } else { - /* - * task blocked in __switch_to - */ - start_backtrace(&frame, - thread_saved_fp(tsk), - thread_saved_pc(tsk)); - } - - printk("%sCall trace:\n", loglvl); - do { - /* skip until specified stack frame */ - if (!skip) { - dump_backtrace_entry(frame.pc, loglvl); - } else if (frame.fp == regs->regs[29]) { - skip = 0; - /* - * Mostly, this is the case where this function is - * called in panic/abort. As exception handler's - * stack frame does not contain the corresponding pc - * at which an exception has taken place, use regs->pc - * instead. - */ - dump_backtrace_entry(regs->pc, loglvl); - } - } while (!unwind_frame(tsk, &frame)); - - put_task_stack(tsk); -} - -void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) -{ - dump_backtrace(NULL, tsk, loglvl); - barrier(); -} - #ifdef CONFIG_PREEMPT #define S_PREEMPT " PREEMPT" #elif defined(CONFIG_PREEMPT_RT) From a194c5f2d2b3a05428805146afcabe5140b5d378 Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Mon, 21 Sep 2020 10:39:36 +0800 Subject: [PATCH 089/148] arm64/mm: return cpu_all_mask when node is NUMA_NO_NODE The @node passed to cpumask_of_node() can be NUMA_NO_NODE, in that case it will trigger the following WARN_ON(node >= nr_node_ids) due to mismatched data types of @node and @nr_node_ids. Actually we should return cpu_all_mask just like most other architectures do if passed NUMA_NO_NODE. Also add a similar check to the inline cpumask_of_node() in numa.h. Signed-off-by: Zhengyuan Liu Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20200921023936.21846-1-liuzhengyuan@tj.kylinos.cn Signed-off-by: Will Deacon --- arch/arm64/include/asm/numa.h | 3 +++ arch/arm64/mm/numa.c | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h index 626ad01e83bf..dd870390d639 100644 --- a/arch/arm64/include/asm/numa.h +++ b/arch/arm64/include/asm/numa.h @@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node); /* Returns a pointer to the cpumask of CPUs on Node 'node'. 
*/ static inline const struct cpumask *cpumask_of_node(int node) { + if (node == NUMA_NO_NODE) + return cpu_all_mask; + return node_to_cpumask_map[node]; } #endif diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 73f8b49d485c..88e51aade0da 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -46,7 +46,11 @@ EXPORT_SYMBOL(node_to_cpumask_map); */ const struct cpumask *cpumask_of_node(int node) { - if (WARN_ON(node >= nr_node_ids)) + + if (node == NUMA_NO_NODE) + return cpu_all_mask; + + if (WARN_ON(node < 0 || node >= nr_node_ids)) return cpu_none_mask; if (WARN_ON(node_to_cpumask_map[node] == NULL)) From f5be3a61fdb5dd11ef60173e2783ccf62685f892 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 22 Sep 2020 13:53:45 +0800 Subject: [PATCH 090/148] arm64: perf: Add support caps under sysfs ARMv8.4-PMU introduces the PMMIR_EL1 registers and some new PMU events, like STALL_SLOT etc, are related to it. Let's add a caps directory to /sys/bus/event_source/devices/armv8_pmuv3_0/ and support slots from PMMIR_EL1 registers in this entry. The user programs can get the slots from sysfs directly. /sys/bus/event_source/devices/armv8_pmuv3_0/caps/slots is exposed under sysfs. Both ARMv8.4-PMU and STALL_SLOT event are implemented, it returns the slots from PMMIR_EL1, otherwise it will return 0. Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/1600754025-53535-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/perf_event.h | 3 + arch/arm64/include/asm/sysreg.h | 2 + arch/arm64/kernel/perf_event.c | 103 +++++++++++++++++++--------- include/linux/perf/arm_pmu.h | 3 + 4 files changed, 78 insertions(+), 33 deletions(-) diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 2c2d7dbe8a02..60731f602d3e 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -236,6 +236,9 @@ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +/* PMMIR_EL1.SLOTS mask */ +#define ARMV8_PMU_SLOTS_MASK 0xff + #ifdef CONFIG_PERF_EVENTS struct pt_regs; extern unsigned long perf_instruction_pointer(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 554a7e8ecb07..921773adff5e 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -321,6 +321,8 @@ #define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1) #define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2) +#define SYS_PMMIR_EL1 sys_reg(3, 0, 9, 14, 6) + #define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0) #define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 04d0ceb72a67..34a7cd3af699 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -305,6 +305,28 @@ static struct attribute_group armv8_pmuv3_format_attr_group = { .attrs = armv8_pmuv3_format_attrs, }; +static ssize_t slots_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu); + u32 slots = cpu_pmu->reg_pmmir & ARMV8_PMU_SLOTS_MASK; + + return snprintf(page, PAGE_SIZE, "0x%08x\n", slots); +} + +static DEVICE_ATTR_RO(slots); + +static struct attribute *armv8_pmuv3_caps_attrs[] = { + &dev_attr_slots.attr, + NULL, +}; + +static struct 
attribute_group armv8_pmuv3_caps_attr_group = { + .name = "caps", + .attrs = armv8_pmuv3_caps_attrs, +}; + /* * Perf Events' indices */ @@ -984,6 +1006,12 @@ static void __armv8pmu_probe_pmu(void *info) bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap, pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + /* store PMMIR_EL1 register for sysfs */ + if (pmuver >= ID_AA64DFR0_PMUVER_8_4 && (pmceid_raw[1] & BIT(31))) + cpu_pmu->reg_pmmir = read_cpuid(PMMIR_EL1); + else + cpu_pmu->reg_pmmir = 0; } static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) @@ -1006,7 +1034,8 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, int (*map_event)(struct perf_event *event), const struct attribute_group *events, - const struct attribute_group *format) + const struct attribute_group *format, + const struct attribute_group *caps) { int ret = armv8pmu_probe_pmu(cpu_pmu); if (ret) @@ -1031,104 +1060,112 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name, events : &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = format ? format : &armv8_pmuv3_format_attr_group; + cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ? + caps : &armv8_pmuv3_caps_attr_group; return 0; } +static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name, + int (*map_event)(struct perf_event *event)) +{ + return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL); +} + static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_pmuv3", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_pmuv3", + armv8_pmuv3_map_event); } static int armv8_a34_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a34", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a34", + armv8_pmuv3_map_event); } static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a35", - armv8_a53_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a35", + armv8_a53_map_event); } static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a53", - armv8_a53_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a53", + armv8_a53_map_event); } static int armv8_a55_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a55", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a55", + armv8_pmuv3_map_event); } static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a57", - armv8_a57_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57", + armv8_a57_map_event); } static int armv8_a65_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a65", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a65", + armv8_pmuv3_map_event); } static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a72", - armv8_a57_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72", + armv8_a57_map_event); } static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a73", - armv8_a73_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a73", + 
armv8_a73_map_event); } static int armv8_a75_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a75", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a75", + armv8_pmuv3_map_event); } static int armv8_a76_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a76", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a76", + armv8_pmuv3_map_event); } static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cortex_a77", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a77", + armv8_pmuv3_map_event); } static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_neoverse_e1", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1", + armv8_pmuv3_map_event); } static int armv8_n1_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_neoverse_n1", - armv8_pmuv3_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_n1", + armv8_pmuv3_map_event); } static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_cavium_thunder", - armv8_thunder_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder", + armv8_thunder_map_event); } static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) { - return armv8_pmu_init(cpu_pmu, "armv8_brcm_vulcan", - armv8_vulcan_map_event, NULL, NULL); + return armv8_pmu_init_nogroups(cpu_pmu, "armv8_brcm_vulcan", + armv8_vulcan_map_event); } static const struct of_device_id armv8_pmu_of_device_ids[] = { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 5b616dde9a4c..505480217cf1 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -73,6 +73,7 @@ enum armpmu_attr_groups { ARMPMU_ATTR_GROUP_COMMON, ARMPMU_ATTR_GROUP_EVENTS, ARMPMU_ATTR_GROUP_FORMATS, + ARMPMU_ATTR_GROUP_CAPS, ARMPMU_NR_ATTR_GROUPS }; @@ -109,6 +110,8 @@ struct arm_pmu { struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; + /* store the PMMIR_EL1 to expose slots */ + u64 reg_pmmir; /* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; From c8fdbbfa981a7bc64acec234620788c97d1f6a88 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 18 Sep 2020 19:24:09 +0100 Subject: [PATCH 091/148] perf: Add Arm CMN-600 DT binding Document the requirements for the CMN-600 DT binding. The internal topology is almost entirely discoverable by walking a tree of ID registers, but sadly both the starting point for that walk and the exact format of those registers are configuration-dependent and not discoverable from some sane fixed location. Oh well. Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- .../devicetree/bindings/perf/arm,cmn.yaml | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/arm,cmn.yaml diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml new file mode 100644 index 000000000000..e4fcc0de25e2 --- /dev/null +++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright 2020 Arm Ltd. 
+%YAML 1.2 +--- +$id: http://devicetree.org/schemas/perf/arm,cmn.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Arm CMN (Coherent Mesh Network) Performance Monitors + +maintainers: + - Robin Murphy + +properties: + compatible: + const: arm,cmn-600 + + reg: + items: + - description: Physical address of the base (PERIPHBASE) and + size (up to 64MB) of the configuration address space. + + interrupts: + minItems: 1 + maxItems: 4 + items: + - description: Overflow interrupt for DTC0 + - description: Overflow interrupt for DTC1 + - description: Overflow interrupt for DTC2 + - description: Overflow interrupt for DTC3 + description: One interrupt for each DTC domain implemented must + be specified, in order. DTC0 is always present. + + arm,root-node: + $ref: /schemas/types.yaml#/definitions/uint32 + description: Offset from PERIPHBASE of the configuration + discovery node (see TRM definition of ROOTNODEBASE). + +required: + - compatible + - reg + - interrupts + - arm,root-node + +additionalProperties: false + +examples: + - | + #include + #include + pmu@50000000 { + compatible = "arm,cmn-600"; + reg = <0x50000000 0x4000000>; + /* 4x2 mesh with one DTC, and CFG node at 0,1,1,0 */ + interrupts = ; + arm,root-node = <0x104000>; + }; +... From 0ba64770a2f2e5a104bf835e133d78d3f82287ad Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 18 Sep 2020 14:28:38 +0100 Subject: [PATCH 092/148] perf: Add Arm CMN-600 PMU driver Initial driver for PMU event counting on the Arm CMN-600 interconnect. CMN sports an obnoxiously complex distributed PMU system as part of its debug and trace features, which can do all manner of things like sampling, cross-triggering and generating CoreSight trace. This driver covers the PMU functionality, plus the relevant aspects of watchpoints for simply counting matching flits. Tested-by: Tsahi Zidenberg Tested-by: Tuan Phan Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- Documentation/admin-guide/perf/arm-cmn.rst | 65 + Documentation/admin-guide/perf/index.rst | 1 + drivers/perf/Kconfig | 7 + drivers/perf/Makefile | 1 + drivers/perf/arm-cmn.c | 1641 ++++++++++++++++++++ 5 files changed, 1715 insertions(+) create mode 100644 Documentation/admin-guide/perf/arm-cmn.rst create mode 100644 drivers/perf/arm-cmn.c diff --git a/Documentation/admin-guide/perf/arm-cmn.rst b/Documentation/admin-guide/perf/arm-cmn.rst new file mode 100644 index 000000000000..0e4809346014 --- /dev/null +++ b/Documentation/admin-guide/perf/arm-cmn.rst @@ -0,0 +1,65 @@ +============================= +Arm Coherent Mesh Network PMU +============================= + +CMN-600 is a configurable mesh interconnect consisting of a rectangular +grid of crosspoints (XPs), with each crosspoint supporting up to two +device ports to which various AMBA CHI agents are attached. + +CMN implements a distributed PMU design as part of its debug and trace +functionality. This consists of a local monitor (DTM) at every XP, which +counts up to 4 event signals from the connected device nodes and/or the +XP itself. Overflow from these local counters is accumulated in up to 8 +global counters implemented by the main controller (DTC), which provides +overall PMU control and interrupts for global counter overflow. + +PMU events +---------- + +The PMU driver registers a single PMU device for the whole interconnect, +see /sys/bus/event_source/devices/arm_cmn. 
Multi-chip systems may link +more than one CMN together via external CCIX links - in this situation, +each mesh counts its own events entirely independently, and additional +PMU devices will be named arm_cmn_{1..n}. + +Most events are specified in a format based directly on the TRM +definitions - "type" selects the respective node type, and "eventid" the +event number. Some events require an additional occupancy ID, which is +specified by "occupid". + +* Since RN-D nodes do not have any distinct events from RN-I nodes, they + are treated as the same type (0xa), and the common event templates are + named "rnid_*". + +* The cycle counter is treated as a synthetic event belonging to the DTC + node ("type" == 0x3, "eventid" is ignored). + +* XP events also encode the port and channel in the "eventid" field, to + match the underlying pmu_event0_id encoding for the pmu_event_sel + register. The event templates are named with prefixes to cover all + permutations. + +By default each event provides an aggregate count over all nodes of the +given type. To target a specific node, "bynodeid" must be set to 1 and +"nodeid" to the appropriate value derived from the CMN configuration +(as defined in the "Node ID Mapping" section of the TRM). + +Watchpoints +----------- + +The PMU can also count watchpoint events to monitor specific flit +traffic. Watchpoints are treated as a synthetic event type, and like PMU +events can be global or targeted with a particular XP's "nodeid" value. +Since the watchpoint direction is otherwise implicit in the underlying +register selection, separate events are provided for flit uploads and +downloads. + +The flit match value and mask are passed in config1 and config2 ("val" +and "mask" respectively). "wp_dev_sel", "wp_chn_sel", "wp_grp" and +"wp_exclusive" are specified per the TRM definitions for dtm_wp_config0. +Where a watchpoint needs to match fields from both match groups on the +REQ or SNP channel, it can be specified as two events - one for each +group - with the same nonzero "combine" value. The count for such a +pair of combined events will be attributed to the primary match. +Watchpoint events with a "combine" value of 0 are considered independent +and will count individually. diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst index 47c99f40cc16..5a8f2529a033 100644 --- a/Documentation/admin-guide/perf/index.rst +++ b/Documentation/admin-guide/perf/index.rst @@ -12,6 +12,7 @@ Performance monitor support qcom_l2_pmu qcom_l3_pmu arm-ccn + arm-cmn xgene-pmu arm_dsu_pmu thunderx2-pmu diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 7305d57d1890..130327ff0b0e 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -41,6 +41,13 @@ config ARM_CCN PMU (perf) driver supporting the ARM CCN (Cache Coherent Network) interconnect. +config ARM_CMN + tristate "Arm CMN-600 PMU support" + depends on ARM64 || (COMPILE_TEST && 64BIT) + help + Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh + Network interconnect. 
+ config ARM_PMU depends on ARM || ARM64 bool "ARM PMU framework" diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 2ebb4de17815..5365fd56f88f 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_CCI_PMU) += arm-cci.o obj-$(CONFIG_ARM_CCN) += arm-ccn.o +obj-$(CONFIG_ARM_CMN) += arm-cmn.o obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c new file mode 100644 index 000000000000..e824b5b83ea2 --- /dev/null +++ b/drivers/perf/arm-cmn.c @@ -0,0 +1,1641 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2016-2020 Arm Limited +// CMN-600 Coherent Mesh Network PMU driver + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Common register stuff */ +#define CMN_NODE_INFO 0x0000 +#define CMN_NI_NODE_TYPE GENMASK_ULL(15, 0) +#define CMN_NI_NODE_ID GENMASK_ULL(31, 16) +#define CMN_NI_LOGICAL_ID GENMASK_ULL(47, 32) + +#define CMN_NODEID_DEVID(reg) ((reg) & 3) +#define CMN_NODEID_PID(reg) (((reg) >> 2) & 1) +#define CMN_NODEID_X(reg, bits) ((reg) >> (3 + (bits))) +#define CMN_NODEID_Y(reg, bits) (((reg) >> 3) & ((1U << (bits)) - 1)) + +#define CMN_CHILD_INFO 0x0080 +#define CMN_CI_CHILD_COUNT GENMASK_ULL(15, 0) +#define CMN_CI_CHILD_PTR_OFFSET GENMASK_ULL(31, 16) + +#define CMN_CHILD_NODE_ADDR GENMASK(27,0) +#define CMN_CHILD_NODE_EXTERNAL BIT(31) + +#define CMN_ADDR_NODE_PTR GENMASK(27, 14) + +#define CMN_NODE_PTR_DEVID(ptr) (((ptr) >> 2) & 3) +#define CMN_NODE_PTR_PID(ptr) ((ptr) & 1) +#define CMN_NODE_PTR_X(ptr, bits) ((ptr) >> (6 + (bits))) +#define CMN_NODE_PTR_Y(ptr, bits) (((ptr) >> 6) & ((1U << (bits)) - 1)) + +#define CMN_MAX_XPS (8 * 8) + +/* The CFG node has one other useful purpose */ +#define CMN_CFGM_PERIPH_ID_2 0x0010 +#define CMN_CFGM_PID2_REVISION GENMASK(7, 4) + +/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */ +#define CMN_PMU_OFFSET 0x2000 + +/* For most nodes, this is all there is */ +#define CMN_PMU_EVENT_SEL 0x000 +#define CMN_PMU_EVENTn_ID_SHIFT(n) ((n) * 8) + +/* DTMs live in the PMU space of XP registers */ +#define CMN_DTM_WPn(n) (0x1A0 + (n) * 0x18) +#define CMN_DTM_WPn_CONFIG(n) (CMN_DTM_WPn(n) + 0x00) +#define CMN_DTM_WPn_CONFIG_WP_COMBINE BIT(6) +#define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE BIT(5) +#define CMN_DTM_WPn_CONFIG_WP_GRP BIT(4) +#define CMN_DTM_WPn_CONFIG_WP_CHN_SEL GENMASK_ULL(3, 1) +#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL BIT(0) +#define CMN_DTM_WPn_VAL(n) (CMN_DTM_WPn(n) + 0x08) +#define CMN_DTM_WPn_MASK(n) (CMN_DTM_WPn(n) + 0x10) + +#define CMN_DTM_PMU_CONFIG 0x210 +#define CMN__PMEVCNT0_INPUT_SEL GENMASK_ULL(37, 32) +#define CMN__PMEVCNT0_INPUT_SEL_WP 0x00 +#define CMN__PMEVCNT0_INPUT_SEL_XP 0x04 +#define CMN__PMEVCNT0_INPUT_SEL_DEV 0x10 +#define CMN__PMEVCNT0_GLOBAL_NUM GENMASK_ULL(18, 16) +#define CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(n) ((n) * 4) +#define CMN__PMEVCNT_PAIRED(n) BIT(4 + (n)) +#define CMN__PMEVCNT23_COMBINED BIT(2) +#define CMN__PMEVCNT01_COMBINED BIT(1) +#define CMN_DTM_PMU_CONFIG_PMU_EN BIT(0) + +#define CMN_DTM_PMEVCNT 0x220 + +#define CMN_DTM_PMEVCNTSR 0x240 + +#define CMN_DTM_NUM_COUNTERS 4 + +/* The DTC node is where the magic happens */ +#define CMN_DT_DTC_CTL 0x0a00 +#define CMN_DT_DTC_CTL_DT_EN BIT(0) + +/* DTC counters are paired in 64-bit registers on a 16-byte stride. 
Yuck */ +#define _CMN_DT_CNT_REG(n) ((((n) / 2) * 4 + (n) % 2) * 4) +#define CMN_DT_PMEVCNT(n) (CMN_PMU_OFFSET + _CMN_DT_CNT_REG(n)) +#define CMN_DT_PMCCNTR (CMN_PMU_OFFSET + 0x40) + +#define CMN_DT_PMEVCNTSR(n) (CMN_PMU_OFFSET + 0x50 + _CMN_DT_CNT_REG(n)) +#define CMN_DT_PMCCNTRSR (CMN_PMU_OFFSET + 0x90) + +#define CMN_DT_PMCR (CMN_PMU_OFFSET + 0x100) +#define CMN_DT_PMCR_PMU_EN BIT(0) +#define CMN_DT_PMCR_CNTR_RST BIT(5) +#define CMN_DT_PMCR_OVFL_INTR_EN BIT(6) + +#define CMN_DT_PMOVSR (CMN_PMU_OFFSET + 0x118) +#define CMN_DT_PMOVSR_CLR (CMN_PMU_OFFSET + 0x120) + +#define CMN_DT_PMSSR (CMN_PMU_OFFSET + 0x128) +#define CMN_DT_PMSSR_SS_STATUS(n) BIT(n) + +#define CMN_DT_PMSRR (CMN_PMU_OFFSET + 0x130) +#define CMN_DT_PMSRR_SS_REQ BIT(0) + +#define CMN_DT_NUM_COUNTERS 8 +#define CMN_MAX_DTCS 4 + +/* + * Even in the worst case a DTC counter can't wrap in fewer than 2^42 cycles, + * so throwing away one bit to make overflow handling easy is no big deal. + */ +#define CMN_COUNTER_INIT 0x80000000 +/* Similarly for the 40-bit cycle counter */ +#define CMN_CC_INIT 0x8000000000ULL + + +/* Event attributes */ +#define CMN_CONFIG_TYPE GENMASK(15, 0) +#define CMN_CONFIG_EVENTID GENMASK(23, 16) +#define CMN_CONFIG_OCCUPID GENMASK(27, 24) +#define CMN_CONFIG_BYNODEID BIT(31) +#define CMN_CONFIG_NODEID GENMASK(47, 32) + +#define CMN_EVENT_TYPE(event) FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config) +#define CMN_EVENT_EVENTID(event) FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config) +#define CMN_EVENT_OCCUPID(event) FIELD_GET(CMN_CONFIG_OCCUPID, (event)->attr.config) +#define CMN_EVENT_BYNODEID(event) FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config) +#define CMN_EVENT_NODEID(event) FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config) + +#define CMN_CONFIG_WP_COMBINE GENMASK(27, 24) +#define CMN_CONFIG_WP_DEV_SEL BIT(48) +#define CMN_CONFIG_WP_CHN_SEL GENMASK(50, 49) +#define CMN_CONFIG_WP_GRP BIT(52) +#define CMN_CONFIG_WP_EXCLUSIVE BIT(53) +#define CMN_CONFIG1_WP_VAL GENMASK(63, 0) +#define CMN_CONFIG2_WP_MASK GENMASK(63, 0) + +#define CMN_EVENT_WP_COMBINE(event) FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config) +#define CMN_EVENT_WP_DEV_SEL(event) FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config) +#define CMN_EVENT_WP_CHN_SEL(event) FIELD_GET(CMN_CONFIG_WP_CHN_SEL, (event)->attr.config) +#define CMN_EVENT_WP_GRP(event) FIELD_GET(CMN_CONFIG_WP_GRP, (event)->attr.config) +#define CMN_EVENT_WP_EXCLUSIVE(event) FIELD_GET(CMN_CONFIG_WP_EXCLUSIVE, (event)->attr.config) +#define CMN_EVENT_WP_VAL(event) FIELD_GET(CMN_CONFIG1_WP_VAL, (event)->attr.config1) +#define CMN_EVENT_WP_MASK(event) FIELD_GET(CMN_CONFIG2_WP_MASK, (event)->attr.config2) + +/* Made-up event IDs for watchpoint direction */ +#define CMN_WP_UP 0 +#define CMN_WP_DOWN 2 + + +/* r0px probably don't exist in silicon, thankfully */ +enum cmn_revision { + CMN600_R1P0, + CMN600_R1P1, + CMN600_R1P2, + CMN600_R1P3, + CMN600_R2P0, + CMN600_R3P0, +}; + +enum cmn_node_type { + CMN_TYPE_INVALID, + CMN_TYPE_DVM, + CMN_TYPE_CFG, + CMN_TYPE_DTC, + CMN_TYPE_HNI, + CMN_TYPE_HNF, + CMN_TYPE_XP, + CMN_TYPE_SBSX, + CMN_TYPE_RNI = 0xa, + CMN_TYPE_RND = 0xd, + CMN_TYPE_RNSAM = 0xf, + CMN_TYPE_CXRA = 0x100, + CMN_TYPE_CXHA = 0x101, + CMN_TYPE_CXLA = 0x102, + /* Not a real node type */ + CMN_TYPE_WP = 0x7770 +}; + +struct arm_cmn_node { + void __iomem *pmu_base; + u16 id, logid; + enum cmn_node_type type; + + union { + /* Device node */ + struct { + int to_xp; + /* DN/HN-F/CXHA */ + unsigned int occupid_val; + unsigned int occupid_count; + }; + /* XP */ + 
struct { + int dtc; + u32 pmu_config_low; + union { + u8 input_sel[4]; + __le32 pmu_config_high; + }; + s8 wp_event[4]; + }; + }; + + union { + u8 event[4]; + __le32 event_sel; + }; +}; + +struct arm_cmn_dtc { + void __iomem *base; + unsigned int irq; + int irq_friend; + bool cc_active; + + struct perf_event *counters[CMN_DT_NUM_COUNTERS]; + struct perf_event *cycles; +}; + +#define CMN_STATE_DISABLED BIT(0) +#define CMN_STATE_TXN BIT(1) + +struct arm_cmn { + struct device *dev; + void __iomem *base; + + enum cmn_revision rev; + u8 mesh_x; + u8 mesh_y; + u16 num_xps; + u16 num_dns; + struct arm_cmn_node *xps; + struct arm_cmn_node *dns; + + struct arm_cmn_dtc *dtc; + unsigned int num_dtcs; + + int cpu; + struct hlist_node cpuhp_node; + + unsigned int state; + struct pmu pmu; +}; + +#define to_cmn(p) container_of(p, struct arm_cmn, pmu) + +static int arm_cmn_hp_state; + +struct arm_cmn_hw_event { + struct arm_cmn_node *dn; + u64 dtm_idx[2]; + unsigned int dtc_idx; + u8 dtcs_used; + u8 num_dns; +}; + +#define for_each_hw_dn(hw, dn, i) \ + for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++) + +static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event) +{ + BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target)); + return (struct arm_cmn_hw_event *)&event->hw; +} + +static void arm_cmn_set_index(u64 x[], unsigned int pos, unsigned int val) +{ + x[pos / 32] |= (u64)val << ((pos % 32) * 2); +} + +static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos) +{ + return (x[pos / 32] >> ((pos % 32) * 2)) & 3; +} + +struct arm_cmn_event_attr { + struct device_attribute attr; + enum cmn_node_type type; + u8 eventid; + u8 occupid; +}; + +struct arm_cmn_format_attr { + struct device_attribute attr; + u64 field; + int config; +}; + +static int arm_cmn_xyidbits(const struct arm_cmn *cmn) +{ + return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2; +} + +static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn, + struct arm_cmn_node *dn) +{ + int bits = arm_cmn_xyidbits(cmn); + int x = CMN_NODEID_X(dn->id, bits); + int y = CMN_NODEID_Y(dn->id, bits); + int xp_idx = cmn->mesh_x * y + x; + + dn->to_xp = (cmn->xps + xp_idx) - dn; +} + +static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn) +{ + return dn->type == CMN_TYPE_XP ? 
dn : dn + dn->to_xp; +} + +static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn, + enum cmn_node_type type) +{ + int i; + + for (i = 0; i < cmn->num_dns; i++) + if (cmn->dns[i].type == type) + return &cmn->dns[i]; + return NULL; +} + +#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid) \ + (&((struct arm_cmn_event_attr[]) {{ \ + .attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL), \ + .type = _type, \ + .eventid = _eventid, \ + .occupid = _occupid, \ + }})[0].attr.attr) + +static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id) +{ + return (type == CMN_TYPE_DVM && id == 0x05) || + (type == CMN_TYPE_HNF && id == 0x0f); +} + +static ssize_t arm_cmn_event_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_cmn_event_attr *eattr; + + eattr = container_of(attr, typeof(*eattr), attr); + + if (eattr->type == CMN_TYPE_DTC) + return snprintf(buf, PAGE_SIZE, "type=0x%x\n", eattr->type); + + if (eattr->type == CMN_TYPE_WP) + return snprintf(buf, PAGE_SIZE, + "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n", + eattr->type, eattr->eventid); + + if (arm_cmn_is_occup_event(eattr->type, eattr->eventid)) + return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x,occupid=0x%x\n", + eattr->type, eattr->eventid, eattr->occupid); + + return snprintf(buf, PAGE_SIZE, "type=0x%x,eventid=0x%x\n", + eattr->type, eattr->eventid); +} + +static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, + struct attribute *attr, + int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); + struct arm_cmn_event_attr *eattr; + enum cmn_node_type type; + + eattr = container_of(attr, typeof(*eattr), attr.attr); + type = eattr->type; + + /* Watchpoints aren't nodes */ + if (type == CMN_TYPE_WP) + type = CMN_TYPE_XP; + + /* Revision-specific differences */ + if (cmn->rev < CMN600_R1P2) { + if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b) + return 0; + } + + if (!arm_cmn_node(cmn, type)) + return 0; + + return attr->mode; +} + +#define _CMN_EVENT_DVM(_name, _event, _occup) \ + CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup) +#define CMN_EVENT_DTC(_name) \ + CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0) +#define _CMN_EVENT_HNF(_name, _event, _occup) \ + CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup) +#define CMN_EVENT_HNI(_name, _event) \ + CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0) +#define __CMN_EVENT_XP(_name, _event) \ + CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0) +#define CMN_EVENT_SBSX(_name, _event) \ + CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0) +#define CMN_EVENT_RNID(_name, _event) \ + CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0) + +#define CMN_EVENT_DVM(_name, _event) \ + _CMN_EVENT_DVM(_name, _event, 0) +#define CMN_EVENT_HNF(_name, _event) \ + _CMN_EVENT_HNF(_name, _event, 0) +#define _CMN_EVENT_XP(_name, _event) \ + __CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)), \ + __CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)), \ + __CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)), \ + __CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)), \ + __CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)), \ + __CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)) + +/* Good thing there are only 3 fundamental XP events... 
*/ +#define CMN_EVENT_XP(_name, _event) \ + _CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)), \ + _CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)), \ + _CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)), \ + _CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)) + + +static struct attribute *arm_cmn_event_attrs[] = { + CMN_EVENT_DTC(cycles), + + /* + * DVM node events conflict with HN-I events in the equivalent PMU + * slot, but our lazy short-cut of using the DTM counter index for + * the PMU index as well happens to avoid that by construction. + */ + CMN_EVENT_DVM(rxreq_dvmop, 0x01), + CMN_EVENT_DVM(rxreq_dvmsync, 0x02), + CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03), + CMN_EVENT_DVM(rxreq_retried, 0x04), + _CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0), + _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1), + _CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2), + + CMN_EVENT_HNF(cache_miss, 0x01), + CMN_EVENT_HNF(slc_sf_cache_access, 0x02), + CMN_EVENT_HNF(cache_fill, 0x03), + CMN_EVENT_HNF(pocq_retry, 0x04), + CMN_EVENT_HNF(pocq_reqs_recvd, 0x05), + CMN_EVENT_HNF(sf_hit, 0x06), + CMN_EVENT_HNF(sf_evictions, 0x07), + CMN_EVENT_HNF(dir_snoops_sent, 0x08), + CMN_EVENT_HNF(brd_snoops_sent, 0x09), + CMN_EVENT_HNF(slc_eviction, 0x0a), + CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b), + CMN_EVENT_HNF(mc_retries, 0x0c), + CMN_EVENT_HNF(mc_reqs, 0x0d), + CMN_EVENT_HNF(qos_hh_retry, 0x0e), + _CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0), + _CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1), + _CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2), + _CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3), + _CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4), + CMN_EVENT_HNF(pocq_addrhaz, 0x10), + CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11), + CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12), + CMN_EVENT_HNF(cmp_adq_full, 0x13), + CMN_EVENT_HNF(txdat_stall, 0x14), + CMN_EVENT_HNF(txrsp_stall, 0x15), + CMN_EVENT_HNF(seq_full, 0x16), + CMN_EVENT_HNF(seq_hit, 0x17), + CMN_EVENT_HNF(snp_sent, 0x18), + CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19), + CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a), + CMN_EVENT_HNF(snp_sent_untrk, 0x1b), + CMN_EVENT_HNF(intv_dirty, 0x1c), + CMN_EVENT_HNF(stash_snp_sent, 0x1d), + CMN_EVENT_HNF(stash_data_pull, 0x1e), + CMN_EVENT_HNF(snp_fwded, 0x1f), + + CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20), + CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21), + CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22), + CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23), + CMN_EVENT_HNI(wdb_occ_cnt_ovfl, 0x24), + CMN_EVENT_HNI(rrt_rd_alloc, 0x25), + CMN_EVENT_HNI(rrt_wr_alloc, 0x26), + CMN_EVENT_HNI(rdt_rd_alloc, 0x27), + CMN_EVENT_HNI(rdt_wr_alloc, 0x28), + CMN_EVENT_HNI(wdb_alloc, 0x29), + CMN_EVENT_HNI(txrsp_retryack, 0x2a), + CMN_EVENT_HNI(arvalid_no_arready, 0x2b), + CMN_EVENT_HNI(arready_no_arvalid, 0x2c), + CMN_EVENT_HNI(awvalid_no_awready, 0x2d), + CMN_EVENT_HNI(awready_no_awvalid, 0x2e), + CMN_EVENT_HNI(wvalid_no_wready, 0x2f), + CMN_EVENT_HNI(txdat_stall, 0x30), + CMN_EVENT_HNI(nonpcie_serialization, 0x31), + CMN_EVENT_HNI(pcie_serialization, 0x32), + + CMN_EVENT_XP(txflit_valid, 0x01), + CMN_EVENT_XP(txflit_stall, 0x02), + CMN_EVENT_XP(partial_dat_flit, 0x03), + /* We treat watchpoints as a special made-up class of XP events */ + CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0), + CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0), + + CMN_EVENT_SBSX(rd_req, 0x01), + CMN_EVENT_SBSX(wr_req, 0x02), + CMN_EVENT_SBSX(cmo_req, 0x03), + CMN_EVENT_SBSX(txrsp_retryack, 0x04), + CMN_EVENT_SBSX(txdat_flitv, 0x05), + CMN_EVENT_SBSX(txrsp_flitv, 0x06), + 
CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11), + CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12), + CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13), + CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14), + CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15), + CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16), + CMN_EVENT_SBSX(arvalid_no_arready, 0x21), + CMN_EVENT_SBSX(awvalid_no_awready, 0x22), + CMN_EVENT_SBSX(wvalid_no_wready, 0x23), + CMN_EVENT_SBSX(txdat_stall, 0x24), + CMN_EVENT_SBSX(txrsp_stall, 0x25), + + CMN_EVENT_RNID(s0_rdata_beats, 0x01), + CMN_EVENT_RNID(s1_rdata_beats, 0x02), + CMN_EVENT_RNID(s2_rdata_beats, 0x03), + CMN_EVENT_RNID(rxdat_flits, 0x04), + CMN_EVENT_RNID(txdat_flits, 0x05), + CMN_EVENT_RNID(txreq_flits_total, 0x06), + CMN_EVENT_RNID(txreq_flits_retried, 0x07), + CMN_EVENT_RNID(rrt_occ_ovfl, 0x08), + CMN_EVENT_RNID(wrt_occ_ovfl, 0x09), + CMN_EVENT_RNID(txreq_flits_replayed, 0x0a), + CMN_EVENT_RNID(wrcancel_sent, 0x0b), + CMN_EVENT_RNID(s0_wdata_beats, 0x0c), + CMN_EVENT_RNID(s1_wdata_beats, 0x0d), + CMN_EVENT_RNID(s2_wdata_beats, 0x0e), + CMN_EVENT_RNID(rrt_alloc, 0x0f), + CMN_EVENT_RNID(wrt_alloc, 0x10), + CMN_EVENT_RNID(rdb_unord, 0x11), + CMN_EVENT_RNID(rdb_replay, 0x12), + CMN_EVENT_RNID(rdb_hybrid, 0x13), + CMN_EVENT_RNID(rdb_ord, 0x14), + + NULL +}; + +static const struct attribute_group arm_cmn_event_attrs_group = { + .name = "events", + .attrs = arm_cmn_event_attrs, + .is_visible = arm_cmn_event_attr_is_visible, +}; + +static ssize_t arm_cmn_format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_cmn_format_attr *fmt = container_of(attr, typeof(*fmt), attr); + int lo = __ffs(fmt->field), hi = __fls(fmt->field); + + if (lo == hi) + return snprintf(buf, PAGE_SIZE, "config:%d\n", lo); + + if (!fmt->config) + return snprintf(buf, PAGE_SIZE, "config:%d-%d\n", lo, hi); + + return snprintf(buf, PAGE_SIZE, "config%d:%d-%d\n", fmt->config, lo, hi); +} + +#define _CMN_FORMAT_ATTR(_name, _cfg, _fld) \ + (&((struct arm_cmn_format_attr[]) {{ \ + .attr = __ATTR(_name, 0444, arm_cmn_format_show, NULL), \ + .config = _cfg, \ + .field = _fld, \ + }})[0].attr.attr) +#define CMN_FORMAT_ATTR(_name, _fld) _CMN_FORMAT_ATTR(_name, 0, _fld) + +static struct attribute *arm_cmn_format_attrs[] = { + CMN_FORMAT_ATTR(type, CMN_CONFIG_TYPE), + CMN_FORMAT_ATTR(eventid, CMN_CONFIG_EVENTID), + CMN_FORMAT_ATTR(occupid, CMN_CONFIG_OCCUPID), + CMN_FORMAT_ATTR(bynodeid, CMN_CONFIG_BYNODEID), + CMN_FORMAT_ATTR(nodeid, CMN_CONFIG_NODEID), + + CMN_FORMAT_ATTR(wp_dev_sel, CMN_CONFIG_WP_DEV_SEL), + CMN_FORMAT_ATTR(wp_chn_sel, CMN_CONFIG_WP_CHN_SEL), + CMN_FORMAT_ATTR(wp_grp, CMN_CONFIG_WP_GRP), + CMN_FORMAT_ATTR(wp_exclusive, CMN_CONFIG_WP_EXCLUSIVE), + CMN_FORMAT_ATTR(wp_combine, CMN_CONFIG_WP_COMBINE), + + _CMN_FORMAT_ATTR(wp_val, 1, CMN_CONFIG1_WP_VAL), + _CMN_FORMAT_ATTR(wp_mask, 2, CMN_CONFIG2_WP_MASK), + + NULL +}; + +static const struct attribute_group arm_cmn_format_attrs_group = { + .name = "format", + .attrs = arm_cmn_format_attrs, +}; + +static ssize_t arm_cmn_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(cmn->cpu)); +} + +static struct device_attribute arm_cmn_cpumask_attr = + __ATTR(cpumask, 0444, arm_cmn_cpumask_show, NULL); + +static struct attribute *arm_cmn_cpumask_attrs[] = { + &arm_cmn_cpumask_attr.attr, + NULL, +}; + +static struct attribute_group arm_cmn_cpumask_attr_group = { + .attrs = arm_cmn_cpumask_attrs, +}; + 
+static const struct attribute_group *arm_cmn_attr_groups[] = { + &arm_cmn_event_attrs_group, + &arm_cmn_format_attrs_group, + &arm_cmn_cpumask_attr_group, + NULL +}; + +static int arm_cmn_wp_idx(struct perf_event *event) +{ + return CMN_EVENT_EVENTID(event) + CMN_EVENT_WP_GRP(event); +} + +static u32 arm_cmn_wp_config(struct perf_event *event) +{ + u32 config; + u32 dev = CMN_EVENT_WP_DEV_SEL(event); + u32 chn = CMN_EVENT_WP_CHN_SEL(event); + u32 grp = CMN_EVENT_WP_GRP(event); + u32 exc = CMN_EVENT_WP_EXCLUSIVE(event); + u32 combine = CMN_EVENT_WP_COMBINE(event); + + config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) | + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) | + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) | + FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc); + if (combine && !grp) + config |= CMN_DTM_WPn_CONFIG_WP_COMBINE; + + return config; +} + +static void arm_cmn_set_state(struct arm_cmn *cmn, u32 state) +{ + if (!cmn->state) + writel_relaxed(0, cmn->dtc[0].base + CMN_DT_PMCR); + cmn->state |= state; +} + +static void arm_cmn_clear_state(struct arm_cmn *cmn, u32 state) +{ + cmn->state &= ~state; + if (!cmn->state) + writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, + cmn->dtc[0].base + CMN_DT_PMCR); +} + +static void arm_cmn_pmu_enable(struct pmu *pmu) +{ + arm_cmn_clear_state(to_cmn(pmu), CMN_STATE_DISABLED); +} + +static void arm_cmn_pmu_disable(struct pmu *pmu) +{ + arm_cmn_set_state(to_cmn(pmu), CMN_STATE_DISABLED); +} + +static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw, + bool snapshot) +{ + struct arm_cmn_node *dn; + unsigned int i, offset; + u64 count = 0; + + offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT; + for_each_hw_dn(hw, dn, i) { + struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + u64 reg = readq_relaxed(xp->pmu_base + offset); + u16 dtm_count = reg >> (dtm_idx * 16); + + count += dtm_count; + } + return count; +} + +static u64 arm_cmn_read_cc(struct arm_cmn_dtc *dtc) +{ + u64 val = readq_relaxed(dtc->base + CMN_DT_PMCCNTR); + + writeq_relaxed(CMN_CC_INIT, dtc->base + CMN_DT_PMCCNTR); + return (val - CMN_CC_INIT) & ((CMN_CC_INIT << 1) - 1); +} + +static u32 arm_cmn_read_counter(struct arm_cmn_dtc *dtc, int idx) +{ + u32 val, pmevcnt = CMN_DT_PMEVCNT(idx); + + val = readl_relaxed(dtc->base + pmevcnt); + writel_relaxed(CMN_COUNTER_INIT, dtc->base + pmevcnt); + return val - CMN_COUNTER_INIT; +} + +static void arm_cmn_init_counter(struct perf_event *event) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx); + u64 count; + + for (i = 0; hw->dtcs_used & (1U << i); i++) { + writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt); + cmn->dtc[i].counters[hw->dtc_idx] = event; + } + + count = arm_cmn_read_dtm(cmn, hw, false); + local64_set(&event->hw.prev_count, count); +} + +static void arm_cmn_event_read(struct perf_event *event) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + u64 delta, new, prev; + unsigned long flags; + unsigned int i; + + if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) { + i = __ffs(hw->dtcs_used); + delta = arm_cmn_read_cc(cmn->dtc + i); + local64_add(delta, &event->count); + return; + } + new = arm_cmn_read_dtm(cmn, hw, false); + prev = local64_xchg(&event->hw.prev_count, new); + + delta = new - prev; + + local_irq_save(flags); + for (i = 0; hw->dtcs_used & (1U << i); i++) { + new = 
arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx); + delta += new << 16; + } + local_irq_restore(flags); + local64_add(delta, &event->count); +} + +static void arm_cmn_event_start(struct perf_event *event, int flags) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; + enum cmn_node_type type = CMN_EVENT_TYPE(event); + int i; + + if (type == CMN_TYPE_DTC) { + i = __ffs(hw->dtcs_used); + writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR); + cmn->dtc[i].cc_active = true; + } else if (type == CMN_TYPE_WP) { + int wp_idx = arm_cmn_wp_idx(event); + u64 val = CMN_EVENT_WP_VAL(event); + u64 mask = CMN_EVENT_WP_MASK(event); + + for_each_hw_dn(hw, dn, i) { + writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); + writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); + } + } else for_each_hw_dn(hw, dn, i) { + int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + + dn->event[dtm_idx] = CMN_EVENT_EVENTID(event); + writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + } +} + +static void arm_cmn_event_stop(struct perf_event *event, int flags) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; + enum cmn_node_type type = CMN_EVENT_TYPE(event); + int i; + + if (type == CMN_TYPE_DTC) { + i = __ffs(hw->dtcs_used); + cmn->dtc[i].cc_active = false; + } else if (type == CMN_TYPE_WP) { + int wp_idx = arm_cmn_wp_idx(event); + + for_each_hw_dn(hw, dn, i) { + writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx)); + writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx)); + } + } else for_each_hw_dn(hw, dn, i) { + int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + + dn->event[dtm_idx] = 0; + writel_relaxed(le32_to_cpu(dn->event_sel), dn->pmu_base + CMN_PMU_EVENT_SEL); + } + + arm_cmn_event_read(event); +} + +struct arm_cmn_val { + u8 dtm_count[CMN_MAX_XPS]; + u8 occupid[CMN_MAX_XPS]; + u8 wp[CMN_MAX_XPS][4]; + int dtc_count; + bool cycles; +}; + +static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event) +{ + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; + enum cmn_node_type type; + int i; + u8 occupid; + + if (is_software_event(event)) + return; + + type = CMN_EVENT_TYPE(event); + if (type == CMN_TYPE_DTC) { + val->cycles = true; + return; + } + + val->dtc_count++; + if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + occupid = CMN_EVENT_OCCUPID(event) + 1; + else + occupid = 0; + + for_each_hw_dn(hw, dn, i) { + int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid; + + val->dtm_count[xp]++; + val->occupid[xp] = occupid; + + if (type != CMN_TYPE_WP) + continue; + + wp_idx = arm_cmn_wp_idx(event); + val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1; + } +} + +static int arm_cmn_validate_group(struct perf_event *event) +{ + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_node *dn; + struct perf_event *sibling, *leader = event->group_leader; + enum cmn_node_type type; + struct arm_cmn_val val; + int i; + u8 occupid; + + if (leader == event) + return 0; + + if (event->pmu != leader->pmu && !is_software_event(leader)) + return -EINVAL; + + memset(&val, 0, sizeof(val)); + + arm_cmn_val_add_event(&val, leader); + for_each_sibling_event(sibling, leader) + arm_cmn_val_add_event(&val, sibling); + + type = CMN_EVENT_TYPE(event); + if (type == CMN_TYPE_DTC) + return val.cycles ? 
-EINVAL : 0; + + if (val.dtc_count == CMN_DT_NUM_COUNTERS) + return -EINVAL; + + if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + occupid = CMN_EVENT_OCCUPID(event) + 1; + else + occupid = 0; + + for_each_hw_dn(hw, dn, i) { + int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid; + + if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS) + return -EINVAL; + + if (occupid && val.occupid[xp] && occupid != val.occupid[xp]) + return -EINVAL; + + if (type != CMN_TYPE_WP) + continue; + + wp_idx = arm_cmn_wp_idx(event); + if (val.wp[xp][wp_idx]) + return -EINVAL; + + wp_cmb = val.wp[xp][wp_idx ^ 1]; + if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1) + return -EINVAL; + } + + return 0; +} + +static int arm_cmn_event_init(struct perf_event *event) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + enum cmn_node_type type; + unsigned int i; + bool bynodeid; + u16 nodeid, eventid; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + event->cpu = cmn->cpu; + if (event->cpu < 0) + return -EINVAL; + + type = CMN_EVENT_TYPE(event); + /* DTC events (i.e. cycles) already have everything they need */ + if (type == CMN_TYPE_DTC) + return 0; + + /* For watchpoints we need the actual XP node here */ + if (type == CMN_TYPE_WP) { + type = CMN_TYPE_XP; + /* ...and we need a "real" direction */ + eventid = CMN_EVENT_EVENTID(event); + if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN) + return -EINVAL; + } + + bynodeid = CMN_EVENT_BYNODEID(event); + nodeid = CMN_EVENT_NODEID(event); + + hw->dn = arm_cmn_node(cmn, type); + for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) { + if (!bynodeid) { + hw->num_dns++; + } else if (cmn->dns[i].id != nodeid) { + hw->dn++; + } else { + hw->num_dns = 1; + break; + } + } + + if (!hw->num_dns) { + int bits = arm_cmn_xyidbits(cmn); + + dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n", + nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits), + CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type); + return -EINVAL; + } + /* + * By assuming events count in all DTC domains, we cunningly avoid + * needing to know anything about how XPs are assigned to domains. 
+ */ + hw->dtcs_used = (1U << cmn->num_dtcs) - 1; + + return arm_cmn_validate_group(event); +} + +static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event, + int i) +{ + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + enum cmn_node_type type = CMN_EVENT_TYPE(event); + + while (i--) { + struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i); + unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i); + + if (type == CMN_TYPE_WP) + hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1; + + if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) + hw->dn[i].occupid_count--; + + xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx); + writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG); + } + memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx)); + + for (i = 0; hw->dtcs_used & (1U << i); i++) + cmn->dtc[i].counters[hw->dtc_idx] = NULL; +} + +static int arm_cmn_event_add(struct perf_event *event, int flags) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + struct arm_cmn_dtc *dtc = &cmn->dtc[0]; + struct arm_cmn_node *dn; + enum cmn_node_type type = CMN_EVENT_TYPE(event); + unsigned int i, dtc_idx, input_sel; + + if (type == CMN_TYPE_DTC) { + i = 0; + while (cmn->dtc[i].cycles) + if (++i == cmn->num_dtcs) + return -ENOSPC; + + cmn->dtc[i].cycles = event; + hw->dtc_idx = CMN_DT_NUM_COUNTERS; + hw->dtcs_used = 1U << i; + + if (flags & PERF_EF_START) + arm_cmn_event_start(event, 0); + return 0; + } + + /* Grab a free global counter first... */ + dtc_idx = 0; + while (dtc->counters[dtc_idx]) + if (++dtc_idx == CMN_DT_NUM_COUNTERS) + return -ENOSPC; + + hw->dtc_idx = dtc_idx; + + /* ...then the local counters to feed it. */ + for_each_hw_dn(hw, dn, i) { + struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn); + unsigned int dtm_idx, shift; + u64 reg; + + dtm_idx = 0; + while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx)) + if (++dtm_idx == CMN_DTM_NUM_COUNTERS) + goto free_dtms; + + if (type == CMN_TYPE_XP) { + input_sel = CMN__PMEVCNT0_INPUT_SEL_XP + dtm_idx; + } else if (type == CMN_TYPE_WP) { + int tmp, wp_idx = arm_cmn_wp_idx(event); + u32 cfg = arm_cmn_wp_config(event); + + if (dn->wp_event[wp_idx] >= 0) + goto free_dtms; + + tmp = dn->wp_event[wp_idx ^ 1]; + if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) != + CMN_EVENT_WP_COMBINE(dtc->counters[tmp])) + goto free_dtms; + + input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx; + dn->wp_event[wp_idx] = dtc_idx; + writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx)); + } else { + unsigned int port = CMN_NODEID_PID(dn->id); + unsigned int dev = CMN_NODEID_DEVID(dn->id); + + input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx + + (port << 4) + (dev << 2); + + if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) { + int occupid = CMN_EVENT_OCCUPID(event); + + if (dn->occupid_count == 0) { + dn->occupid_val = occupid; + writel_relaxed(occupid, + dn->pmu_base + CMN_PMU_EVENT_SEL + 4); + } else if (dn->occupid_val != occupid) { + goto free_dtms; + } + dn->occupid_count++; + } + } + + arm_cmn_set_index(hw->dtm_idx, i, dtm_idx); + + xp->input_sel[dtm_idx] = input_sel; + shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx); + xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift); + xp->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift; + xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx); + reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low; + writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG); + } + + /* Go go go! 
*/ + arm_cmn_init_counter(event); + + if (flags & PERF_EF_START) + arm_cmn_event_start(event, 0); + + return 0; + +free_dtms: + arm_cmn_event_clear(cmn, event, i); + return -ENOSPC; +} + +static void arm_cmn_event_del(struct perf_event *event, int flags) +{ + struct arm_cmn *cmn = to_cmn(event->pmu); + struct arm_cmn_hw_event *hw = to_cmn_hw(event); + enum cmn_node_type type = CMN_EVENT_TYPE(event); + + arm_cmn_event_stop(event, PERF_EF_UPDATE); + + if (type == CMN_TYPE_DTC) + cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL; + else + arm_cmn_event_clear(cmn, event, hw->num_dns); +} + +/* + * We stop the PMU for both add and read, to avoid skew across DTM counters. + * In theory we could use snapshots to read without stopping, but then it + * becomes a lot trickier to deal with overlow and racing against interrupts, + * plus it seems they don't work properly on some hardware anyway :( + */ +static void arm_cmn_start_txn(struct pmu *pmu, unsigned int flags) +{ + arm_cmn_set_state(to_cmn(pmu), CMN_STATE_TXN); +} + +static void arm_cmn_end_txn(struct pmu *pmu) +{ + arm_cmn_clear_state(to_cmn(pmu), CMN_STATE_TXN); +} + +static int arm_cmn_commit_txn(struct pmu *pmu) +{ + arm_cmn_end_txn(pmu); + return 0; +} + +static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct arm_cmn *cmn; + unsigned int target; + + cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node); + if (cpu != cmn->cpu) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + perf_pmu_migrate_context(&cmn->pmu, cpu, target); + cmn->cpu = target; + return 0; +} + +static irqreturn_t arm_cmn_handle_irq(int irq, void *dev_id) +{ + struct arm_cmn_dtc *dtc = dev_id; + irqreturn_t ret = IRQ_NONE; + + for (;;) { + u32 status = readl_relaxed(dtc->base + CMN_DT_PMOVSR); + u64 delta; + int i; + + for (i = 0; i < CMN_DTM_NUM_COUNTERS; i++) { + if (status & (1U << i)) { + ret = IRQ_HANDLED; + if (WARN_ON(!dtc->counters[i])) + continue; + delta = (u64)arm_cmn_read_counter(dtc, i) << 16; + local64_add(delta, &dtc->counters[i]->count); + } + } + + if (status & (1U << CMN_DT_NUM_COUNTERS)) { + ret = IRQ_HANDLED; + if (dtc->cc_active && !WARN_ON(!dtc->cycles)) { + delta = arm_cmn_read_cc(dtc); + local64_add(delta, &dtc->cycles->count); + } + } + + writel_relaxed(status, dtc->base + CMN_DT_PMOVSR_CLR); + + if (!dtc->irq_friend) + return ret; + dtc += dtc->irq_friend; + } +} + +/* We can reasonably accommodate DTCs of the same CMN sharing IRQs */ +static int arm_cmn_init_irqs(struct arm_cmn *cmn) +{ + int i, j, irq, err; + + for (i = 0; i < cmn->num_dtcs; i++) { + irq = cmn->dtc[i].irq; + for (j = i; j--; ) { + if (cmn->dtc[j].irq == irq) { + cmn->dtc[j].irq_friend = j - i; + goto next; + } + } + err = devm_request_irq(cmn->dev, irq, arm_cmn_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + dev_name(cmn->dev), &cmn->dtc[i]); + if (err) + return err; + + err = irq_set_affinity_hint(irq, cpumask_of(cmn->cpu)); + if (err) + return err; + next: + ; /* isn't C great? 
*/ + } + return 0; +} + +static void arm_cmn_init_dtm(struct arm_cmn_node *xp) +{ + int i; + + for (i = 0; i < 4; i++) { + xp->wp_event[i] = -1; + writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i)); + writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i)); + } + xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN; + xp->dtc = -1; +} + +static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx) +{ + struct arm_cmn_dtc *dtc = cmn->dtc + idx; + struct arm_cmn_node *xp; + + dtc->base = dn->pmu_base - CMN_PMU_OFFSET; + dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx); + if (dtc->irq < 0) + return dtc->irq; + + writel_relaxed(0, dtc->base + CMN_DT_PMCR); + writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR); + writel_relaxed(CMN_DT_PMCR_OVFL_INTR_EN, dtc->base + CMN_DT_PMCR); + + /* We do at least know that a DTC's XP must be in that DTC's domain */ + xp = arm_cmn_node_to_xp(dn); + xp->dtc = idx; + + return 0; +} + +static int arm_cmn_node_cmp(const void *a, const void *b) +{ + const struct arm_cmn_node *dna = a, *dnb = b; + int cmp; + + cmp = dna->type - dnb->type; + if (!cmp) + cmp = dna->logid - dnb->logid; + return cmp; +} + +static int arm_cmn_init_dtcs(struct arm_cmn *cmn) +{ + struct arm_cmn_node *dn; + int dtc_idx = 0; + + cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL); + if (!cmn->dtc) + return -ENOMEM; + + sort(cmn->dns, cmn->num_dns, sizeof(cmn->dns[0]), arm_cmn_node_cmp, NULL); + + cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP); + + for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) { + if (dn->type != CMN_TYPE_XP) + arm_cmn_init_node_to_xp(cmn, dn); + else if (cmn->num_dtcs == 1) + dn->dtc = 0; + + if (dn->type == CMN_TYPE_DTC) + arm_cmn_init_dtc(cmn, dn, dtc_idx++); + + /* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */ + if (dn->type == CMN_TYPE_RND) + dn->type = CMN_TYPE_RNI; + } + + writel_relaxed(CMN_DT_DTC_CTL_DT_EN, cmn->dtc[0].base + CMN_DT_DTC_CTL); + + return 0; +} + +static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_cmn_node *node) +{ + int level; + u64 reg = readq_relaxed(cmn->base + offset + CMN_NODE_INFO); + + node->type = FIELD_GET(CMN_NI_NODE_TYPE, reg); + node->id = FIELD_GET(CMN_NI_NODE_ID, reg); + node->logid = FIELD_GET(CMN_NI_LOGICAL_ID, reg); + + node->pmu_base = cmn->base + offset + CMN_PMU_OFFSET; + + if (node->type == CMN_TYPE_CFG) + level = 0; + else if (node->type == CMN_TYPE_XP) + level = 1; + else + level = 2; + + dev_dbg(cmn->dev, "node%*c%#06hx%*ctype:%-#6hx id:%-4hd off:%#x\n", + (level * 2) + 1, ' ', node->id, 5 - (level * 2), ' ', + node->type, node->logid, offset); +} + +static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) +{ + void __iomem *cfg_region; + struct arm_cmn_node cfg, *dn; + u16 child_count, child_poff; + u32 xp_offset[CMN_MAX_XPS]; + u64 reg; + int i, j; + + cfg_region = cmn->base + rgn_offset; + reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2); + cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg); + dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev); + + arm_cmn_init_node_info(cmn, rgn_offset, &cfg); + if (cfg.type != CMN_TYPE_CFG) + return -ENODEV; + + reg = readq_relaxed(cfg_region + CMN_CHILD_INFO); + child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); + child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); + + cmn->num_xps = child_count; + cmn->num_dns = cmn->num_xps; + + /* Pass 1: visit the XPs, enumerate their children */ + for (i = 0; i < cmn->num_xps; i++) { + reg = 
readq_relaxed(cfg_region + child_poff + i * 8); + xp_offset[i] = reg & CMN_CHILD_NODE_ADDR; + + reg = readq_relaxed(cmn->base + xp_offset[i] + CMN_CHILD_INFO); + cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg); + } + + /* Cheeky +1 to help terminate pointer-based iteration */ + cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1, + sizeof(*cmn->dns), GFP_KERNEL); + if (!cmn->dns) + return -ENOMEM; + + /* Pass 2: now we can actually populate the nodes */ + dn = cmn->dns; + for (i = 0; i < cmn->num_xps; i++) { + void __iomem *xp_region = cmn->base + xp_offset[i]; + struct arm_cmn_node *xp = dn++; + + arm_cmn_init_node_info(cmn, xp_offset[i], xp); + arm_cmn_init_dtm(xp); + /* + * Thanks to the order in which XP logical IDs seem to be + * assigned, we can handily infer the mesh X dimension by + * looking out for the XP at (0,1) without needing to know + * the exact node ID format, which we can later derive. + */ + if (xp->id == (1 << 3)) + cmn->mesh_x = xp->logid; + + reg = readq_relaxed(xp_region + CMN_CHILD_INFO); + child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg); + child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg); + + for (j = 0; j < child_count; j++) { + reg = readq_relaxed(xp_region + child_poff + j * 8); + /* + * Don't even try to touch anything external, since in general + * we haven't a clue how to power up arbitrary CHI requesters. + * As of CMN-600r1 these could only be RN-SAMs or CXLAs, + * neither of which have any PMU events anyway. + * (Actually, CXLAs do seem to have grown some events in r1p2, + * but they don't go to regular XP DTMs, and they depend on + * secure configuration which we can't easily deal with) + */ + if (reg & CMN_CHILD_NODE_EXTERNAL) { + dev_dbg(cmn->dev, "ignoring external node %llx\n", reg); + continue; + } + + arm_cmn_init_node_info(cmn, reg & CMN_CHILD_NODE_ADDR, dn); + + switch (dn->type) { + case CMN_TYPE_DTC: + cmn->num_dtcs++; + dn++; + break; + /* These guys have PMU events */ + case CMN_TYPE_DVM: + case CMN_TYPE_HNI: + case CMN_TYPE_HNF: + case CMN_TYPE_SBSX: + case CMN_TYPE_RNI: + case CMN_TYPE_RND: + case CMN_TYPE_CXRA: + case CMN_TYPE_CXHA: + dn++; + break; + /* Nothing to see here */ + case CMN_TYPE_RNSAM: + case CMN_TYPE_CXLA: + break; + /* Something has gone horribly wrong */ + default: + dev_err(cmn->dev, "invalid device node type: 0x%hx\n", dn->type); + return -ENODEV; + } + } + } + + /* Correct for any nodes we skipped */ + cmn->num_dns = dn - cmn->dns; + + /* + * If mesh_x wasn't set during discovery then we never saw + * an XP at (0,1), thus we must have an Nx1 configuration. + */ + if (!cmn->mesh_x) + cmn->mesh_x = cmn->num_xps; + cmn->mesh_y = cmn->num_xps / cmn->mesh_x; + + dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n", + cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn)); + + return 0; +} + +static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn) +{ + struct resource *cfg, *root; + + cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!cfg) + return -EINVAL; + + root = platform_get_resource(pdev, IORESOURCE_MEM, 1); + if (!root) + return -EINVAL; + + if (!resource_contains(cfg, root)) + swap(cfg, root); + /* + * Note that devm_ioremap_resource() is dumb and won't let the platform + * device claim cfg when the ACPI companion device has already claimed + * root within it. But since they *are* already both claimed in the + * appropriate name, we don't really need to do it again here anyway. 
+ */ + cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg)); + if (!cmn->base) + return -ENOMEM; + + return root->start - cfg->start; +} + +static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn) +{ + struct device_node *np = pdev->dev.of_node; + u32 rootnode; + int ret; + + cmn->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cmn->base)) + return PTR_ERR(cmn->base); + + ret = of_property_read_u32(np, "arm,root-node", &rootnode); + if (ret) + return ret; + + return rootnode; +} + +static int arm_cmn_probe(struct platform_device *pdev) +{ + struct arm_cmn *cmn; + const char *name; + static atomic_t id; + int err, rootnode, this_id; + + cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL); + if (!cmn) + return -ENOMEM; + + cmn->dev = &pdev->dev; + platform_set_drvdata(pdev, cmn); + + if (has_acpi_companion(cmn->dev)) + rootnode = arm_cmn_acpi_probe(pdev, cmn); + else + rootnode = arm_cmn_of_probe(pdev, cmn); + if (rootnode < 0) + return rootnode; + + err = arm_cmn_discover(cmn, rootnode); + if (err) + return err; + + err = arm_cmn_init_dtcs(cmn); + if (err) + return err; + + err = arm_cmn_init_irqs(cmn); + if (err) + return err; + + cmn->cpu = raw_smp_processor_id(); + cmn->pmu = (struct pmu) { + .module = THIS_MODULE, + .attr_groups = arm_cmn_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .pmu_enable = arm_cmn_pmu_enable, + .pmu_disable = arm_cmn_pmu_disable, + .event_init = arm_cmn_event_init, + .add = arm_cmn_event_add, + .del = arm_cmn_event_del, + .start = arm_cmn_event_start, + .stop = arm_cmn_event_stop, + .read = arm_cmn_event_read, + .start_txn = arm_cmn_start_txn, + .commit_txn = arm_cmn_commit_txn, + .cancel_txn = arm_cmn_end_txn, + }; + + this_id = atomic_fetch_inc(&id); + if (this_id == 0) { + name = "arm_cmn"; + } else { + name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id); + if (!name) + return -ENOMEM; + } + + err = cpuhp_state_add_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + if (err) + return err; + + err = perf_pmu_register(&cmn->pmu, name, -1); + if (err) + cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + return err; +} + +static int arm_cmn_remove(struct platform_device *pdev) +{ + struct arm_cmn *cmn = platform_get_drvdata(pdev); + int i; + + writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL); + + perf_pmu_unregister(&cmn->pmu); + cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node); + + for (i = 0; i < cmn->num_dtcs; i++) + irq_set_affinity_hint(cmn->dtc[i].irq, NULL); + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id arm_cmn_of_match[] = { + { .compatible = "arm,cmn-600", }, + {} +}; +MODULE_DEVICE_TABLE(of, arm_cmn_of_match); +#endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id arm_cmn_acpi_match[] = { + { "ARMHC600", }, + {} +}; +MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match); +#endif + +static struct platform_driver arm_cmn_driver = { + .driver = { + .name = "arm-cmn", + .of_match_table = of_match_ptr(arm_cmn_of_match), + .acpi_match_table = ACPI_PTR(arm_cmn_acpi_match), + }, + .probe = arm_cmn_probe, + .remove = arm_cmn_remove, +}; + +static int __init arm_cmn_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "perf/arm/cmn:online", NULL, + arm_cmn_pmu_offline_cpu); + if (ret < 0) + return ret; + + arm_cmn_hp_state = ret; + ret = platform_driver_register(&arm_cmn_driver); + if (ret) + cpuhp_remove_multi_state(arm_cmn_hp_state); + return ret; +} + 
+static void __exit arm_cmn_exit(void) +{ + platform_driver_unregister(&arm_cmn_driver); + cpuhp_remove_multi_state(arm_cmn_hp_state); +} + +module_init(arm_cmn_init); +module_exit(arm_cmn_exit); + +MODULE_AUTHOR("Robin Murphy "); +MODULE_DESCRIPTION("Arm CMN-600 PMU driver"); +MODULE_LICENSE("GPL v2"); From 490d7b7c0845eacf5593db333fd2ae7715416e16 Mon Sep 17 00:00:00 2001 From: Alexandru Elisei Date: Thu, 24 Sep 2020 12:07:00 +0100 Subject: [PATCH 093/148] arm64: perf: Add missing ISB in armv8pmu_enable_counter() Writes to the PMXEVTYPER_EL0 register are not self-synchronising. In armv8pmu_enable_event(), the PE can reorder configuring the event type after we have enabled the counter and the interrupt. This can lead to an interrupt being asserted because of the previous event type that we were counting using the same counter, not the one that we've just configured. The same rationale applies to writes to the PMINTENSET_EL1 register. The PE can reorder enabling the interrupt at any point in the future after we have enabled the event. Prevent both situations from happening by adding an ISB just before we enable the event counter. Fixes: 030896885ade ("arm64: Performance counters support") Reported-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-2-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 34a7cd3af699..e43ab350a5a2 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -541,6 +541,11 @@ static u32 armv8pmu_event_cnten_mask(struct perf_event *event) static inline void armv8pmu_enable_counter(u32 mask) { + /* + * Make sure event configuration register writes are visible before we + * enable the counter. + * */ + isb(); write_sysreg(mask, pmcntenset_el0); } From 0fdf1bb75953a67e63e5055a7709c629ab6d7692 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 24 Sep 2020 12:07:01 +0100 Subject: [PATCH 094/148] arm64: perf: Avoid PMXEV* indirection Currently we access the counter registers and their respective type registers indirectly. This requires us to write to PMSELR, issue an ISB, then access the relevant PMXEV* registers. This is unfortunate, because: * Under virtualization, accessing one register requires two traps to the hypervisor, even though we could access the register directly with a single trap. * We have to issue an ISB which we could otherwise avoid the cost of. * When we use NMIs, the NMI handler will have to save/restore the select register in case the code it preempted was attempting to access a counter or its type register. We can avoid these issues by directly accessing the relevant registers. This patch adds helpers to do so. In armv8pmu_enable_event() we still need the ISB to prevent the PE from reordering the write to PMINTENSET_EL1 register. If the interrupt is enabled before we disable the counter and the new event is configured, we might get an interrupt triggered by the previously programmed event overflowing, but which we wrongly attribute to the event that we are enabling. Execute an ISB after we disable the counter. In the process, remove the comment that refers to the ARMv7 PMU. 
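For illustration only, here is a minimal sketch of the two access patterns being contrasted; it is not part of the patch, the helper names are made up, and the patch itself generates the direct accessors with the PMEVN_SWITCH machinery shown below:

	/* Indirect: select the counter, synchronise, then go through PMXEVCNTR_EL0 */
	static u64 read_evcntr_indirect(u32 counter)
	{
		write_sysreg(counter, pmselr_el0);
		isb();
		return read_sysreg(pmxevcntr_el0);
	}

	/* Direct: read PMEVCNTR<n>_EL0 itself; no PMSELR_EL0 write, no ISB */
	static u64 read_evcntr0_direct(void)
	{
		return read_sysreg(pmevcntr0_el0);
	}

Under a hypervisor that traps PMU system registers, the indirect form costs two traps (PMSELR_EL0 plus PMXEVCNTR_EL0) where the direct form costs one.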
[Julien T.: Don't inline read/write functions to avoid big code-size increase, remove unused read_pmevtypern function, fix counter index issue.] [Alexandru E.: Removed comment, removed trailing semicolons in macros, added ISB] Signed-off-by: Mark Rutland Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-3-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 99 +++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index e43ab350a5a2..39596ed4ab86 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -371,6 +371,73 @@ static inline bool armv8pmu_event_is_chained(struct perf_event *event) #define ARMV8_IDX_TO_COUNTER(x) \ (((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK) +/* + * This code is really good + */ + +#define PMEVN_CASE(n, case_macro) \ + case n: case_macro(n); break + +#define PMEVN_SWITCH(x, case_macro) \ + do { \ + switch (x) { \ + PMEVN_CASE(0, case_macro); \ + PMEVN_CASE(1, case_macro); \ + PMEVN_CASE(2, case_macro); \ + PMEVN_CASE(3, case_macro); \ + PMEVN_CASE(4, case_macro); \ + PMEVN_CASE(5, case_macro); \ + PMEVN_CASE(6, case_macro); \ + PMEVN_CASE(7, case_macro); \ + PMEVN_CASE(8, case_macro); \ + PMEVN_CASE(9, case_macro); \ + PMEVN_CASE(10, case_macro); \ + PMEVN_CASE(11, case_macro); \ + PMEVN_CASE(12, case_macro); \ + PMEVN_CASE(13, case_macro); \ + PMEVN_CASE(14, case_macro); \ + PMEVN_CASE(15, case_macro); \ + PMEVN_CASE(16, case_macro); \ + PMEVN_CASE(17, case_macro); \ + PMEVN_CASE(18, case_macro); \ + PMEVN_CASE(19, case_macro); \ + PMEVN_CASE(20, case_macro); \ + PMEVN_CASE(21, case_macro); \ + PMEVN_CASE(22, case_macro); \ + PMEVN_CASE(23, case_macro); \ + PMEVN_CASE(24, case_macro); \ + PMEVN_CASE(25, case_macro); \ + PMEVN_CASE(26, case_macro); \ + PMEVN_CASE(27, case_macro); \ + PMEVN_CASE(28, case_macro); \ + PMEVN_CASE(29, case_macro); \ + PMEVN_CASE(30, case_macro); \ + default: WARN(1, "Invalid PMEV* index\n"); \ + } \ + } while (0) + +#define RETURN_READ_PMEVCNTRN(n) \ + return read_sysreg(pmevcntr##n##_el0) +static unsigned long read_pmevcntrn(int n) +{ + PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN); + return 0; +} + +#define WRITE_PMEVCNTRN(n) \ + write_sysreg(val, pmevcntr##n##_el0) +static void write_pmevcntrn(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVCNTRN); +} + +#define WRITE_PMEVTYPERN(n) \ + write_sysreg(val, pmevtyper##n##_el0) +static void write_pmevtypern(int n, unsigned long val) +{ + PMEVN_SWITCH(n, WRITE_PMEVTYPERN); +} + static inline u32 armv8pmu_pmcr_read(void) { return read_sysreg(pmcr_el0); @@ -393,17 +460,11 @@ static inline int armv8pmu_counter_has_overflowed(u32 pmnc, int idx) return pmnc & BIT(ARMV8_IDX_TO_COUNTER(idx)); } -static inline void armv8pmu_select_counter(int idx) +static inline u32 armv8pmu_read_evcntr(int idx) { u32 counter = ARMV8_IDX_TO_COUNTER(idx); - write_sysreg(counter, pmselr_el0); - isb(); -} -static inline u64 armv8pmu_read_evcntr(int idx) -{ - armv8pmu_select_counter(idx); - return read_sysreg(pmxevcntr_el0); + return read_pmevcntrn(counter); } static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) @@ -471,8 +532,9 @@ static u64 
armv8pmu_read_counter(struct perf_event *event) static inline void armv8pmu_write_evcntr(int idx, u64 value) { - armv8pmu_select_counter(idx); - write_sysreg(value, pmxevcntr_el0); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + + write_pmevcntrn(counter, value); } static inline void armv8pmu_write_hw_counter(struct perf_event *event, @@ -503,9 +565,10 @@ static void armv8pmu_write_counter(struct perf_event *event, u64 value) static inline void armv8pmu_write_evtype(int idx, u32 val) { - armv8pmu_select_counter(idx); + u32 counter = ARMV8_IDX_TO_COUNTER(idx); + val &= ARMV8_PMU_EVTYPE_MASK; - write_sysreg(val, pmxevtyper_el0); + write_pmevtypern(counter, val); } static inline void armv8pmu_write_event_type(struct perf_event *event) @@ -525,7 +588,10 @@ static inline void armv8pmu_write_event_type(struct perf_event *event) armv8pmu_write_evtype(idx - 1, hwc->config_base); armv8pmu_write_evtype(idx, chain_evt); } else { - armv8pmu_write_evtype(idx, hwc->config_base); + if (idx == ARMV8_IDX_CYCLE_COUNTER) + write_sysreg(hwc->config_base, pmccfiltr_el0); + else + armv8pmu_write_evtype(idx, hwc->config_base); } } @@ -564,6 +630,11 @@ static inline void armv8pmu_enable_event_counter(struct perf_event *event) static inline void armv8pmu_disable_counter(u32 mask) { write_sysreg(mask, pmcntenclr_el0); + /* + * Make sure the effects of disabling the counter are visible before we + * start configuring the event. + */ + isb(); } static inline void armv8pmu_disable_event_counter(struct perf_event *event) @@ -636,7 +707,7 @@ static void armv8pmu_enable_event(struct perf_event *event) armv8pmu_disable_event_counter(event); /* - * Set event (if destined for PMNx counters). + * Set event. */ armv8pmu_write_event_type(event); From 2a0e2a02e4b719174547d6f04c27410c6fe456f5 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:02 +0100 Subject: [PATCH 095/148] arm64: perf: Remove PMU locking The PMU is disabled and enabled, and the counters are programmed from contexts where interrupts or preemption is disabled. The functions to toggle the PMU and to program the PMU counters access the registers directly and don't access data modified by the interrupt handler. That, and the fact that they're always called from non-preemptible contexts, means that we don't need to disable interrupts or use a spinlock. [Alexandru E.: Explained why locking is not needed, removed WARN_ONs] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-4-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 39596ed4ab86..077df5aea19a 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -691,15 +691,10 @@ static inline u32 armv8pmu_getreset_flags(void) static void armv8pmu_enable_event(struct perf_event *event) { - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - /* * Enable counter and interrupt, and set the counter to count * the event that we're interested in. 
*/ - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* * Disable counter @@ -720,21 +715,10 @@ static void armv8pmu_enable_event(struct perf_event *event) * Enable counter */ armv8pmu_enable_event_counter(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_disable_event(struct perf_event *event) { - unsigned long flags; - struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - /* - * Disable counter and interrupt - */ - raw_spin_lock_irqsave(&events->pmu_lock, flags); - /* * Disable counter */ @@ -744,30 +728,18 @@ static void armv8pmu_disable_event(struct perf_event *event) * Disable interrupt for this counter */ armv8pmu_disable_event_irq(event); - - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_start(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Enable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static void armv8pmu_stop(struct arm_pmu *cpu_pmu) { - unsigned long flags; - struct pmu_hw_events *events = this_cpu_ptr(cpu_pmu->hw_events); - - raw_spin_lock_irqsave(&events->pmu_lock, flags); /* Disable all counters */ armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); - raw_spin_unlock_irqrestore(&events->pmu_lock, flags); } static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) From 05ab72813340d11205556c0d1bc08e6857a3856c Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:03 +0100 Subject: [PATCH 096/148] arm64: perf: Defer irq_work to IPI_IRQ_WORK When handling events, armv8pmu_handle_irq() calls perf_event_overflow(), and subsequently calls irq_work_run() to handle any work queued by perf_event_overflow(). As perf_event_overflow() raises IPI_IRQ_WORK when queuing the work, this isn't strictly necessary and the work could be handled as part of the IPI_IRQ_WORK handler. In the common case the IPI handler will run immediately after the PMU IRQ handler, and where the PE is heavily loaded with interrupts other handlers may run first, widening the window where some counters are disabled. In practice this window is unlikely to be a significant issue, and removing the call to irq_work_run() would make the PMU IRQ handler NMI safe in addition to making it simpler, so let's do that. [Alexandru E.: Reworded commit message] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Catalin Marinas Link: https://lore.kernel.org/r/20200924110706.254996-5-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 077df5aea19a..3605f77ad4df 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -792,20 +792,16 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) if (!armpmu_event_set_period(event)) continue; + /* + * Perf event overflow will queue the processing of the event as + * an irq_work which will be taken care of in the handling of + * IPI_IRQ_WORK. 
+ */ if (perf_event_overflow(event, &data, regs)) cpu_pmu->disable(event); } armv8pmu_start(cpu_pmu); - /* - * Handle the pending perf events. - * - * Note: this call *must* be run with interrupts disabled. For - * platforms that can have the PMU interrupts raised as an NMI, this - * will not work. - */ - irq_work_run(); - return IRQ_HANDLED; } From 95e92e45a454a10a8114294d0f7aec930fb85891 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:04 +0100 Subject: [PATCH 097/148] KVM: arm64: pmu: Make overflow handler NMI safe kvm_vcpu_kick() is not NMI safe. When the overflow handler is called from NMI context, defer waking the vcpu to an irq_work queue. A vcpu can be freed while it's not running by kvm_destroy_vm(). Prevent running the irq_work for a non-existent vcpu by calling irq_work_sync() on the PMU destroy path. [Alexandru E.: Added irq_work_sync()] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Marc Zyngier Cc: Will Deacon Cc: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Suzuki K Pouloze Cc: kvm@vger.kernel.org Cc: kvmarm@lists.cs.columbia.edu Link: https://lore.kernel.org/r/20200924110706.254996-6-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- arch/arm64/kvm/pmu-emul.c | 26 +++++++++++++++++++++++++- include/kvm/arm_pmu.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f0d0312c0a55..81916e360b1e 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -269,6 +269,7 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) for (i = 0; i < ARMV8_PMU_MAX_COUNTERS; i++) kvm_pmu_release_perf_event(&pmu->pmc[i]); + irq_work_sync(&vcpu->arch.pmu.overflow_work); } u64 kvm_pmu_valid_counter_mask(struct kvm_vcpu *vcpu) @@ -433,6 +434,22 @@ void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) kvm_pmu_update_state(vcpu); } +/** + * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding + * to the event. + * This is why we need a callback to do it once outside of the NMI context. + */ +static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) +{ + struct kvm_vcpu *vcpu; + struct kvm_pmu *pmu; + + pmu = container_of(work, struct kvm_pmu, overflow_work); + vcpu = kvm_pmc_to_vcpu(pmu->pmc); + + kvm_vcpu_kick(vcpu); +} + /** * When the perf event overflows, set the overflow status and inform the vcpu. 
*/ @@ -465,7 +482,11 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, if (kvm_pmu_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); - kvm_vcpu_kick(vcpu); + + if (!in_nmi()) + kvm_vcpu_kick(vcpu); + else + irq_work_queue(&vcpu->arch.pmu.overflow_work); } cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD); @@ -764,6 +785,9 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) return ret; } + init_irq_work(&vcpu->arch.pmu.overflow_work, + kvm_pmu_perf_overflow_notify_vcpu); + vcpu->arch.pmu.created = true; return 0; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 6db030439e29..dbf4f08d42e5 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -27,6 +27,7 @@ struct kvm_pmu { bool ready; bool created; bool irq_level; + struct irq_work overflow_work; }; #define kvm_arm_pmu_v3_ready(v) ((v)->arch.pmu.ready) From f76b130bdb8949eac002b8e0ddb85576ed137838 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:05 +0100 Subject: [PATCH 098/148] arm_pmu: Introduce pmu_irq_ops Currently the PMU interrupt can either be a normal irq or a percpu irq. Supporting NMI will introduce two cases for each existing one. It becomes a mess of 'if's when managing the interrupt. Define sets of callbacks for operations commonly done on the interrupt. The appropriate set of callbacks is selected at interrupt request time and simplifies interrupt enabling/disabling and freeing. Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/20200924110706.254996-7-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 90 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index df352b334ea7..a770726e98d4 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -26,8 +26,46 @@ #include +static int armpmu_count_irq_users(const int irq); + +struct pmu_irq_ops { + void (*enable_pmuirq)(unsigned int irq); + void (*disable_pmuirq)(unsigned int irq); + void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid); +}; + +static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid) +{ + free_irq(irq, per_cpu_ptr(devid, cpu)); +} + +static const struct pmu_irq_ops pmuirq_ops = { + .enable_pmuirq = enable_irq, + .disable_pmuirq = disable_irq_nosync, + .free_pmuirq = armpmu_free_pmuirq +}; + +static void armpmu_enable_percpu_pmuirq(unsigned int irq) +{ + enable_percpu_irq(irq, IRQ_TYPE_NONE); +} + +static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, + void __percpu *devid) +{ + if (armpmu_count_irq_users(irq) == 1) + free_percpu_irq(irq, devid); +} + +static const struct pmu_irq_ops percpu_pmuirq_ops = { + .enable_pmuirq = armpmu_enable_percpu_pmuirq, + .disable_pmuirq = disable_percpu_irq, + .free_pmuirq = armpmu_free_percpu_pmuirq +}; + static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); +static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); static inline u64 arm_pmu_event_max_period(struct perf_event *event) { @@ -544,6 +582,23 @@ static int armpmu_count_irq_users(const int irq) return count; } +static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) +{ + const struct pmu_irq_ops *ops = NULL; + int cpu; + + for_each_possible_cpu(cpu) { + if (per_cpu(cpu_irq, cpu) != irq) + continue; + + ops = 
per_cpu(cpu_irq_ops, cpu); + if (ops) + break; + } + + return ops; +} + void armpmu_free_irq(int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) @@ -551,18 +606,18 @@ void armpmu_free_irq(int irq, int cpu) if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; - if (!irq_is_percpu_devid(irq)) - free_irq(irq, per_cpu_ptr(&cpu_armpmu, cpu)); - else if (armpmu_count_irq_users(irq) == 1) - free_percpu_irq(irq, &cpu_armpmu); + per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); per_cpu(cpu_irq, cpu) = 0; + per_cpu(cpu_irq_ops, cpu) = NULL; } int armpmu_request_irq(int irq, int cpu) { int err = 0; const irq_handler_t handler = armpmu_dispatch_irq; + const struct pmu_irq_ops *irq_ops; + if (!irq) return 0; @@ -584,15 +639,26 @@ int armpmu_request_irq(int irq, int cpu) irq_set_status_flags(irq, IRQ_NOAUTOEN); err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); + + irq_ops = &pmuirq_ops; } else if (armpmu_count_irq_users(irq) == 0) { err = request_percpu_irq(irq, handler, "arm-pmu", &cpu_armpmu); + + irq_ops = &percpu_pmuirq_ops; + } else { + /* Per cpudevid irq was already requested by another CPU */ + irq_ops = armpmu_find_irq_ops(irq); + + if (WARN_ON(!irq_ops)) + err = -EINVAL; } if (err) goto err_out; per_cpu(cpu_irq, cpu) = irq; + per_cpu(cpu_irq_ops, cpu) = irq_ops; return 0; err_out: @@ -625,12 +691,8 @@ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) per_cpu(cpu_armpmu, cpu) = pmu; irq = armpmu_get_cpu_irq(pmu, cpu); - if (irq) { - if (irq_is_percpu_devid(irq)) - enable_percpu_irq(irq, IRQ_TYPE_NONE); - else - enable_irq(irq); - } + if (irq) + per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); return 0; } @@ -644,12 +706,8 @@ static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) return 0; irq = armpmu_get_cpu_irq(pmu, cpu); - if (irq) { - if (irq_is_percpu_devid(irq)) - disable_percpu_irq(irq); - else - disable_irq_nosync(irq); - } + if (irq) + per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); per_cpu(cpu_armpmu, cpu) = NULL; From d8f6267f7ce5dc7b8920910e7e75216f77e06d21 Mon Sep 17 00:00:00 2001 From: Julien Thierry Date: Thu, 24 Sep 2020 12:07:06 +0100 Subject: [PATCH 099/148] arm_pmu: arm64: Use NMIs for PMU Add required PMU interrupt operations for NMIs. Request interrupt lines as NMIs when possible, otherwise fall back to normal interrupts. NMIs are only supported on the arm64 architecture with a GICv3 irqchip. 
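When an NMI is successfully requested, the pr_info() added at the end of this patch reports it at probe time. A hypothetical example of the resulting line, with the PMU name and counter count depending on the CPU, would be:

	enabled with armv8_pmuv3 PMU driver, 7 counters available, using NMIs

and the ", using NMIs" suffix is omitted when the driver had to fall back to normal interrupts.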
[Alexandru E.: Added that NMIs only work on arm64 + GICv3, print message when PMU is using NMIs] Signed-off-by: Julien Thierry Signed-off-by: Alexandru Elisei Tested-by: Sumit Garg (Developerbox) Cc: Julien Thierry Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/20200924110706.254996-8-alexandru.elisei@arm.com Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 69 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 7 deletions(-) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index a770726e98d4..cb2f55f450e4 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -45,6 +45,17 @@ static const struct pmu_irq_ops pmuirq_ops = { .free_pmuirq = armpmu_free_pmuirq }; +static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid) +{ + free_nmi(irq, per_cpu_ptr(devid, cpu)); +} + +static const struct pmu_irq_ops pmunmi_ops = { + .enable_pmuirq = enable_nmi, + .disable_pmuirq = disable_nmi_nosync, + .free_pmuirq = armpmu_free_pmunmi +}; + static void armpmu_enable_percpu_pmuirq(unsigned int irq) { enable_percpu_irq(irq, IRQ_TYPE_NONE); @@ -63,10 +74,37 @@ static const struct pmu_irq_ops percpu_pmuirq_ops = { .free_pmuirq = armpmu_free_percpu_pmuirq }; +static void armpmu_enable_percpu_pmunmi(unsigned int irq) +{ + if (!prepare_percpu_nmi(irq)) + enable_percpu_nmi(irq, IRQ_TYPE_NONE); +} + +static void armpmu_disable_percpu_pmunmi(unsigned int irq) +{ + disable_percpu_nmi(irq); + teardown_percpu_nmi(irq); +} + +static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, + void __percpu *devid) +{ + if (armpmu_count_irq_users(irq) == 1) + free_percpu_nmi(irq, devid); +} + +static const struct pmu_irq_ops percpu_pmunmi_ops = { + .enable_pmuirq = armpmu_enable_percpu_pmunmi, + .disable_pmuirq = armpmu_disable_percpu_pmunmi, + .free_pmuirq = armpmu_free_percpu_pmunmi +}; + static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); +static bool has_nmi; + static inline u64 arm_pmu_event_max_period(struct perf_event *event) { if (event->hw.flags & ARMPMU_EVT_64BIT) @@ -637,15 +675,31 @@ int armpmu_request_irq(int irq, int cpu) IRQF_NO_THREAD; irq_set_status_flags(irq, IRQ_NOAUTOEN); - err = request_irq(irq, handler, irq_flags, "arm-pmu", + + err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); - irq_ops = &pmuirq_ops; + /* If cannot get an NMI, get a normal interrupt */ + if (err) { + err = request_irq(irq, handler, irq_flags, "arm-pmu", + per_cpu_ptr(&cpu_armpmu, cpu)); + irq_ops = &pmuirq_ops; + } else { + has_nmi = true; + irq_ops = &pmunmi_ops; + } } else if (armpmu_count_irq_users(irq) == 0) { - err = request_percpu_irq(irq, handler, "arm-pmu", - &cpu_armpmu); + err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); - irq_ops = &percpu_pmuirq_ops; + /* If cannot get an NMI, get a normal interrupt */ + if (err) { + err = request_percpu_irq(irq, handler, "arm-pmu", + &cpu_armpmu); + irq_ops = &percpu_pmuirq_ops; + } else { + has_nmi= true; + irq_ops = &percpu_pmunmi_ops; + } } else { /* Per cpudevid irq was already requested by another CPU */ irq_ops = armpmu_find_irq_ops(irq); @@ -928,8 +982,9 @@ int armpmu_register(struct arm_pmu *pmu) if (!__oprofile_cpu_pmu) __oprofile_cpu_pmu = pmu; - pr_info("enabled with %s PMU driver, %d counters available\n", - pmu->name, pmu->num_events); + pr_info("enabled with %s PMU driver, %d counters available%s\n", + pmu->name, 
pmu->num_events, + has_nmi ? ", using NMIs" : ""); return 0; From 5735f515843019257913432a8c36eb558be388db Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:11 +1000 Subject: [PATCH 100/148] firmware: arm_sdei: Remove sdei_is_err() sdei_is_err() is only called by sdei_to_linux_errno(). The logic of checking on the error number is common to them. They can be combined finely. This removes sdei_is_err() and its logic is combined to the function sdei_to_linux_errno(). Also, the assignment of @err to zero is also dropped in invoke_sdei_fn() because it's always overridden afterwards. This shouldn't cause functional changes. Signed-off-by: Gavin Shan Reviewed-by: James Morse Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20200922130423.10173-2-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index b4b9ce97f415..2d256b2ed4b4 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -114,26 +114,7 @@ static int sdei_to_linux_errno(unsigned long sdei_err) return -ENOMEM; } - /* Not an error value ... */ - return sdei_err; -} - -/* - * If x0 is any of these values, then the call failed, use sdei_to_linux_errno() - * to translate. - */ -static int sdei_is_err(struct arm_smccc_res *res) -{ - switch (res->a0) { - case SDEI_NOT_SUPPORTED: - case SDEI_INVALID_PARAMETERS: - case SDEI_DENIED: - case SDEI_PENDING: - case SDEI_OUT_OF_RESOURCE: - return true; - } - - return false; + return 0; } static int invoke_sdei_fn(unsigned long function_id, unsigned long arg0, @@ -141,14 +122,13 @@ static int invoke_sdei_fn(unsigned long function_id, unsigned long arg0, unsigned long arg3, unsigned long arg4, u64 *result) { - int err = 0; + int err; struct arm_smccc_res res; if (sdei_firmware_call) { sdei_firmware_call(function_id, arg0, arg1, arg2, arg3, arg4, &res); - if (sdei_is_err(&res)) - err = sdei_to_linux_errno(res.a0); + err = sdei_to_linux_errno(res.a0); } else { /* * !sdei_firmware_call means we failed to probe or called From 119884249fdba34a06df250a4402a6b2f3697a47 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:12 +1000 Subject: [PATCH 101/148] firmware: arm_sdei: Common block for failing path in sdei_event_create() There are multiple calls of kfree(event) in the failing path of sdei_event_create() to free the SDEI event. It means we need to call it again when adding more code in the failing path. It's prone to miss doing that and introduce memory leakage. This introduces common block for failing path in sdei_event_create() to resolve the issue. This shouldn't cause functional changes. 
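The shape of the change is the usual single-exit error path. In generic form, with hypothetical names and purely for illustration:

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		err = -ENOMEM;
		goto fail;
	}

	err = do_setup(obj);	/* any later failure takes the same exit */
	if (err)
		goto fail;

	return obj;

fail:
	kfree(obj);		/* the single place to extend when cleanup grows */
	return ERR_PTR(err);

so that failure cases added later only need a goto rather than their own kfree().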
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Acked-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-3-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 2d256b2ed4b4..a126ab7e3490 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -190,33 +190,31 @@ static struct sdei_event *sdei_event_create(u32 event_num, lockdep_assert_held(&sdei_events_lock); event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return ERR_PTR(-ENOMEM); + if (!event) { + err = -ENOMEM; + goto fail; + } INIT_LIST_HEAD(&event->list); event->event_num = event_num; err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_PRIORITY, &result); - if (err) { - kfree(event); - return ERR_PTR(err); - } + if (err) + goto fail; event->priority = result; err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_TYPE, &result); - if (err) { - kfree(event); - return ERR_PTR(err); - } + if (err) + goto fail; event->type = result; if (event->type == SDEI_EVENT_TYPE_SHARED) { reg = kzalloc(sizeof(*reg), GFP_KERNEL); if (!reg) { - kfree(event); - return ERR_PTR(-ENOMEM); + err = -ENOMEM; + goto fail; } reg->event_num = event_num; @@ -231,8 +229,8 @@ static struct sdei_event *sdei_event_create(u32 event_num, regs = alloc_percpu(struct sdei_registered_event); if (!regs) { - kfree(event); - return ERR_PTR(-ENOMEM); + err = -ENOMEM; + goto fail; } for_each_possible_cpu(cpu) { @@ -252,6 +250,10 @@ static struct sdei_event *sdei_event_create(u32 event_num, spin_unlock(&sdei_list_lock); return event; + +fail: + kfree(event); + return ERR_PTR(err); } static void sdei_event_destroy_llocked(struct sdei_event *event) From 663c0e89c8defd75a9d34ae31dbc315d6cf894bd Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:13 +1000 Subject: [PATCH 102/148] firmware: arm_sdei: Retrieve event number from event instance In sdei_event_create(), the event number is retrieved from the variable @event_num for the shared event. The event number was stored in the event instance. So we can fetch it from the event instance, similar to what we're doing for the private event. Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Acked-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-4-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index a126ab7e3490..0f49fff20cc7 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -217,7 +217,7 @@ static struct sdei_event *sdei_event_create(u32 event_num, goto fail; } - reg->event_num = event_num; + reg->event_num = event->event_num; reg->priority = event->priority; reg->callback = cb; From 10fd7c42b7953b3316806388493bff22abd3f78c Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:14 +1000 Subject: [PATCH 103/148] firmware: arm_sdei: Avoid nested statements in sdei_init() In sdei_init(), the nested statements can be avoided by bailing on error from platform_driver_register() or absent ACPI SDEI table. With it, the code looks a bit more readable. 
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-5-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 0f49fff20cc7..1fa4e577b78e 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -1081,17 +1081,18 @@ static bool __init sdei_present_acpi(void) static int __init sdei_init(void) { - int ret = platform_driver_register(&sdei_driver); + struct platform_device *pdev; + int ret; - if (!ret && sdei_present_acpi()) { - struct platform_device *pdev; + ret = platform_driver_register(&sdei_driver); + if (ret || !sdei_present_acpi()) + return ret; - pdev = platform_device_register_simple(sdei_driver.driver.name, - 0, NULL, 0); - if (IS_ERR(pdev)) - pr_info("Failed to register ACPI:SDEI platform device %ld\n", - PTR_ERR(pdev)); - } + pdev = platform_device_register_simple(sdei_driver.driver.name, + 0, NULL, 0); + if (IS_ERR(pdev)) + pr_info("Failed to register ACPI:SDEI platform device %ld\n", + PTR_ERR(pdev)); return ret; } From 63627cae41e33763ca967bd2bb8cfe5d281bfe64 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:15 +1000 Subject: [PATCH 104/148] firmware: arm_sdei: Unregister driver on error in sdei_init() The SDEI platform device is created from device-tree node or ACPI (SDEI) table. For the later case, the platform device is created explicitly by this module. It'd better to unregister the driver on failure to create the device to keep the symmetry. The driver, owned by this module, isn't needed if the device isn't existing. Besides, the errno (@ret) should be updated accordingly in this case. Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-6-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 1fa4e577b78e..e7e9059c395b 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -1090,9 +1090,12 @@ static int __init sdei_init(void) pdev = platform_device_register_simple(sdei_driver.driver.name, 0, NULL, 0); - if (IS_ERR(pdev)) - pr_info("Failed to register ACPI:SDEI platform device %ld\n", - PTR_ERR(pdev)); + if (IS_ERR(pdev)) { + ret = PTR_ERR(pdev); + platform_driver_unregister(&sdei_driver); + pr_info("Failed to register ACPI:SDEI platform device %d\n", + ret); + } return ret; } From bc110fd322816c9684603a55404b51228c0feca8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:16 +1000 Subject: [PATCH 105/148] firmware: arm_sdei: Remove duplicate check in sdei_get_conduit() The following two checks are duplicate because @acpi_disabled doesn't depend on CONFIG_ACPI. So the duplicate check (IS_ENABLED(CONFIG_ACPI)) can be dropped. More details is provided to keep the commit log complete: * @acpi_disabled is defined in arch/arm64/kernel/acpi.c when CONFIG_ACPI is enabled. * @acpi_disabled in defined in include/acpi.h when CONFIG_ACPI is disabled. 
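In other words, roughly (paraphrasing the headers rather than quoting them exactly):

	#ifdef CONFIG_ACPI
	extern int acpi_disabled;	/* real variable, set during early boot */
	#else
	#define acpi_disabled	1	/* ACPI is always "disabled" here */
	#endif

so !acpi_disabled already evaluates to false on !CONFIG_ACPI kernels and the IS_ENABLED(CONFIG_ACPI) guard adds nothing.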
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Acked-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-7-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index e7e9059c395b..5daa4e20595c 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -958,7 +958,7 @@ static int sdei_get_conduit(struct platform_device *pdev) } pr_warn("invalid \"method\" property: %s\n", method); - } else if (IS_ENABLED(CONFIG_ACPI) && !acpi_disabled) { + } else if (!acpi_disabled) { if (acpi_psci_use_hvc()) { sdei_firmware_call = &sdei_smccc_hvc; return SMCCC_CONDUIT_HVC; From 101119a35ca108022391ad5b84d31e203970185c Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:17 +1000 Subject: [PATCH 106/148] firmware: arm_sdei: Remove redundant error message in sdei_probe() This removes the redundant error message in sdei_probe() because the case can be identified from the errno in next error message. Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Acked-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-8-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 5daa4e20595c..4caeef3e1e0b 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -982,8 +982,6 @@ static int sdei_probe(struct platform_device *pdev) return 0; err = sdei_api_get_version(&ver); - if (err == -EOPNOTSUPP) - pr_err("advertised but not implemented in platform firmware\n"); if (err) { pr_err("Failed to get SDEI version: %d\n", err); sdei_mark_interface_broken(); From 1bbc75518503fe7992c8ec09557b453ec7222dcf Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:18 +1000 Subject: [PATCH 107/148] firmware: arm_sdei: Remove while loop in sdei_event_register() This removes the unnecessary while loop in sdei_event_register() because of the following two reasons. This shouldn't cause any functional changes. * The while loop is executed for once, meaning it's not needed in theory. * With the while loop removed, the nested statements can be avoid to make the code a bit cleaner. 
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20200922130423.10173-9-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 52 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 4caeef3e1e0b..6595bd66aa73 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -590,36 +590,34 @@ int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) WARN_ON(in_nmi()); mutex_lock(&sdei_events_lock); - do { - if (sdei_event_find(event_num)) { - pr_warn("Event %u already registered\n", event_num); - err = -EBUSY; - break; - } + if (sdei_event_find(event_num)) { + pr_warn("Event %u already registered\n", event_num); + err = -EBUSY; + goto unlock; + } - event = sdei_event_create(event_num, cb, arg); - if (IS_ERR(event)) { - err = PTR_ERR(event); - pr_warn("Failed to create event %u: %d\n", event_num, - err); - break; - } + event = sdei_event_create(event_num, cb, arg); + if (IS_ERR(event)) { + err = PTR_ERR(event); + pr_warn("Failed to create event %u: %d\n", event_num, err); + goto unlock; + } - cpus_read_lock(); - err = _sdei_event_register(event); - if (err) { - sdei_event_destroy(event); - pr_warn("Failed to register event %u: %d\n", event_num, - err); - } else { - spin_lock(&sdei_list_lock); - event->reregister = true; - spin_unlock(&sdei_list_lock); - } - cpus_read_unlock(); - } while (0); + cpus_read_lock(); + err = _sdei_event_register(event); + if (err) { + sdei_event_destroy(event); + pr_warn("Failed to register event %u: %d\n", event_num, err); + goto cpu_unlock; + } + + spin_lock(&sdei_list_lock); + event->reregister = true; + spin_unlock(&sdei_list_lock); +cpu_unlock: + cpus_read_unlock(); +unlock: mutex_unlock(&sdei_events_lock); - return err; } From b06146b698e6e835253d01901ecf0ab84646591e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:19 +1000 Subject: [PATCH 108/148] firmware: arm_sdei: Remove while loop in sdei_event_unregister() This removes the unnecessary while loop in sdei_event_unregister() because of the following two reasons. This shouldn't cause any functional changes. * The while loop is executed for once, meaning it's not needed in theory. * With the while loop removed, the nested statements can be avoid to make the code a bit cleaner. 
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20200922130423.10173-10-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 6595bd66aa73..790bff70d169 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -491,24 +491,23 @@ int sdei_event_unregister(u32 event_num) mutex_lock(&sdei_events_lock); event = sdei_event_find(event_num); - do { - if (!event) { - pr_warn("Event %u not registered\n", event_num); - err = -ENOENT; - break; - } + if (!event) { + pr_warn("Event %u not registered\n", event_num); + err = -ENOENT; + goto unlock; + } - spin_lock(&sdei_list_lock); - event->reregister = false; - event->reenable = false; - spin_unlock(&sdei_list_lock); + spin_lock(&sdei_list_lock); + event->reregister = false; + event->reenable = false; + spin_unlock(&sdei_list_lock); - err = _sdei_event_unregister(event); - if (err) - break; + err = _sdei_event_unregister(event); + if (err) + goto unlock; - sdei_event_destroy(event); - } while (0); + sdei_event_destroy(event); +unlock: mutex_unlock(&sdei_events_lock); return err; From a27c04e1de876297b78a6b812049e6fecc6e4d9a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:20 +1000 Subject: [PATCH 109/148] firmware: arm_sdei: Cleanup on cross call function This applies cleanup on the cross call functions, no functional changes are introduced: * Wrap the code block of CROSSCALL_INIT inside "do { } while (0)" as linux kernel usually does. Otherwise, scripts/checkpatch.pl reports warning regarding this. * Use smp_call_func_t for @fn argument in sdei_do_cross_call() as the function is called on target CPU(s). * Remove unnecessary space before @event in sdei_do_cross_call() Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-11-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 790bff70d169..786c1b6d4af3 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -78,11 +78,15 @@ struct sdei_crosscall_args { int first_error; }; -#define CROSSCALL_INIT(arg, event) (arg.event = event, \ - arg.first_error = 0, \ - atomic_set(&arg.errors, 0)) +#define CROSSCALL_INIT(arg, event) \ + do { \ + arg.event = event; \ + arg.first_error = 0; \ + atomic_set(&arg.errors, 0); \ + } while (0) -static inline int sdei_do_cross_call(void *fn, struct sdei_event * event) +static inline int sdei_do_cross_call(smp_call_func_t fn, + struct sdei_event *event) { struct sdei_crosscall_args arg; From f4673625a52c69a12d2a8f71bcf408c685c148ec Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:21 +1000 Subject: [PATCH 110/148] firmware: arm_sdei: Introduce sdei_do_local_call() During the CPU hotplug, the private events are registered, enabled or unregistered on the specific CPU. It repeats the same steps: initializing cross call argument, make function call on local CPU, check the returned error. This introduces sdei_do_local_call() to cover the first steps. The other benefit is to make CROSSCALL_INIT and struct sdei_crosscall_args are only visible to sdei_do_{cross, local}_call(). 
Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-12-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 41 ++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 786c1b6d4af3..cf1a37b40e24 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -85,6 +85,17 @@ struct sdei_crosscall_args { atomic_set(&arg.errors, 0); \ } while (0) +static inline int sdei_do_local_call(smp_call_func_t fn, + struct sdei_event *event) +{ + struct sdei_crosscall_args arg; + + CROSSCALL_INIT(arg, event); + fn(&arg); + + return arg.first_error; +} + static inline int sdei_do_cross_call(smp_call_func_t fn, struct sdei_event *event) { @@ -677,7 +688,7 @@ static int sdei_reregister_shared(void) static int sdei_cpuhp_down(unsigned int cpu) { struct sdei_event *event; - struct sdei_crosscall_args arg; + int err; /* un-register private events */ spin_lock(&sdei_list_lock); @@ -685,12 +696,11 @@ static int sdei_cpuhp_down(unsigned int cpu) if (event->type == SDEI_EVENT_TYPE_SHARED) continue; - CROSSCALL_INIT(arg, event); - /* call the cross-call function locally... */ - _local_event_unregister(&arg); - if (arg.first_error) + err = sdei_do_local_call(_local_event_unregister, event); + if (err) { pr_err("Failed to unregister event %u: %d\n", - event->event_num, arg.first_error); + event->event_num, err); + } } spin_unlock(&sdei_list_lock); @@ -700,7 +710,7 @@ static int sdei_cpuhp_down(unsigned int cpu) static int sdei_cpuhp_up(unsigned int cpu) { struct sdei_event *event; - struct sdei_crosscall_args arg; + int err; /* re-register/enable private events */ spin_lock(&sdei_list_lock); @@ -709,20 +719,19 @@ static int sdei_cpuhp_up(unsigned int cpu) continue; if (event->reregister) { - CROSSCALL_INIT(arg, event); - /* call the cross-call function locally... */ - _local_event_register(&arg); - if (arg.first_error) + err = sdei_do_local_call(_local_event_register, event); + if (err) { pr_err("Failed to re-register event %u: %d\n", - event->event_num, arg.first_error); + event->event_num, err); + } } if (event->reenable) { - CROSSCALL_INIT(arg, event); - _local_event_enable(&arg); - if (arg.first_error) + err = sdei_do_local_call(_local_event_enable, event); + if (err) { pr_err("Failed to re-enable event %u: %d\n", - event->event_num, arg.first_error); + event->event_num, err); + } } } spin_unlock(&sdei_list_lock); From d2fc580d2dcaf92863fbb194d4e70ef1383b92c4 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:22 +1000 Subject: [PATCH 111/148] firmware: arm_sdei: Remove _sdei_event_register() The function _sdei_event_register() is called by sdei_event_register() and sdei_device_thaw() as the following functional call chain shows. _sdei_event_register() covers the shared and private events, but sdei_device_thaw() only covers the shared events. So the logic to cover the private events in _sdei_event_register() isn't needed by sdei_device_thaw(). Similarly, sdei_reregister_event_llocked() covers the shared and private events in the regard of reenablement. The logic to cover the private events isn't needed by sdei_device_thaw() either. sdei_event_register sdei_device_thaw _sdei_event_register sdei_reregister_shared sdei_reregister_event_llocked _sdei_event_register This removes _sdei_event_register() and sdei_reregister_event_llocked(). 
Their logic is moved to sdei_event_register() and sdei_reregister_shared(). This shouldn't cause any logical changes. Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-13-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 77 ++++++++++++++----------------------- 1 file changed, 28 insertions(+), 49 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index cf1a37b40e24..361d142ad2a8 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -577,25 +577,6 @@ static void _local_event_register(void *data) sdei_cross_call_return(arg, err); } -static int _sdei_event_register(struct sdei_event *event) -{ - int err; - - lockdep_assert_held(&sdei_events_lock); - - if (event->type == SDEI_EVENT_TYPE_SHARED) - return sdei_api_event_register(event->event_num, - sdei_entry_point, - event->registered, - SDEI_EVENT_REGISTER_RM_ANY, 0); - - err = sdei_do_cross_call(_local_event_register, event); - if (err) - sdei_do_cross_call(_local_event_unregister, event); - - return err; -} - int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) { int err; @@ -618,7 +599,17 @@ int sdei_event_register(u32 event_num, sdei_event_callback *cb, void *arg) } cpus_read_lock(); - err = _sdei_event_register(event); + if (event->type == SDEI_EVENT_TYPE_SHARED) { + err = sdei_api_event_register(event->event_num, + sdei_entry_point, + event->registered, + SDEI_EVENT_REGISTER_RM_ANY, 0); + } else { + err = sdei_do_cross_call(_local_event_register, event); + if (err) + sdei_do_cross_call(_local_event_unregister, event); + } + if (err) { sdei_event_destroy(event); pr_warn("Failed to register event %u: %d\n", event_num, err); @@ -635,33 +626,6 @@ unlock: return err; } -static int sdei_reregister_event_llocked(struct sdei_event *event) -{ - int err; - - lockdep_assert_held(&sdei_events_lock); - lockdep_assert_held(&sdei_list_lock); - - err = _sdei_event_register(event); - if (err) { - pr_err("Failed to re-register event %u\n", event->event_num); - sdei_event_destroy_llocked(event); - return err; - } - - if (event->reenable) { - if (event->type == SDEI_EVENT_TYPE_SHARED) - err = sdei_api_event_enable(event->event_num); - else - err = sdei_do_cross_call(_local_event_enable, event); - } - - if (err) - pr_err("Failed to re-enable event %u\n", event->event_num); - - return err; -} - static int sdei_reregister_shared(void) { int err = 0; @@ -674,9 +638,24 @@ static int sdei_reregister_shared(void) continue; if (event->reregister) { - err = sdei_reregister_event_llocked(event); - if (err) + err = sdei_api_event_register(event->event_num, + sdei_entry_point, event->registered, + SDEI_EVENT_REGISTER_RM_ANY, 0); + if (err) { + pr_err("Failed to re-register event %u\n", + event->event_num); + sdei_event_destroy_llocked(event); break; + } + } + + if (event->reenable) { + err = sdei_api_event_enable(event->event_num); + if (err) { + pr_err("Failed to re-enable event %u\n", + event->event_num); + break; + } } } spin_unlock(&sdei_list_lock); From 4b2b76cbbc8ff5dabc18123d94a1125143aea567 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 22 Sep 2020 23:04:23 +1000 Subject: [PATCH 112/148] firmware: arm_sdei: Remove _sdei_event_unregister() _sdei_event_unregister() is called by sdei_event_unregister() and sdei_device_freeze(). _sdei_event_unregister() covers the shared and private events, but sdei_device_freeze() only covers the shared events. 
So the logic to cover the private events isn't needed by sdei_device_freeze(). sdei_event_unregister sdei_device_freeze _sdei_event_unregister sdei_unregister_shared _sdei_event_unregister This removes _sdei_event_unregister(). Its logic is moved to its callers accordingly. This shouldn't cause any logical changes. Signed-off-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: James Morse Link: https://lore.kernel.org/r/20200922130423.10173-14-gshan@redhat.com Signed-off-by: Will Deacon --- drivers/firmware/arm_sdei.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 361d142ad2a8..840754dcc6ca 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -487,16 +487,6 @@ static void _local_event_unregister(void *data) sdei_cross_call_return(arg, err); } -static int _sdei_event_unregister(struct sdei_event *event) -{ - lockdep_assert_held(&sdei_events_lock); - - if (event->type == SDEI_EVENT_TYPE_SHARED) - return sdei_api_event_unregister(event->event_num); - - return sdei_do_cross_call(_local_event_unregister, event); -} - int sdei_event_unregister(u32 event_num) { int err; @@ -517,7 +507,11 @@ int sdei_event_unregister(u32 event_num) event->reenable = false; spin_unlock(&sdei_list_lock); - err = _sdei_event_unregister(event); + if (event->type == SDEI_EVENT_TYPE_SHARED) + err = sdei_api_event_unregister(event->event_num); + else + err = sdei_do_cross_call(_local_event_unregister, event); + if (err) goto unlock; @@ -543,7 +537,7 @@ static int sdei_unregister_shared(void) if (event->type != SDEI_EVENT_TYPE_SHARED) continue; - err = _sdei_event_unregister(event); + err = sdei_api_event_unregister(event->event_num); if (err) break; } From 48118151d8cc7a457f61d91df0c053473d4b31b1 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 18 Sep 2020 12:18:44 +0200 Subject: [PATCH 113/148] arm64: mm: Pin down ASIDs for sharing mm with devices To enable address space sharing with the IOMMU, introduce arm64_mm_context_get() and arm64_mm_context_put(), that pin down a context and ensure that it will keep its ASID after a rollover. Export the symbols to let the modular SMMUv3 driver use them. Pinning is necessary because a device constantly needs a valid ASID, unlike tasks that only require one when running. Without pinning, we would need to notify the IOMMU when we're about to use a new ASID for a task, and it would get complicated when a new task is assigned a shared ASID. Consider the following scenario with no ASID pinned: 1. Task t1 is running on CPUx with shared ASID (gen=1, asid=1) 2. Task t2 is scheduled on CPUx, gets ASID (1, 2) 3. Task tn is scheduled on CPUy, a rollover occurs, tn gets ASID (2, 1) We would now have to immediately generate a new ASID for t1, notify the IOMMU, and finally enable task tn. We are holding the lock during all that time, since we can't afford having another CPU trigger a rollover. The IOMMU issues invalidation commands that can take tens of milliseconds. It gets needlessly complicated. All we wanted to do was schedule task tn, that has no business with the IOMMU. By letting the IOMMU pin tasks when needed, we avoid stalling the slow path, and let the pinning fail when we're out of shareable ASIDs. After a rollover, the allocator expects at least one ASID to be available in addition to the reserved ones (one per CPU). So (NR_ASIDS - NR_CPUS - 1) is the maximum number of ASIDs that can be shared with the IOMMU. 
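To illustrate how the new interface is meant to be used, here is a minimal sketch (not part of the patch; the example_* helpers are hypothetical, and only arm64_mm_context_get()/arm64_mm_context_put() come from this change):

        #include <linux/errno.h>
        #include <linux/mm_types.h>
        #include <asm/mmu_context.h>

        /* Hypothetical SVA-style bind: pin the mm's ASID before handing it to a device. */
        static int example_bind_mm(struct mm_struct *mm, unsigned long *asid_out)
        {
                unsigned long asid = arm64_mm_context_get(mm);

                /* A return value of 0 means pinning is unavailable or the pin limit was hit. */
                if (!asid)
                        return -ENOSPC;

                *asid_out = asid;       /* program this ASID into the device's context tables */
                return 0;
        }

        static void example_unbind_mm(struct mm_struct *mm)
        {
                /* Drop the pin; the ASID may then be recycled on a later rollover. */
                arm64_mm_context_put(mm);
        }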
Signed-off-by: Jean-Philippe Brucker Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20200918101852.582559-5-jean-philippe@linaro.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/mmu.h | 3 + arch/arm64/include/asm/mmu_context.h | 11 ++- arch/arm64/mm/context.c | 105 +++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index a7a5ecaa2e83..0fda85b2cc1b 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -17,11 +17,14 @@ #ifndef __ASSEMBLY__ +#include + typedef struct { atomic64_t id; #ifdef CONFIG_COMPAT void *sigpage; #endif + refcount_t pinned; void *vdso; unsigned long flags; } mm_context_t; diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index f2d7537d6f83..0672236e1aea 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -177,7 +177,13 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp) #define destroy_context(mm) do { } while(0) void check_and_switch_context(struct mm_struct *mm); -#define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; }) +static inline int +init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + atomic64_set(&mm->context.id, 0); + refcount_set(&mm->context.pinned, 0); + return 0; +} #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, @@ -248,6 +254,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, void verify_cpu_asid_bits(void); void post_ttbr_update_workaround(void); +unsigned long arm64_mm_context_get(struct mm_struct *mm); +void arm64_mm_context_put(struct mm_struct *mm); + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_MMU_CONTEXT_H */ diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 9b11c096a042..001737a8f309 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -27,6 +27,10 @@ static DEFINE_PER_CPU(atomic64_t, active_asids); static DEFINE_PER_CPU(u64, reserved_asids); static cpumask_t tlb_flush_pending; +static unsigned long max_pinned_asids; +static unsigned long nr_pinned_asids; +static unsigned long *pinned_asid_map; + #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) #define ASID_FIRST_VERSION (1UL << asid_bits) @@ -72,7 +76,7 @@ void verify_cpu_asid_bits(void) } } -static void set_kpti_asid_bits(void) +static void set_kpti_asid_bits(unsigned long *map) { unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(unsigned long); /* @@ -81,13 +85,15 @@ static void set_kpti_asid_bits(void) * is set, then the ASID will map only userspace. Thus * mark even as reserved for kernel. */ - memset(asid_map, 0xaa, len); + memset(map, 0xaa, len); } static void set_reserved_asid_bits(void) { - if (arm64_kernel_unmapped_at_el0()) - set_kpti_asid_bits(); + if (pinned_asid_map) + bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS); + else if (arm64_kernel_unmapped_at_el0()) + set_kpti_asid_bits(asid_map); else bitmap_clear(asid_map, 0, NUM_USER_ASIDS); } @@ -165,6 +171,14 @@ static u64 new_context(struct mm_struct *mm) if (check_update_reserved_asid(asid, newasid)) return newasid; + /* + * If it is pinned, we can keep using it. Note that reserved + * takes priority, because even if it is also pinned, we need to + * update the generation into the reserved_asids. + */ + if (refcount_read(&mm->context.pinned)) + return newasid; + /* * We had a valid ASID in a previous life, so try to re-use * it if possible. 
@@ -256,6 +270,71 @@ switch_mm_fastpath: cpu_switch_mm(mm->pgd, mm); } +unsigned long arm64_mm_context_get(struct mm_struct *mm) +{ + unsigned long flags; + u64 asid; + + if (!pinned_asid_map) + return 0; + + raw_spin_lock_irqsave(&cpu_asid_lock, flags); + + asid = atomic64_read(&mm->context.id); + + if (refcount_inc_not_zero(&mm->context.pinned)) + goto out_unlock; + + if (nr_pinned_asids >= max_pinned_asids) { + asid = 0; + goto out_unlock; + } + + if (!asid_gen_match(asid)) { + /* + * We went through one or more rollover since that ASID was + * used. Ensure that it is still valid, or generate a new one. + */ + asid = new_context(mm); + atomic64_set(&mm->context.id, asid); + } + + nr_pinned_asids++; + __set_bit(asid2idx(asid), pinned_asid_map); + refcount_set(&mm->context.pinned, 1); + +out_unlock: + raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); + + asid &= ~ASID_MASK; + + /* Set the equivalent of USER_ASID_BIT */ + if (asid && arm64_kernel_unmapped_at_el0()) + asid |= 1; + + return asid; +} +EXPORT_SYMBOL_GPL(arm64_mm_context_get); + +void arm64_mm_context_put(struct mm_struct *mm) +{ + unsigned long flags; + u64 asid = atomic64_read(&mm->context.id); + + if (!pinned_asid_map) + return; + + raw_spin_lock_irqsave(&cpu_asid_lock, flags); + + if (refcount_dec_and_test(&mm->context.pinned)) { + __clear_bit(asid2idx(asid), pinned_asid_map); + nr_pinned_asids--; + } + + raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); +} +EXPORT_SYMBOL_GPL(arm64_mm_context_put); + /* Errata workaround post TTBRx_EL1 update. */ asmlinkage void post_ttbr_update_workaround(void) { @@ -296,8 +375,11 @@ static int asids_update_limit(void) { unsigned long num_available_asids = NUM_USER_ASIDS; - if (arm64_kernel_unmapped_at_el0()) + if (arm64_kernel_unmapped_at_el0()) { num_available_asids /= 2; + if (pinned_asid_map) + set_kpti_asid_bits(pinned_asid_map); + } /* * Expect allocation after rollover to fail if we don't have at least * one more ASID than CPUs. ASID #0 is reserved for init_mm. @@ -305,6 +387,13 @@ static int asids_update_limit(void) WARN_ON(num_available_asids - 1 <= num_possible_cpus()); pr_info("ASID allocator initialised with %lu entries\n", num_available_asids); + + /* + * There must always be an ASID available after rollover. Ensure that, + * even if all CPUs have a reserved ASID and the maximum number of ASIDs + * are pinned, there still is at least one empty slot in the ASID map. + */ + max_pinned_asids = num_available_asids - num_possible_cpus() - 2; return 0; } arch_initcall(asids_update_limit); @@ -319,13 +408,17 @@ static int asids_init(void) panic("Failed to allocate bitmap for %lu ASIDs\n", NUM_USER_ASIDS); + pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS), + sizeof(*pinned_asid_map), GFP_KERNEL); + nr_pinned_asids = 0; + /* * We cannot call set_reserved_asid_bits() here because CPU * caps are not finalized yet, so it is safer to assume KPTI * and reserve kernel ASID's from beginning. */ if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) - set_kpti_asid_bits(); + set_kpti_asid_bits(asid_map); return 0; } early_initcall(asids_init); From 6f3c4afae9805bda1682e0ad17b8ce56d85c9a83 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 18 Sep 2020 12:18:46 +0200 Subject: [PATCH 114/148] arm64: cpufeature: Export symbol read_sanitised_ftr_reg() The SMMUv3 driver would like to read the MMFR0 PARANGE field in order to share CPU page tables with devices. Allow the driver to be built as module by exporting the read_sanitized_ftr_reg() cpufeature symbol. 
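As a short sketch of how a modular driver might consume the export (illustrative only; example_get_parange() is a hypothetical helper, and ID_AA64MMFR0_PARANGE_SHIFT and cpuid_feature_extract_unsigned_field() are assumed to be available via <asm/cpufeature.h>):

        #include <asm/cpufeature.h>
        #include <asm/sysreg.h>

        /* Read the system-wide sanitised MMFR0 value and extract the PA range field. */
        static unsigned int example_get_parange(void)
        {
                u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);

                return cpuid_feature_extract_unsigned_field(mmfr0,
                                                            ID_AA64MMFR0_PARANGE_SHIFT);
        }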
Signed-off-by: Jean-Philippe Brucker Reviewed-by: Jonathan Cameron Acked-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20200918101852.582559-7-jean-philippe@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6424584be01e..07f10ad8855c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1111,6 +1111,7 @@ u64 read_sanitised_ftr_reg(u32 id) return 0; return regp->sys_val; } +EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg); #define read_sysreg_case(r) \ case r: return read_sysreg_s(r) From b5756146db3ad57a9c0e841ea01ce915db27b7de Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 28 Sep 2020 23:02:19 +0100 Subject: [PATCH 115/148] arm64: mte: Fix typo in memory tagging ABI documentation We offer both PTRACE_PEEKMTETAGS and PTRACE_POKEMTETAGS requests via ptrace(). Reported-by: Andrey Konovalov Signed-off-by: Will Deacon --- Documentation/arm64/memory-tagging-extension.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arm64/memory-tagging-extension.rst b/Documentation/arm64/memory-tagging-extension.rst index e3709b536b89..034d37c605e8 100644 --- a/Documentation/arm64/memory-tagging-extension.rst +++ b/Documentation/arm64/memory-tagging-extension.rst @@ -142,7 +142,7 @@ the tags from or set the tags to a tracee's address space. The ``ptrace()`` system call is invoked as ``ptrace(request, pid, addr, data)`` where: -- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_PEEKMTETAGS``. +- ``request`` - one of ``PTRACE_PEEKMTETAGS`` or ``PTRACE_POKEMTETAGS``. - ``pid`` - the tracee's PID. - ``addr`` - address in the tracee's address space. - ``data`` - pointer to a ``struct iovec`` where ``iov_base`` points to From 39533e12063be7f55e3d6ae21ffe067799d542a4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 16 Jul 2020 17:11:10 +0100 Subject: [PATCH 116/148] arm64: Run ARCH_WORKAROUND_2 enabling code on all CPUs Commit 606f8e7b27bf ("arm64: capabilities: Use linear array for detection and verification") changed the way we deal with per-CPU errata by only calling the .matches() callback until one CPU is found to be affected. At this point, .matches() stops being called, and .cpu_enable() will be called on all CPUs. This breaks the ARCH_WORKAROUND_2 handling, as only a single CPU will be mitigated. In order to address this, forcefully call the .matches() callback from a .cpu_enable() callback, which brings us back to the original behaviour.
Fixes: 606f8e7b27bf ("arm64: capabilities: Use linear array for detection and verification") Cc: Reviewed-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 3fe64bf5a58d..abfef5f3b5fd 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -457,6 +457,12 @@ out_printmsg: return required; } +static void cpu_enable_ssbd_mitigation(const struct arm64_cpu_capabilities *cap) +{ + if (ssbd_state != ARM64_SSBD_FORCE_DISABLE) + cap->matches(cap, SCOPE_LOCAL_CPU); +} + /* known invulnerable cores */ static const struct midr_range arm64_ssb_cpus[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), @@ -914,6 +920,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_SSBD, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = has_ssbd_mitigation, + .cpu_enable = cpu_enable_ssbd_mitigation, .midr_range_list = arm64_ssb_cpus, }, #ifdef CONFIG_ARM64_ERRATUM_1418040 From 6e5f0927846adf39aebee450f13871e3cb4ab012 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 22:11:13 +0100 Subject: [PATCH 117/148] arm64: Remove Spectre-related CONFIG_* options The spectre mitigations are too configurable for their own good, leading to confusing logic trying to figure out when we should mitigate and when we shouldn't. Although the plethora of command-line options need to stick around for backwards compatibility, the default-on CONFIG options that depend on EXPERT can be dropped, as the mitigations only do anything if the system is vulnerable, a mitigation is available and the command-line hasn't disabled it. Remove CONFIG_HARDEN_BRANCH_PREDICTOR and CONFIG_ARM64_SSBD in favour of enabling this code unconditionally. Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 26 ------------------------- arch/arm64/include/asm/cpufeature.h | 4 ---- arch/arm64/include/asm/kvm_mmu.h | 7 ------- arch/arm64/include/asm/mmu.h | 9 --------- arch/arm64/kernel/Makefile | 3 +-- arch/arm64/kernel/cpu_errata.c | 19 ++---------------- arch/arm64/kernel/cpufeature.c | 4 ---- arch/arm64/kernel/entry.S | 4 ---- arch/arm64/kvm/Kconfig | 2 +- arch/arm64/kvm/hyp/hyp-entry.S | 2 -- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 ---- 11 files changed, 4 insertions(+), 80 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6d232837cbee..51259274a819 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1165,32 +1165,6 @@ config UNMAP_KERNEL_AT_EL0 If unsure, say Y. -config HARDEN_BRANCH_PREDICTOR - bool "Harden the branch predictor against aliasing attacks" if EXPERT - default y - help - Speculation attacks against some high-performance processors rely on - being able to manipulate the branch predictor for a victim context by - executing aliasing branches in the attacker context. Such attacks - can be partially mitigated against by clearing internal branch - predictor state and limiting the prediction logic in some situations. - - This config option will take CPU-specific actions to harden the - branch predictor against aliasing attacks and may rely on specific - instruction sequences or control bits being set by the system - firmware. - - If unsure, say Y. - -config ARM64_SSBD - bool "Speculative Store Bypass Disable" if EXPERT - default y - help - This enables mitigation of the bypassing of previous stores - by speculative loads. - - If unsure, say Y. 
- config RODATA_FULL_DEFAULT_ENABLED bool "Apply r/o permissions of VM areas also to their linear aliases" default y diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 89b4f0142c28..851d144527ed 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -712,12 +712,8 @@ int get_spectre_v2_workaround_state(void); static inline int arm64_get_ssbd_state(void) { -#ifdef CONFIG_ARM64_SSBD extern int ssbd_state; return ssbd_state; -#else - return ARM64_SSBD_UNKNOWN; -#endif } void arm64_set_ssbd_mitigation(bool state); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 189839c3706a..1df85a3ddb9b 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -527,7 +527,6 @@ static inline int kvm_map_vectors(void) } #endif -#ifdef CONFIG_ARM64_SSBD DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); static inline int hyp_map_aux_data(void) @@ -544,12 +543,6 @@ static inline int hyp_map_aux_data(void) } return 0; } -#else -static inline int hyp_map_aux_data(void) -{ - return 0; -} -#endif #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index a7a5ecaa2e83..f5e3efeb5b97 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -45,7 +45,6 @@ struct bp_hardening_data { bp_hardening_cb_t fn; }; -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) @@ -64,14 +63,6 @@ static inline void arm64_apply_bp_hardening(void) if (d->fn) d->fn(); } -#else -static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) -{ - return NULL; -} - -static inline void arm64_apply_bp_hardening(void) { } -#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ extern void arm64_memblock_init(void); extern void paging_init(void); diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index a561cbb91d4d..ed8799bdd41f 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -19,7 +19,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - syscall.o + ssbd.o syscall.o targets += efi-entry.o @@ -59,7 +59,6 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o -obj-$(CONFIG_ARM64_SSBD) += ssbd.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index abfef5f3b5fd..dd9103915f1e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -254,9 +254,7 @@ static int detect_harden_bp_fw(void) ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) cb = qcom_link_stack_sanitization; - if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) - install_bp_hardening_cb(cb, smccc_start, smccc_end); - + install_bp_hardening_cb(cb, smccc_start, smccc_end); return 1; } @@ -335,11 +333,6 @@ void arm64_set_ssbd_mitigation(bool state) { int conduit; - if (!IS_ENABLED(CONFIG_ARM64_SSBD)) { - pr_info_once("SSBD disabled by kernel configuration\n"); - return; - } - if (this_cpu_has_cap(ARM64_SSBS)) { if (state) asm volatile(SET_PSTATE_SSBS(0)); @@ 
-584,12 +577,6 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) __spectrev2_safe = false; - if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) { - pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n"); - __hardenbp_enab = false; - return false; - } - /* forced off */ if (__nospectre_v2 || cpu_mitigations_off()) { pr_info_once("spectrev2 mitigation disabled by command line option\n"); @@ -1004,9 +991,7 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, switch (ssbd_state) { case ARM64_SSBD_KERNEL: case ARM64_SSBD_FORCE_ENABLE: - if (IS_ENABLED(CONFIG_ARM64_SSBD)) - return sprintf(buf, - "Mitigation: Speculative Store Bypass disabled via prctl\n"); + return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); } return sprintf(buf, "Vulnerable\n"); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6424584be01e..4bb45b1d4ae4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1583,7 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) WARN_ON(val & (7 << 27 | 7 << 21)); } -#ifdef CONFIG_ARM64_SSBD static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) { if (user_mode(regs)) @@ -1623,7 +1622,6 @@ static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) arm64_set_ssbd_mitigation(true); } } -#endif /* CONFIG_ARM64_SSBD */ #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) @@ -1976,7 +1974,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64ISAR0_CRC32_SHIFT, .min_field_value = 1, }, -#ifdef CONFIG_ARM64_SSBD { .desc = "Speculative Store Bypassing Safe (SSBS)", .capability = ARM64_SSBS, @@ -1988,7 +1985,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY, .cpu_enable = cpu_enable_ssbs, }, -#endif #ifdef CONFIG_ARM64_CNP { .desc = "Common not Private translations", diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 55af8b504b65..81b709349d7b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -132,7 +132,6 @@ alternative_else_nop_endif * them if required. */ .macro apply_ssbd, state, tmp1, tmp2 -#ifdef CONFIG_ARM64_SSBD alternative_cb arm64_enable_wa2_handling b .L__asm_ssbd_skip\@ alternative_cb_end @@ -146,7 +145,6 @@ alternative_cb arm64_update_smccc_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: -#endif .endm .macro kernel_entry, el, regsize = 64 @@ -697,11 +695,9 @@ el0_irq_naked: bl trace_hardirqs_off #endif -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR tbz x22, #55, 1f bl do_el0_irq_bp_hardening 1: -#endif irq_handler #ifdef CONFIG_TRACE_IRQFLAGS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 318c8f2df245..42e5895763b3 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -58,7 +58,7 @@ config KVM_ARM_PMU virtual machines. 
config KVM_INDIRECT_VECTORS - def_bool HARDEN_BRANCH_PREDICTOR || RANDOMIZE_BASE + def_bool RANDOMIZE_BASE endif # KVM diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 46b4dab933d0..41698bae5d5d 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -116,7 +116,6 @@ el1_hvc_guest: ARM_SMCCC_ARCH_WORKAROUND_2) cbnz w1, el1_trap -#ifdef CONFIG_ARM64_SSBD alternative_cb arm64_enable_wa2_handling b wa2_end alternative_cb_end @@ -143,7 +142,6 @@ alternative_cb_end wa2_end: mov x2, xzr mov x1, xzr -#endif wa_epilogue: mov x0, xzr diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5b6b8fa00f0a..b503f19c37c5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -489,7 +489,6 @@ static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu) static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu) { -#ifdef CONFIG_ARM64_SSBD /* * The host runs with the workaround always present. If the * guest wants it disabled, so be it... @@ -497,19 +496,16 @@ static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu) if (__needs_ssbd_off(vcpu) && __hyp_this_cpu_read(arm64_ssbd_callback_required)) arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL); -#endif } static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu) { -#ifdef CONFIG_ARM64_SSBD /* * If the guest has disabled the workaround, bring it back on. */ if (__needs_ssbd_off(vcpu) && __hyp_this_cpu_read(arm64_ssbd_callback_required)) arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL); -#endif } static inline void __kvm_unexpected_el2_exception(void) From 5359a87d5bdacb0c27f282f40ccbd80ce8f9702a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 22:16:10 +0100 Subject: [PATCH 118/148] KVM: arm64: Replace CONFIG_KVM_INDIRECT_VECTORS with CONFIG_RANDOMIZE_BASE The removal of CONFIG_HARDEN_BRANCH_PREDICTOR means that CONFIG_KVM_INDIRECT_VECTORS is synonymous with CONFIG_RANDOMIZE_BASE, so replace it. 
Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_asm.h | 2 +- arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kernel/cpu_errata.c | 4 ++-- arch/arm64/kvm/Kconfig | 3 --- arch/arm64/kvm/hyp/Makefile | 2 +- arch/arm64/kvm/hyp/hyp-entry.S | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 6f98fbd0ac81..e9378cc8049d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -102,7 +102,7 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -#ifdef CONFIG_KVM_INDIRECT_VECTORS +#ifdef CONFIG_RANDOMIZE_BASE extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 1df85a3ddb9b..dfd176b0642e 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -430,7 +430,7 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } -#ifdef CONFIG_KVM_INDIRECT_VECTORS +#ifdef CONFIG_RANDOMIZE_BASE /* * EL2 vectors can be mapped and rerouted in a number of ways, * depending on the kernel configuration and CPU present: diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index dd9103915f1e..135bf7f92d4a 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -113,7 +113,7 @@ atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); -#ifdef CONFIG_KVM_INDIRECT_VECTORS +#ifdef CONFIG_RANDOMIZE_BASE static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, const char *hyp_vecs_end) { @@ -167,7 +167,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn, { __this_cpu_write(bp_hardening_data.fn, fn); } -#endif /* CONFIG_KVM_INDIRECT_VECTORS */ +#endif /* CONFIG_RANDOMIZE_BASE */ #include diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 42e5895763b3..043756db8f6e 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -57,9 +57,6 @@ config KVM_ARM_PMU Adds support for a virtual Performance Monitoring Unit (PMU) in virtual machines. 
-config KVM_INDIRECT_VECTORS - def_bool RANDOMIZE_BASE - endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index f54f0e89a71c..89d2bf73acb5 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -11,4 +11,4 @@ subdir-ccflags-y := -I$(incdir) \ $(DISABLE_STACKLEAK_PLUGIN) obj-$(CONFIG_KVM) += vhe/ nvhe/ -obj-$(CONFIG_KVM_INDIRECT_VECTORS) += smccc_wa.o +obj-$(CONFIG_RANDOMIZE_BASE) += smccc_wa.o diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 41698bae5d5d..db2dd7500617 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -286,7 +286,7 @@ SYM_CODE_START(__kvm_hyp_vector) valid_vect el1_error // Error 32-bit EL1 SYM_CODE_END(__kvm_hyp_vector) -#ifdef CONFIG_KVM_INDIRECT_VECTORS +#ifdef CONFIG_RANDOMIZE_BASE .macro hyp_ventry .align 7 1: esb From b181048f414668933c524fe077b9aa6e8e9a42b4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 22:42:07 +0100 Subject: [PATCH 119/148] KVM: arm64: Simplify install_bp_hardening_cb() Use is_hyp_mode_available() to detect whether or not we need to patch the KVM vectors for branch hardening, which avoids the need to take the vector pointers as parameters. Signed-off-by: Will Deacon --- arch/arm64/kernel/cpu_errata.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 135bf7f92d4a..a72ca57f5630 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -126,18 +126,19 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); } -static void install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) +static void install_bp_hardening_cb(bp_hardening_cb_t fn) { static DEFINE_RAW_SPINLOCK(bp_lock); int cpu, slot = -1; + const char *hyp_vecs_start = __smccc_workaround_1_smc; + const char *hyp_vecs_end = __smccc_workaround_1_smc + + __SMCCC_WORKAROUND_1_SMC_SZ; /* * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if * we're a guest. Skip the hyp-vectors work. 
*/ - if (!hyp_vecs_start) { + if (!is_hyp_mode_available()) { __this_cpu_write(bp_hardening_data.fn, fn); return; } @@ -161,9 +162,7 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn, raw_spin_unlock(&bp_lock); } #else -static void install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) +static void install_bp_hardening_cb(bp_hardening_cb_t fn) { __this_cpu_write(bp_hardening_data.fn, fn); } @@ -209,7 +208,6 @@ early_param("nospectre_v2", parse_nospectre_v2); static int detect_harden_bp_fw(void) { bp_hardening_cb_t cb; - void *smccc_start, *smccc_end; struct arm_smccc_res res; u32 midr = read_cpuid_id(); @@ -229,21 +227,10 @@ static int detect_harden_bp_fw(void) switch (arm_smccc_1_1_get_conduit()) { case SMCCC_CONDUIT_HVC: cb = call_hvc_arch_workaround_1; - /* This is a guest, no need to patch KVM vectors */ - smccc_start = NULL; - smccc_end = NULL; break; case SMCCC_CONDUIT_SMC: cb = call_smc_arch_workaround_1; -#if IS_ENABLED(CONFIG_KVM) - smccc_start = __smccc_workaround_1_smc; - smccc_end = __smccc_workaround_1_smc + - __SMCCC_WORKAROUND_1_SMC_SZ; -#else - smccc_start = NULL; - smccc_end = NULL; -#endif break; default: @@ -254,7 +241,7 @@ static int detect_harden_bp_fw(void) ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) cb = qcom_link_stack_sanitization; - install_bp_hardening_cb(cb, smccc_start, smccc_end); + install_bp_hardening_cb(cb); return 1; } From 688f1e4b6d8f792c44bec3a448664fd9e8949964 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 23:00:31 +0100 Subject: [PATCH 120/148] arm64: Rename ARM64_HARDEN_BRANCH_PREDICTOR to ARM64_SPECTRE_V2 For better or worse, the world knows about "Spectre" and not about "Branch predictor hardening". Rename ARM64_HARDEN_BRANCH_PREDICTOR to ARM64_SPECTRE_V2 as part of moving all of the Spectre mitigations into their own little corner. Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpucaps.h | 2 +- arch/arm64/include/asm/kvm_mmu.h | 27 +++++++++++++-------------- arch/arm64/include/asm/mmu.h | 2 +- arch/arm64/kernel/cpu_errata.c | 2 +- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 07b643a70710..348bfcf6c818 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -31,7 +31,7 @@ #define ARM64_HAS_DCPOP 21 #define ARM64_SVE 22 #define ARM64_UNMAP_KERNEL_AT_EL0 23 -#define ARM64_HARDEN_BRANCH_PREDICTOR 24 +#define ARM64_SPECTRE_V2 24 #define ARM64_HAS_RAS_EXTN 25 #define ARM64_WORKAROUND_843419 26 #define ARM64_HAS_CACHE_IDC 27 diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index dfd176b0642e..873e12430ac7 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -435,14 +435,13 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, * EL2 vectors can be mapped and rerouted in a number of ways, * depending on the kernel configuration and CPU present: * - * - If the CPU has the ARM64_HARDEN_BRANCH_PREDICTOR cap, the - * hardening sequence is placed in one of the vector slots, which is - * executed before jumping to the real vectors. + * - If the CPU is affected by Spectre-v2, the hardening sequence is + * placed in one of the vector slots, which is executed before jumping + * to the real vectors. 
* - * - If the CPU has both the ARM64_HARDEN_EL2_VECTORS cap and the - * ARM64_HARDEN_BRANCH_PREDICTOR cap, the slot containing the - * hardening sequence is mapped next to the idmap page, and executed - * before jumping to the real vectors. + * - If the CPU also has the ARM64_HARDEN_EL2_VECTORS cap, the slot + * containing the hardening sequence is mapped next to the idmap page, + * and executed before jumping to the real vectors. * * - If the CPU only has the ARM64_HARDEN_EL2_VECTORS cap, then an * empty slot is selected, mapped next to the idmap page, and @@ -464,7 +463,7 @@ static inline void *kvm_get_hyp_vector(void) void *vect = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); int slot = -1; - if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR) && data->fn) { + if (cpus_have_const_cap(ARM64_SPECTRE_V2) && data->fn) { vect = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); slot = data->hyp_vectors_slot; } @@ -485,15 +484,15 @@ static inline void *kvm_get_hyp_vector(void) static inline int kvm_map_vectors(void) { /* - * HBP = ARM64_HARDEN_BRANCH_PREDICTOR + * SV2 = ARM64_SPECTRE_V2 * HEL2 = ARM64_HARDEN_EL2_VECTORS * - * !HBP + !HEL2 -> use direct vectors - * HBP + !HEL2 -> use hardened vectors in place - * !HBP + HEL2 -> allocate one vector slot and use exec mapping - * HBP + HEL2 -> use hardened vertors and use exec mapping + * !SV2 + !HEL2 -> use direct vectors + * SV2 + !HEL2 -> use hardened vectors in place + * !SV2 + HEL2 -> allocate one vector slot and use exec mapping + * SV2 + HEL2 -> use hardened vertors and use exec mapping */ - if (cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) { + if (cpus_have_const_cap(ARM64_SPECTRE_V2)) { __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs); __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); } diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index f5e3efeb5b97..cbff2d42c1d8 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -56,7 +56,7 @@ static inline void arm64_apply_bp_hardening(void) { struct bp_hardening_data *d; - if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) + if (!cpus_have_const_cap(ARM64_SPECTRE_V2)) return; d = arm64_get_bp_hardening_data(); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a72ca57f5630..b275f2d5e7a3 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -877,7 +877,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif { .desc = "Branch predictor hardening", - .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + .capability = ARM64_SPECTRE_V2, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = check_branch_predictor, .cpu_enable = cpu_enable_branch_predictor_hardening, From 455697adefdb8604cd10413da37c60014aecbbb7 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 23:10:49 +0100 Subject: [PATCH 121/148] arm64: Introduce separate file for spectre mitigations and reporting The spectre mitigation code is spread over a few different files, which makes it both hard to follow, but also hard to remove it should we want to do that in future. Introduce a new file for housing the spectre mitigations, and populate it with the spectre-v1 reporting code to start with. 
Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/cpu_errata.c | 6 ------ arch/arm64/kernel/proton-pack.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 arch/arm64/kernel/proton-pack.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ed8799bdd41f..15b0fcbb6883 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -19,7 +19,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - ssbd.o syscall.o + ssbd.o syscall.o proton-pack.o targets += efi-entry.o diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b275f2d5e7a3..5eb9a9126dc4 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -949,12 +949,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = { } }; -ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); -} - ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c new file mode 100644 index 000000000000..c37bf468e4a4 --- /dev/null +++ b/arch/arm64/kernel/proton-pack.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Handle detection, reporting and mitigation of Spectre v1, v2 and v4, as + * detailed at: + * + * https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability + * + * This code was originally written hastily under an awful lot of stress and so + * aspects of it are somewhat hacky. Unfortunately, changing anything in here + * instantly makes me feel ill. Thanks, Jann. Thann. + * + * Copyright (C) 2018 ARM Ltd, All Rights Reserved. + * Copyright (C) 2020 Google LLC + * + * "If there's something strange in your neighbourhood, who you gonna call?" + * + * Authors: Will Deacon and Marc Zyngier + */ + +#include + +/* + * Spectre v1. + * + * The kernel can't protect userspace for this one: it's each person for + * themselves. Advertise what we're doing and be done with it. + */ +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} From d4647f0a2ad711101067cba69c34716758aa1e48 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 23:30:17 +0100 Subject: [PATCH 122/148] arm64: Rewrite Spectre-v2 mitigation code The Spectre-v2 mitigation code is pretty unwieldy and hard to maintain. This is largely due to it being written hastily, without much clue as to how things would pan out, and also because it ends up mixing policy and state in such a way that it is very difficult to figure out what's going on. Rewrite the Spectre-v2 mitigation so that it clearly separates state from policy and follows a more structured approach to handling the mitigation. 
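In outline, the policy/state split in the rewritten code looks like this (a condensed reading aid extracted from the diff that follows, not additional code; the explanatory comments are mine):

        void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
        {
                enum mitigation_state state;

                WARN_ON(preemptible());

                /* Policy: try hardware first, then firmware, then the CPU-specific software dance. */
                state = spectre_v2_get_cpu_hw_mitigation_state();
                if (state == SPECTRE_VULNERABLE)
                        state = spectre_v2_enable_fw_mitigation();
                if (state == SPECTRE_VULNERABLE)
                        state = spectre_v2_enable_sw_mitigation();

                /* State: recorded once, and never allowed to regress when a late CPU comes online. */
                update_mitigation_state(&spectre_v2_state, state);
        }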
Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 6 - arch/arm64/include/asm/kvm_host.h | 17 -- arch/arm64/include/asm/processor.h | 1 + arch/arm64/include/asm/spectre.h | 27 +++ arch/arm64/kernel/cpu_errata.c | 236 +---------------------- arch/arm64/kernel/proton-pack.c | 288 ++++++++++++++++++++++++++++ arch/arm64/kvm/hypercalls.c | 8 +- arch/arm64/kvm/psci.c | 8 +- 8 files changed, 327 insertions(+), 264 deletions(-) create mode 100644 arch/arm64/include/asm/spectre.h diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 851d144527ed..3b48aa121cee 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -698,12 +698,6 @@ static inline bool system_supports_tlb_range(void) cpus_have_const_cap(ARM64_HAS_TLB_RANGE); } -#define ARM64_BP_HARDEN_UNKNOWN -1 -#define ARM64_BP_HARDEN_WA_NEEDED 0 -#define ARM64_BP_HARDEN_NOT_REQUIRED 1 - -int get_spectre_v2_workaround_state(void); - #define ARM64_SSBD_UNKNOWN -1 #define ARM64_SSBD_FORCE_DISABLE 0 #define ARM64_SSBD_KERNEL 1 diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e52c927aade5..59f6cff0f0cb 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -631,23 +631,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} #endif -#define KVM_BP_HARDEN_UNKNOWN -1 -#define KVM_BP_HARDEN_WA_NEEDED 0 -#define KVM_BP_HARDEN_NOT_REQUIRED 1 - -static inline int kvm_arm_harden_branch_predictor(void) -{ - switch (get_spectre_v2_workaround_state()) { - case ARM64_BP_HARDEN_WA_NEEDED: - return KVM_BP_HARDEN_WA_NEEDED; - case ARM64_BP_HARDEN_NOT_REQUIRED: - return KVM_BP_HARDEN_NOT_REQUIRED; - case ARM64_BP_HARDEN_UNKNOWN: - default: - return KVM_BP_HARDEN_UNKNOWN; - } -} - #define KVM_SSBD_UNKNOWN -1 #define KVM_SSBD_FORCE_DISABLE 0 #define KVM_SSBD_KERNEL 1 diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 240fe5e5b720..436ab1549ec6 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -38,6 +38,7 @@ #include #include #include +#include #include /* diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h new file mode 100644 index 000000000000..b776abe28dff --- /dev/null +++ b/arch/arm64/include/asm/spectre.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Interface for managing mitigations for Spectre vulnerabilities. + * + * Copyright (C) 2020 Google LLC + * Author: Will Deacon + */ + +#ifndef __ASM_SPECTRE_H +#define __ASM_SPECTRE_H + +#include + +/* Watch out, ordering is important here. 
*/ +enum mitigation_state { + SPECTRE_UNAFFECTED, + SPECTRE_MITIGATED, + SPECTRE_VULNERABLE, +}; + +struct task_struct; + +enum mitigation_state arm64_get_spectre_v2_state(void); +bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); +void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); + +#endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 5eb9a9126dc4..4103391f51b5 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -106,145 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); - -#include -#include - -DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); - -#ifdef CONFIG_RANDOMIZE_BASE -static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K); - int i; - - for (i = 0; i < SZ_2K; i += 0x80) - memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); - - __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); -} - -static void install_bp_hardening_cb(bp_hardening_cb_t fn) -{ - static DEFINE_RAW_SPINLOCK(bp_lock); - int cpu, slot = -1; - const char *hyp_vecs_start = __smccc_workaround_1_smc; - const char *hyp_vecs_end = __smccc_workaround_1_smc + - __SMCCC_WORKAROUND_1_SMC_SZ; - - /* - * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if - * we're a guest. Skip the hyp-vectors work. - */ - if (!is_hyp_mode_available()) { - __this_cpu_write(bp_hardening_data.fn, fn); - return; - } - - raw_spin_lock(&bp_lock); - for_each_possible_cpu(cpu) { - if (per_cpu(bp_hardening_data.fn, cpu) == fn) { - slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); - break; - } - } - - if (slot == -1) { - slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); - __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); - } - - __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); - __this_cpu_write(bp_hardening_data.fn, fn); - raw_spin_unlock(&bp_lock); -} -#else -static void install_bp_hardening_cb(bp_hardening_cb_t fn) -{ - __this_cpu_write(bp_hardening_data.fn, fn); -} -#endif /* CONFIG_RANDOMIZE_BASE */ - -#include - -static void __maybe_unused call_smc_arch_workaround_1(void) -{ - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void call_hvc_arch_workaround_1(void) -{ - arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); -} - -static void qcom_link_stack_sanitization(void) -{ - u64 tmp; - - asm volatile("mov %0, x30 \n" - ".rept 16 \n" - "bl . 
+ 4 \n" - ".endr \n" - "mov x30, %0 \n" - : "=&r" (tmp)); -} - -static bool __nospectre_v2; -static int __init parse_nospectre_v2(char *str) -{ - __nospectre_v2 = true; - return 0; -} -early_param("nospectre_v2", parse_nospectre_v2); - -/* - * -1: No workaround - * 0: No workaround required - * 1: Workaround installed - */ -static int detect_harden_bp_fw(void) -{ - bp_hardening_cb_t cb; - struct arm_smccc_res res; - u32 midr = read_cpuid_id(); - - arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_1, &res); - - switch ((int)res.a0) { - case 1: - /* Firmware says we're just fine */ - return 0; - case 0: - break; - default: - return -1; - } - - switch (arm_smccc_1_1_get_conduit()) { - case SMCCC_CONDUIT_HVC: - cb = call_hvc_arch_workaround_1; - break; - - case SMCCC_CONDUIT_SMC: - cb = call_smc_arch_workaround_1; - break; - - default: - return -1; - } - - if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || - ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) - cb = qcom_link_stack_sanitization; - - install_bp_hardening_cb(cb); - return 1; -} - DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; @@ -508,83 +369,6 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ CAP_MIDR_RANGE_LIST(midr_list) -/* Track overall mitigation state. We are only mitigated if all cores are ok */ -static bool __hardenbp_enab = true; -static bool __spectrev2_safe = true; - -int get_spectre_v2_workaround_state(void) -{ - if (__spectrev2_safe) - return ARM64_BP_HARDEN_NOT_REQUIRED; - - if (!__hardenbp_enab) - return ARM64_BP_HARDEN_UNKNOWN; - - return ARM64_BP_HARDEN_WA_NEEDED; -} - -/* - * List of CPUs that do not need any Spectre-v2 mitigation at all. - */ -static const struct midr_range spectre_v2_safe_list[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), - MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), - MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), - MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), - MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), - { /* sentinel */ } -}; - -/* - * Track overall bp hardening for all heterogeneous cores in the machine. - * We are only considered "safe" if all booted cores are known safe. 
- */ -static bool __maybe_unused -check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) -{ - int need_wa; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - - /* If the CPU has CSV2 set, we're safe */ - if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1), - ID_AA64PFR0_CSV2_SHIFT)) - return false; - - /* Alternatively, we have a list of unaffected CPUs */ - if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) - return false; - - /* Fallback to firmware detection */ - need_wa = detect_harden_bp_fw(); - if (!need_wa) - return false; - - __spectrev2_safe = false; - - /* forced off */ - if (__nospectre_v2 || cpu_mitigations_off()) { - pr_info_once("spectrev2 mitigation disabled by command line option\n"); - __hardenbp_enab = false; - return false; - } - - if (need_wa < 0) { - pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n"); - __hardenbp_enab = false; - } - - return (need_wa > 0); -} - -static void -cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap) -{ - cap->matches(cap, SCOPE_LOCAL_CPU); -} - static const __maybe_unused struct midr_range tx2_family_cpus[] = { MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), @@ -876,11 +660,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { }, #endif { - .desc = "Branch predictor hardening", + .desc = "Spectre-v2", .capability = ARM64_SPECTRE_V2, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, - .matches = check_branch_predictor, - .cpu_enable = cpu_enable_branch_predictor_hardening, + .matches = has_spectre_v2, + .cpu_enable = spectre_v2_enable_mitigation, }, #ifdef CONFIG_RANDOMIZE_BASE { @@ -949,20 +733,6 @@ const struct arm64_cpu_capabilities arm64_errata[] = { } }; -ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, - char *buf) -{ - switch (get_spectre_v2_workaround_state()) { - case ARM64_BP_HARDEN_NOT_REQUIRED: - return sprintf(buf, "Not affected\n"); - case ARM64_BP_HARDEN_WA_NEEDED: - return sprintf(buf, "Mitigation: Branch predictor hardening\n"); - case ARM64_BP_HARDEN_UNKNOWN: - default: - return sprintf(buf, "Vulnerable\n"); - } -} - ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index c37bf468e4a4..4f1411b45301 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -17,7 +17,33 @@ * Authors: Will Deacon and Marc Zyngier */ +#include +#include #include +#include + +#include +#include + +/* + * We try to ensure that the mitigation state can never change as the result of + * onlining a late CPU. + */ +static void update_mitigation_state(enum mitigation_state *oldp, + enum mitigation_state new) +{ + enum mitigation_state state; + + do { + state = READ_ONCE(*oldp); + if (new <= state) + break; + + /* Userspace almost certainly can't deal with this. */ + if (WARN_ON(system_capabilities_finalized())) + break; + } while (cmpxchg_relaxed(oldp, state, new) != state); +} /* * Spectre v1. @@ -30,3 +56,265 @@ ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, { return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } + +/* + * Spectre v2. + * + * This one sucks. A CPU is either: + * + * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2. + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in software by firmware. 
+ * - Mitigated in software by a CPU-specific dance in the kernel. + * - Vulnerable. + * + * It's not unlikely for different CPUs in a big.LITTLE system to fall into + * different camps. + */ +static enum mitigation_state spectre_v2_state; + +static bool __read_mostly __nospectre_v2; +static int __init parse_spectre_v2_param(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_spectre_v2_param); + +static bool spectre_v2_mitigations_off(void) +{ + bool ret = __nospectre_v2 || cpu_mitigations_off(); + + if (ret) + pr_info_once("spectre-v2 mitigation disabled by command line option\n"); + + return ret; +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + switch (spectre_v2_state) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void) +{ + u64 pfr0; + static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ } + }; + + /* If the CPU has CSV2 set, we're safe */ + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + return SPECTRE_UNAFFECTED; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + return SPECTRE_UNAFFECTED; + + return SPECTRE_VULNERABLE; +} + +#define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED (1) + +static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope) +{ + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + if (spectre_v2_get_cpu_hw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + if (spectre_v2_get_cpu_fw_mitigation_state() == SPECTRE_UNAFFECTED) + return false; + + return true; +} + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +enum mitigation_state arm64_get_spectre_v2_state(void) +{ + return spectre_v2_state; +} + +#ifdef CONFIG_KVM +#ifdef CONFIG_RANDOMIZE_BASE +#include +#include + +atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); + +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + void *dst = lm_alias(__bp_harden_hyp_vecs + slot * SZ_2K); + int i; + + for (i = 0; i < SZ_2K; i += 0x80) + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); + + __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); +} + +static void install_bp_hardening_cb(bp_hardening_cb_t fn) +{ + static DEFINE_RAW_SPINLOCK(bp_lock); + int cpu, slot = -1; + const char *hyp_vecs_start = 
__smccc_workaround_1_smc; + const char *hyp_vecs_end = __smccc_workaround_1_smc + + __SMCCC_WORKAROUND_1_SMC_SZ; + + /* + * detect_harden_bp_fw() passes NULL for the hyp_vecs start/end if + * we're a guest. Skip the hyp-vectors work. + */ + if (!is_hyp_mode_available()) { + __this_cpu_write(bp_hardening_data.fn, fn); + return; + } + + raw_spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.fn, cpu) == fn) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + slot = atomic_inc_return(&arm64_el2_vector_last_slot); + BUG_ON(slot >= BP_HARDEN_EL2_SLOTS); + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.fn, fn); + raw_spin_unlock(&bp_lock); +} +#else +static void install_bp_hardening_cb(bp_hardening_cb_t fn) +{ + __this_cpu_write(bp_hardening_data.fn, fn); +} +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_KVM */ + +static void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static void qcom_link_stack_sanitisation(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static enum mitigation_state spectre_v2_enable_fw_mitigation(void) +{ + bp_hardening_cb_t cb; + enum mitigation_state state; + + state = spectre_v2_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v2_mitigations_off()) + return SPECTRE_VULNERABLE; + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + cb = call_hvc_arch_workaround_1; + break; + + case SMCCC_CONDUIT_SMC: + cb = call_smc_arch_workaround_1; + break; + + default: + return SPECTRE_VULNERABLE; + } + + install_bp_hardening_cb(cb); + return SPECTRE_MITIGATED; +} + +static enum mitigation_state spectre_v2_enable_sw_mitigation(void) +{ + u32 midr; + + if (spectre_v2_mitigations_off()) + return SPECTRE_VULNERABLE; + + midr = read_cpuid_id(); + if (((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR) && + ((midr & MIDR_CPU_MODEL_MASK) != MIDR_QCOM_FALKOR_V1)) + return SPECTRE_VULNERABLE; + + install_bp_hardening_cb(qcom_link_stack_sanitisation); + return SPECTRE_MITIGATED; +} + +void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v2_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v2_enable_fw_mitigation(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v2_enable_sw_mitigation(); + + update_mitigation_state(&spectre_v2_state, state); +} diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 550dfa3e53cd..413d46b9bc07 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -24,13 +24,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) feature = smccc_get_arg1(vcpu); switch (feature) { case ARM_SMCCC_ARCH_WORKAROUND_1: - switch (kvm_arm_harden_branch_predictor()) { - case KVM_BP_HARDEN_UNKNOWN: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: break; - case KVM_BP_HARDEN_WA_NEEDED: + case SPECTRE_MITIGATED: val = SMCCC_RET_SUCCESS; break; - case KVM_BP_HARDEN_NOT_REQUIRED: + case SPECTRE_UNAFFECTED: val = SMCCC_RET_NOT_REQUIRED; break; } diff --git 
a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 83415e96b589..fbdd6f3bea7f 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -425,12 +425,12 @@ static int get_kernel_wa_level(u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - switch (kvm_arm_harden_branch_predictor()) { - case KVM_BP_HARDEN_UNKNOWN: + switch (arm64_get_spectre_v2_state()) { + case SPECTRE_VULNERABLE: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; - case KVM_BP_HARDEN_WA_NEEDED: + case SPECTRE_MITIGATED: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL; - case KVM_BP_HARDEN_NOT_REQUIRED: + case SPECTRE_UNAFFECTED: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED; } return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; From e1026237f90677fd5a454f63057a62f984c2188d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 16 Sep 2020 00:07:05 +0100 Subject: [PATCH 123/148] KVM: arm64: Set CSV2 for guests on hardware unaffected by Spectre-v2 If the system is not affected by Spectre-v2, then advertise to the KVM guest that it is not affected, without the need for a safelist in the guest. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kvm/sys_regs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 077293b5115f..7b8a8f6169d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1131,6 +1131,9 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_sve(vcpu)) val &= ~(0xfUL << ID_AA64PFR0_SVE_SHIFT); val &= ~(0xfUL << ID_AA64PFR0_AMU_SHIFT); + if (!(val & (0xfUL << ID_AA64PFR0_CSV2_SHIFT)) && + arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) + val |= (1UL << ID_AA64PFR0_CSV2_SHIFT); } else if (id == SYS_ID_AA64ISAR1_EL1 && !vcpu_has_ptrauth(vcpu)) { val &= ~((0xfUL << ID_AA64ISAR1_APA_SHIFT) | (0xfUL << ID_AA64ISAR1_API_SHIFT) | From a8de949893880a26458de03f5bc70075aba13d95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 22:20:35 +0100 Subject: [PATCH 124/148] arm64: Group start_thread() functions together The is_ttbrX_addr() functions have somehow ended up in the middle of the start_thread() functions, so move them out of the way to keep the code readable. 
Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 436ab1549ec6..d636c3eb9cf6 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -220,18 +220,6 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc, regs->sp = sp; } -static inline bool is_ttbr0_addr(unsigned long addr) -{ - /* entry assembly clears tags for TTBR0 addrs */ - return addr < TASK_SIZE; -} - -static inline bool is_ttbr1_addr(unsigned long addr) -{ - /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */ - return arch_kasan_reset_tag(addr) >= PAGE_OFFSET; -} - #ifdef CONFIG_COMPAT static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) @@ -252,6 +240,18 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, } #endif +static inline bool is_ttbr0_addr(unsigned long addr) +{ + /* entry assembly clears tags for TTBR0 addrs */ + return addr < TASK_SIZE; +} + +static inline bool is_ttbr1_addr(unsigned long addr) +{ + /* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */ + return arch_kasan_reset_tag(addr) >= PAGE_OFFSET; +} + /* Forward declaration, a strange C thing */ struct task_struct; From 532d581583f25d4c297c721700f74b913f8cf37b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 23:56:12 +0100 Subject: [PATCH 125/148] arm64: Treat SSBS as a non-strict system feature If all CPUs discovered during boot have SSBS, then spectre-v4 will be considered to be "mitigated". However, we still allow late CPUs without SSBS to be onlined, albeit with a "SANITY CHECK" warning. This is problematic for userspace because it means that the system can quietly transition to "Vulnerable" at runtime. Avoid this by treating SSBS as a non-strict system feature: if all of the CPUs discovered during boot have SSBS, then late arriving secondaries better have it as well. 
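As a rough illustration of the strict/non-strict distinction the cpufeature
hunk below relies on (a standalone sketch with made-up names, not the code in
cpufeature.c): a LOWER_SAFE ID register field is sanitised to the lowest value
seen across CPUs, and only *strict* fields trip the "SANITY CHECK" taint path
on a mismatch.

	/*
	 * Illustrative sketch only. With FTR_NONSTRICT, a late CPU whose
	 * field value differs no longer triggers the taint/warning; the
	 * sanitised system value is still the lower (safer) of the two.
	 */
	static int sanitise_lower_safe(int sys_val, int cpu_val,
				       int strict, int *taint)
	{
		if (strict && cpu_val != sys_val)
			*taint = 1;	/* "SANITY CHECK" path */

		return cpu_val < sys_val ? cpu_val : sys_val;
	}

Together with promoting the capability to ARM64_CPUCAP_SYSTEM_FEATURE, the
intended effect is that a late CPU without SSBS can no longer quietly
downgrade the reported state to "Vulnerable".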
Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4bb45b1d4ae4..033e6c952705 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -227,7 +227,7 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_MPAMFRAC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_RASFRAC_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_SSBS_SHIFT, 4, ID_AA64PFR1_SSBS_PSTATE_NI), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_BT_SHIFT, 4, 0), ARM64_FTR_END, @@ -487,7 +487,7 @@ static const struct arm64_ftr_bits ftr_id_pfr1[] = { }; static const struct arm64_ftr_bits ftr_id_pfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_SSBS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_CSV3_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -1977,7 +1977,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Speculative Store Bypassing Safe (SSBS)", .capability = ARM64_SSBS, - .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, .matches = has_cpuid_feature, .sys_reg = SYS_ID_AA64PFR1_EL1, .field_pos = ID_AA64PFR1_SSBS_SHIFT, From 9b0955baa4208abd72840c13c5f56a0f133c7cb3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Sep 2020 23:00:31 +0100 Subject: [PATCH 126/148] arm64: Rename ARM64_SSBD to ARM64_SPECTRE_V4 In a similar manner to the renaming of ARM64_HARDEN_BRANCH_PREDICTOR to ARM64_SPECTRE_V2, rename ARM64_SSBD to ARM64_SPECTRE_V4. This isn't _entirely_ accurate, as we also need to take into account the interaction with SSBS, but that will be taken care of in subsequent patches. 
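The effect on call sites is purely mechanical; a condensed before/after sketch
(mirroring the switch.h hunk below):

	/* Before the rename */
	if (!cpus_have_final_cap(ARM64_SSBD))
		return false;

	/* After: the same check, under the Spectre-v4 name */
	if (!cpus_have_final_cap(ARM64_SPECTRE_V4))
		return false;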
Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpucaps.h | 2 +- arch/arm64/kernel/cpu_errata.c | 2 +- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 348bfcf6c818..c4ac9a13ad5f 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -37,7 +37,7 @@ #define ARM64_HAS_CACHE_IDC 27 #define ARM64_HAS_CACHE_DIC 28 #define ARM64_HW_DBM 29 -#define ARM64_SSBD 30 +#define ARM64_SPECTRE_V4 30 #define ARM64_MISMATCHED_CACHE_TYPE 31 #define ARM64_HAS_STAGE2_FWB 32 #define ARM64_HAS_CRC32 33 diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 4103391f51b5..6a48957d44fc 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -675,7 +675,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = { #endif { .desc = "Speculative Store Bypass Disable", - .capability = ARM64_SSBD, + .capability = ARM64_SPECTRE_V4, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = has_ssbd_mitigation, .cpu_enable = cpu_enable_ssbd_mitigation, diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index b503f19c37c5..5dec5589625d 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -481,7 +481,7 @@ exit: static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu) { - if (!cpus_have_final_cap(ARM64_SSBD)) + if (!cpus_have_final_cap(ARM64_SPECTRE_V4)) return false; return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG); From 9e78b659b4539ee20fd0c415cf8c231cea59e9c0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 18 Sep 2020 11:45:57 +0100 Subject: [PATCH 127/148] arm64: Move SSBD prctl() handler alongside other spectre mitigation code As part of the spectre consolidation effort to shift all of the ghosts into their own proton pack, move all of the horrible SSBD prctl() code out of its own 'ssbd.c' file. Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 2 +- arch/arm64/kernel/proton-pack.c | 118 +++++++++++++++++++++++++++++ arch/arm64/kernel/ssbd.c | 129 -------------------------------- 3 files changed, 119 insertions(+), 130 deletions(-) delete mode 100644 arch/arm64/kernel/ssbd.c diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 15b0fcbb6883..bd12b9a2ab4a 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -19,7 +19,7 @@ obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ return_address.o cpuinfo.o cpu_errata.o \ cpufeature.o alternative.o cacheinfo.o \ smp.o smp_spin_table.o topology.o smccc-call.o \ - ssbd.o syscall.o proton-pack.o + syscall.o proton-pack.o targets += efi-entry.o diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 4f1411b45301..b373e4782ccd 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -318,3 +319,120 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) update_mitigation_state(&spectre_v2_state, state); } + +/* Spectre v4 prctl */ +static void ssbd_ssbs_enable(struct task_struct *task) +{ + u64 val = is_compat_thread(task_thread_info(task)) ? + PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + task_pt_regs(task)->pstate |= val; +} + +static void ssbd_ssbs_disable(struct task_struct *task) +{ + u64 val = is_compat_thread(task_thread_info(task)) ? 
+ PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + task_pt_regs(task)->pstate &= ~val; +} + +/* + * prctl interface for SSBD + */ +static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + int state = arm64_get_ssbd_state(); + + /* Unsupported */ + if (state == ARM64_SSBD_UNKNOWN) + return -ENODEV; + + /* Treat the unaffected/mitigated state separately */ + if (state == ARM64_SSBD_MITIGATED) { + switch (ctrl) { + case PR_SPEC_ENABLE: + return -EPERM; + case PR_SPEC_DISABLE: + case PR_SPEC_FORCE_DISABLE: + return 0; + } + } + + /* + * Things are a bit backward here: the arm64 internal API + * *enables the mitigation* when the userspace API *disables + * speculation*. So much fun. + */ + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (state == ARM64_SSBD_FORCE_ENABLE || + task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + clear_tsk_thread_flag(task, TIF_SSBD); + ssbd_ssbs_enable(task); + break; + case PR_SPEC_DISABLE: + if (state == ARM64_SSBD_FORCE_DISABLE) + return -EPERM; + task_set_spec_ssb_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); + ssbd_ssbs_disable(task); + break; + case PR_SPEC_FORCE_DISABLE: + if (state == ARM64_SSBD_FORCE_DISABLE) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); + ssbd_ssbs_disable(task); + break; + default: + return -ERANGE; + } + + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +static int ssbd_prctl_get(struct task_struct *task) +{ + switch (arm64_get_ssbd_state()) { + case ARM64_SSBD_UNKNOWN: + return -ENODEV; + case ARM64_SSBD_FORCE_ENABLE: + return PR_SPEC_DISABLE; + case ARM64_SSBD_KERNEL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + case ARM64_SSBD_FORCE_DISABLE: + return PR_SPEC_ENABLE; + default: + return PR_SPEC_NOT_AFFECTED; + } +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssbd_prctl_get(task); + default: + return -ENODEV; + } +} diff --git a/arch/arm64/kernel/ssbd.c b/arch/arm64/kernel/ssbd.c deleted file mode 100644 index b26955f56750..000000000000 --- a/arch/arm64/kernel/ssbd.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 ARM Ltd, All Rights Reserved. - */ - -#include -#include -#include -#include -#include -#include - -#include - -static void ssbd_ssbs_enable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate |= val; -} - -static void ssbd_ssbs_disable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? 
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; - - task_pt_regs(task)->pstate &= ~val; -} - -/* - * prctl interface for SSBD - */ -static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) -{ - int state = arm64_get_ssbd_state(); - - /* Unsupported */ - if (state == ARM64_SSBD_UNKNOWN) - return -ENODEV; - - /* Treat the unaffected/mitigated state separately */ - if (state == ARM64_SSBD_MITIGATED) { - switch (ctrl) { - case PR_SPEC_ENABLE: - return -EPERM; - case PR_SPEC_DISABLE: - case PR_SPEC_FORCE_DISABLE: - return 0; - } - } - - /* - * Things are a bit backward here: the arm64 internal API - * *enables the mitigation* when the userspace API *disables - * speculation*. So much fun. - */ - switch (ctrl) { - case PR_SPEC_ENABLE: - /* If speculation is force disabled, enable is not allowed */ - if (state == ARM64_SSBD_FORCE_ENABLE || - task_spec_ssb_force_disable(task)) - return -EPERM; - task_clear_spec_ssb_disable(task); - clear_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_enable(task); - break; - case PR_SPEC_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - case PR_SPEC_FORCE_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - task_set_spec_ssb_force_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); - break; - default: - return -ERANGE; - } - - return 0; -} - -int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, - unsigned long ctrl) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_set(task, ctrl); - default: - return -ENODEV; - } -} - -static int ssbd_prctl_get(struct task_struct *task) -{ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_UNKNOWN: - return -ENODEV; - case ARM64_SSBD_FORCE_ENABLE: - return PR_SPEC_DISABLE; - case ARM64_SSBD_KERNEL: - if (task_spec_ssb_force_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; - if (task_spec_ssb_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_DISABLE; - return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case ARM64_SSBD_FORCE_DISABLE: - return PR_SPEC_ENABLE; - default: - return PR_SPEC_NOT_AFFECTED; - } -} - -int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) -{ - switch (which) { - case PR_SPEC_STORE_BYPASS: - return ssbd_prctl_get(task); - default: - return -ENODEV; - } -} From c28762070ca651fe7a981b8f31d972c9b7d2c386 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 18 Sep 2020 11:54:33 +0100 Subject: [PATCH 128/148] arm64: Rewrite Spectre-v4 mitigation code Rewrite the Spectre-v4 mitigation handling code to follow the same approach as that taken by Spectre-v2. For now, report to KVM that the system is vulnerable (by forcing 'ssbd_state' to ARM64_SSBD_UNKNOWN), as this will be cleared up in subsequent steps. 
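Since the rewrite also reshapes the PR_SPEC_STORE_BYPASS prctl() handlers, a
minimal userspace sketch of that interface may help as a reference. It is
illustrative only and not part of the patch; it assumes a Linux toolchain
where <linux/prctl.h> provides the PR_SPEC_* constants.

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		/* Query the current Speculative Store Bypass control. */
		int state = prctl(PR_GET_SPECULATION_CTRL,
				  PR_SPEC_STORE_BYPASS, 0, 0, 0);

		if (state < 0)
			perror("PR_GET_SPECULATION_CTRL");
		else
			printf("SSB control: 0x%x\n", (unsigned int)state);

		/* Opt this task into the mitigation (disable speculation). */
		if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
			  PR_SPEC_DISABLE, 0, 0))
			perror("PR_SET_SPECULATION_CTRL");

		return 0;
	}

The returned bitmask follows the generic speculation-control ABI
(PR_SPEC_PRCTL | PR_SPEC_ENABLE/DISABLE/FORCE_DISABLE, or
PR_SPEC_NOT_AFFECTED), which is what the reworked ssbd_prctl_get() below maps
the new SPECTRE_* states onto.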
Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 19 +- arch/arm64/include/asm/spectre.h | 5 + arch/arm64/kernel/cpu_errata.c | 217 +------------- arch/arm64/kernel/cpufeature.c | 41 --- arch/arm64/kernel/entry.S | 6 +- arch/arm64/kernel/hibernate.c | 6 +- arch/arm64/kernel/process.c | 17 +- arch/arm64/kernel/proton-pack.c | 447 +++++++++++++++++++++++++---- arch/arm64/kernel/suspend.c | 3 +- 9 files changed, 405 insertions(+), 356 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index d636c3eb9cf6..7d90ea2e2063 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -198,25 +198,12 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) regs->pmr_save = GIC_PRIO_IRQON; } -static inline void set_ssbs_bit(struct pt_regs *regs) -{ - regs->pstate |= PSR_SSBS_BIT; -} - -static inline void set_compat_ssbs_bit(struct pt_regs *regs) -{ - regs->pstate |= PSR_AA32_SSBS_BIT; -} - static inline void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { start_thread_common(regs, pc); regs->pstate = PSR_MODE_EL0t; - - if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE) - set_ssbs_bit(regs); - + spectre_v4_enable_task_mitigation(current); regs->sp = sp; } @@ -233,9 +220,7 @@ static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc, regs->pstate |= PSR_AA32_E_BIT; #endif - if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE) - set_compat_ssbs_bit(regs); - + spectre_v4_enable_task_mitigation(current); regs->compat_sp = sp; } #endif diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index b776abe28dff..fcdfbce302bd 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -24,4 +24,9 @@ enum mitigation_state arm64_get_spectre_v2_state(void); bool has_spectre_v2(const struct arm64_cpu_capabilities *cap, int scope); void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused); +enum mitigation_state arm64_get_spectre_v4_state(void); +bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope); +void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused); +void spectre_v4_enable_task_mitigation(struct task_struct *tsk); + #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 6a48957d44fc..7fc54c3d4285 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -106,62 +106,7 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); - -int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; -static bool __ssb_safe = true; - -static const struct ssbd_options { - const char *str; - int state; -} ssbd_options[] = { - { "force-on", ARM64_SSBD_FORCE_ENABLE, }, - { "force-off", ARM64_SSBD_FORCE_DISABLE, }, - { "kernel", ARM64_SSBD_KERNEL, }, -}; - -static int __init ssbd_cfg(char *buf) -{ - int i; - - if (!buf || !buf[0]) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(ssbd_options); i++) { - int len = strlen(ssbd_options[i].str); - - if (strncmp(buf, ssbd_options[i].str, len)) - continue; - - ssbd_state = ssbd_options[i].state; - return 0; - } - - return -EINVAL; -} -early_param("ssbd", ssbd_cfg); - -void __init arm64_update_smccc_conduit(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - u32 insn; - - 
BUG_ON(nr_inst != 1); - - switch (arm_smccc_1_1_get_conduit()) { - case SMCCC_CONDUIT_HVC: - insn = aarch64_insn_get_hvc_value(); - break; - case SMCCC_CONDUIT_SMC: - insn = aarch64_insn_get_smc_value(); - break; - default: - return; - } - - *updptr = cpu_to_le32(insn); -} +int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN; void __init arm64_enable_wa2_handling(struct alt_instr *alt, __le32 *origptr, __le32 *updptr, @@ -177,144 +122,6 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt, *updptr = cpu_to_le32(aarch64_insn_gen_nop()); } -void arm64_set_ssbd_mitigation(bool state) -{ - int conduit; - - if (this_cpu_has_cap(ARM64_SSBS)) { - if (state) - asm volatile(SET_PSTATE_SSBS(0)); - else - asm volatile(SET_PSTATE_SSBS(1)); - return; - } - - conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, state, - NULL); - - WARN_ON_ONCE(conduit == SMCCC_CONDUIT_NONE); -} - -static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, - int scope) -{ - struct arm_smccc_res res; - bool required = true; - s32 val; - bool this_cpu_safe = false; - int conduit; - - WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); - - if (cpu_mitigations_off()) - ssbd_state = ARM64_SSBD_FORCE_DISABLE; - - /* delay setting __ssb_safe until we get a firmware response */ - if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list)) - this_cpu_safe = true; - - if (this_cpu_has_cap(ARM64_SSBS)) { - if (!this_cpu_safe) - __ssb_safe = false; - required = false; - goto out_printmsg; - } - - conduit = arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, - ARM_SMCCC_ARCH_WORKAROUND_2, &res); - - if (conduit == SMCCC_CONDUIT_NONE) { - ssbd_state = ARM64_SSBD_UNKNOWN; - if (!this_cpu_safe) - __ssb_safe = false; - return false; - } - - val = (s32)res.a0; - - switch (val) { - case SMCCC_RET_NOT_SUPPORTED: - ssbd_state = ARM64_SSBD_UNKNOWN; - if (!this_cpu_safe) - __ssb_safe = false; - return false; - - /* machines with mixed mitigation requirements must not return this */ - case SMCCC_RET_NOT_REQUIRED: - pr_info_once("%s mitigation not required\n", entry->desc); - ssbd_state = ARM64_SSBD_MITIGATED; - return false; - - case SMCCC_RET_SUCCESS: - __ssb_safe = false; - required = true; - break; - - case 1: /* Mitigation not required on this CPU */ - required = false; - break; - - default: - WARN_ON(1); - if (!this_cpu_safe) - __ssb_safe = false; - return false; - } - - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - arm64_set_ssbd_mitigation(false); - required = false; - break; - - case ARM64_SSBD_KERNEL: - if (required) { - __this_cpu_write(arm64_ssbd_callback_required, 1); - arm64_set_ssbd_mitigation(true); - } - break; - - case ARM64_SSBD_FORCE_ENABLE: - arm64_set_ssbd_mitigation(true); - required = true; - break; - - default: - WARN_ON(1); - break; - } - -out_printmsg: - switch (ssbd_state) { - case ARM64_SSBD_FORCE_DISABLE: - pr_info_once("%s disabled from command-line\n", entry->desc); - break; - - case ARM64_SSBD_FORCE_ENABLE: - pr_info_once("%s forced from command-line\n", entry->desc); - break; - } - - return required; -} - -static void cpu_enable_ssbd_mitigation(const struct arm64_cpu_capabilities *cap) -{ - if (ssbd_state != ARM64_SSBD_FORCE_DISABLE) - cap->matches(cap, SCOPE_LOCAL_CPU); -} - -/* known invulnerable cores */ -static const struct midr_range arm64_ssb_cpus[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), - MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), - MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), - 
MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), - {}, -}; - #ifdef CONFIG_ARM64_ERRATUM_1463225 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); @@ -674,12 +481,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { }, #endif { - .desc = "Speculative Store Bypass Disable", + .desc = "Spectre-v4", .capability = ARM64_SPECTRE_V4, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, - .matches = has_ssbd_mitigation, - .cpu_enable = cpu_enable_ssbd_mitigation, - .midr_range_list = arm64_ssb_cpus, + .matches = has_spectre_v4, + .cpu_enable = spectre_v4_enable_mitigation, }, #ifdef CONFIG_ARM64_ERRATUM_1418040 { @@ -732,18 +538,3 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { } }; - -ssize_t cpu_show_spec_store_bypass(struct device *dev, - struct device_attribute *attr, char *buf) -{ - if (__ssb_safe) - return sprintf(buf, "Not affected\n"); - - switch (ssbd_state) { - case ARM64_SSBD_KERNEL: - case ARM64_SSBD_FORCE_ENABLE: - return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); - } - - return sprintf(buf, "Vulnerable\n"); -} diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 033e6c952705..a4debb63ebfb 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1583,46 +1583,6 @@ static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) WARN_ON(val & (7 << 27 | 7 << 21)); } -static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) -{ - if (user_mode(regs)) - return 1; - - if (instr & BIT(PSTATE_Imm_shift)) - regs->pstate |= PSR_SSBS_BIT; - else - regs->pstate &= ~PSR_SSBS_BIT; - - arm64_skip_faulting_instruction(regs, 4); - return 0; -} - -static struct undef_hook ssbs_emulation_hook = { - .instr_mask = ~(1U << PSTATE_Imm_shift), - .instr_val = 0xd500401f | PSTATE_SSBS, - .fn = ssbs_emulation_handler, -}; - -static void cpu_enable_ssbs(const struct arm64_cpu_capabilities *__unused) -{ - static bool undef_hook_registered = false; - static DEFINE_RAW_SPINLOCK(hook_lock); - - raw_spin_lock(&hook_lock); - if (!undef_hook_registered) { - register_undef_hook(&ssbs_emulation_hook); - undef_hook_registered = true; - } - raw_spin_unlock(&hook_lock); - - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { - sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); - arm64_set_ssbd_mitigation(false); - } else { - arm64_set_ssbd_mitigation(true); - } -} - #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { @@ -1983,7 +1943,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR1_SSBS_SHIFT, .sign = FTR_UNSIGNED, .min_field_value = ID_AA64PFR1_SSBS_PSTATE_ONLY, - .cpu_enable = cpu_enable_ssbs, }, #ifdef CONFIG_ARM64_CNP { diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 81b709349d7b..aeb337029d56 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -132,8 +132,8 @@ alternative_else_nop_endif * them if required. 
*/ .macro apply_ssbd, state, tmp1, tmp2 -alternative_cb arm64_enable_wa2_handling - b .L__asm_ssbd_skip\@ +alternative_cb spectre_v4_patch_fw_mitigation_enable + b .L__asm_ssbd_skip\@ // Patched to NOP alternative_cb_end ldr_this_cpu \tmp2, arm64_ssbd_callback_required, \tmp1 cbz \tmp2, .L__asm_ssbd_skip\@ @@ -141,7 +141,7 @@ alternative_cb_end tbnz \tmp2, #TIF_SSBD, .L__asm_ssbd_skip\@ mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 mov w1, #\state -alternative_cb arm64_update_smccc_conduit +alternative_cb spectre_v4_patch_fw_mitigation_conduit nop // Patched to SMC/HVC #0 alternative_cb_end .L__asm_ssbd_skip\@: diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 68e14152d6e9..c7b00120dc3e 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -332,11 +332,7 @@ int swsusp_arch_suspend(void) * mitigation off behind our back, let's set the state * to what we expect it to be. */ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_ENABLE: - case ARM64_SSBD_KERNEL: - arm64_set_ssbd_mitigation(true); - } + spectre_v4_enable_mitigation(NULL); } local_daif_restore(flags); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f1804496b935..9dbd35b95253 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -421,8 +421,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, cpus_have_const_cap(ARM64_HAS_UAO)) childregs->pstate |= PSR_UAO_BIT; - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - set_ssbs_bit(childregs); + spectre_v4_enable_task_mitigation(p); if (system_uses_irq_prio_masking()) childregs->pmr_save = GIC_PRIO_IRQON; @@ -472,8 +471,6 @@ void uao_thread_switch(struct task_struct *next) */ static void ssbs_thread_switch(struct task_struct *next) { - struct pt_regs *regs = task_pt_regs(next); - /* * Nothing to do for kernel threads, but 'regs' may be junk * (e.g. idle task) so check the flags and bail early. @@ -485,18 +482,10 @@ static void ssbs_thread_switch(struct task_struct *next) * If all CPUs implement the SSBS extension, then we just need to * context-switch the PSTATE field. */ - if (cpu_have_feature(cpu_feature(SSBS))) + if (cpus_have_const_cap(ARM64_SSBS)) return; - /* If the mitigation is enabled, then we leave SSBS clear. */ - if ((arm64_get_ssbd_state() == ARM64_SSBD_FORCE_ENABLE) || - test_tsk_thread_flag(next, TIF_SSBD)) - return; - - if (compat_user_mode(regs)) - set_compat_ssbs_bit(regs); - else if (user_mode(regs)) - set_ssbs_bit(regs); + spectre_v4_enable_task_mitigation(next); } /* diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index b373e4782ccd..b1ea935fd948 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -320,79 +320,394 @@ void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused) update_mitigation_state(&spectre_v2_state, state); } -/* Spectre v4 prctl */ -static void ssbd_ssbs_enable(struct task_struct *task) -{ - u64 val = is_compat_thread(task_thread_info(task)) ? - PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; +/* + * Spectre v4. + * + * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is + * either: + * + * - Mitigated in hardware and listed in our "safe list". + * - Mitigated in hardware via PSTATE.SSBS. + * - Mitigated in software by firmware (sometimes referred to as SSBD). + * + * Wait, that doesn't sound so bad, does it? Keep reading... 
+ * + * A major source of headaches is that the software mitigation is enabled both + * on a per-task basis, but can also be forced on for the kernel, necessitating + * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs + * allow EL0 to toggle SSBS directly, which can end up with the prctl() state + * being stale when re-entering the kernel. The usual big.LITTLE caveats apply, + * so you can have systems that have both firmware and SSBS mitigations. This + * means we actually have to reject late onlining of CPUs with mitigations if + * all of the currently onlined CPUs are safelisted, as the mitigation tends to + * be opt-in for userspace. Yes, really, the cure is worse than the disease. + * + * The only good part is that if the firmware mitigation is present, then it is + * present for all CPUs, meaning we don't have to worry about late onlining of a + * vulnerable CPU if one of the boot CPUs is using the firmware mitigation. + * + * Give me a VAX-11/780 any day of the week... + */ +static enum mitigation_state spectre_v4_state; - task_pt_regs(task)->pstate |= val; +/* This is the per-cpu state tracking whether we need to talk to firmware */ +DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); + +enum spectre_v4_policy { + SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, + SPECTRE_V4_POLICY_MITIGATION_ENABLED, + SPECTRE_V4_POLICY_MITIGATION_DISABLED, +}; + +static enum spectre_v4_policy __read_mostly __spectre_v4_policy; + +static const struct spectre_v4_param { + const char *str; + enum spectre_v4_policy policy; +} spectre_v4_params[] = { + { "force-on", SPECTRE_V4_POLICY_MITIGATION_ENABLED, }, + { "force-off", SPECTRE_V4_POLICY_MITIGATION_DISABLED, }, + { "kernel", SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, }, +}; +static int __init parse_spectre_v4_param(char *str) +{ + int i; + + if (!str || !str[0]) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(spectre_v4_params); i++) { + const struct spectre_v4_param *param = &spectre_v4_params[i]; + + if (strncmp(str, param->str, strlen(param->str))) + continue; + + __spectre_v4_policy = param->policy; + return 0; + } + + return -EINVAL; +} +early_param("ssbd", parse_spectre_v4_param); + +/* + * Because this was all written in a rush by people working in different silos, + * we've ended up with multiple command line options to control the same thing. + * Wrap these up in some helpers, which prefer disabling the mitigation if faced + * with contradictory parameters. The mitigation is always either "off", + * "dynamic" or "on". + */ +static bool spectre_v4_mitigations_off(void) +{ + bool ret = cpu_mitigations_off() || + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED; + + if (ret) + pr_info_once("spectre-v4 mitigation disabled by command-line option\n"); + + return ret; } -static void ssbd_ssbs_disable(struct task_struct *task) +/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */ +static bool spectre_v4_mitigations_dynamic(void) { - u64 val = is_compat_thread(task_thread_info(task)) ? 
- PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC; +} - task_pt_regs(task)->pstate &= ~val; +static bool spectre_v4_mitigations_on(void) +{ + return !spectre_v4_mitigations_off() && + __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED; +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: + return sprintf(buf, "Not affected\n"); + case SPECTRE_MITIGATED: + return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n"); + case SPECTRE_VULNERABLE: + fallthrough; + default: + return sprintf(buf, "Vulnerable\n"); + } +} + +enum mitigation_state arm64_get_spectre_v4_state(void) +{ + return spectre_v4_state; +} + +static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void) +{ + static const struct midr_range spectre_v4_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER), + MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER), + { /* sentinel */ }, + }; + + if (is_midr_in_range_list(read_cpuid_id(), spectre_v4_safe_list)) + return SPECTRE_UNAFFECTED; + + /* CPU features are detected first */ + if (this_cpu_has_cap(ARM64_SSBS)) + return SPECTRE_MITIGATED; + + return SPECTRE_VULNERABLE; +} + +static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void) +{ + int ret; + struct arm_smccc_res res; + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_2, &res); + + ret = res.a0; + switch (ret) { + case SMCCC_RET_SUCCESS: + return SPECTRE_MITIGATED; + case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED: + fallthrough; + case SMCCC_RET_NOT_REQUIRED: + return SPECTRE_UNAFFECTED; + default: + fallthrough; + case SMCCC_RET_NOT_SUPPORTED: + return SPECTRE_VULNERABLE; + } +} + +bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope) +{ + enum mitigation_state state; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_get_cpu_fw_mitigation_state(); + + return state != SPECTRE_UNAFFECTED; +} + +static int ssbs_emulation_handler(struct pt_regs *regs, u32 instr) +{ + if (user_mode(regs)) + return 1; + + if (instr & BIT(PSTATE_Imm_shift)) + regs->pstate |= PSR_SSBS_BIT; + else + regs->pstate &= ~PSR_SSBS_BIT; + + arm64_skip_faulting_instruction(regs, 4); + return 0; +} + +static struct undef_hook ssbs_emulation_hook = { + .instr_mask = ~(1U << PSTATE_Imm_shift), + .instr_val = 0xd500401f | PSTATE_SSBS, + .fn = ssbs_emulation_handler, +}; + +static enum mitigation_state spectre_v4_enable_hw_mitigation(void) +{ + static bool undef_hook_registered = false; + static DEFINE_RAW_SPINLOCK(hook_lock); + enum mitigation_state state; + + /* + * If the system is mitigated but this CPU doesn't have SSBS, then + * we must be on the safelist and there's nothing more to do. 
+ */ + state = spectre_v4_get_cpu_hw_mitigation_state(); + if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS)) + return state; + + raw_spin_lock(&hook_lock); + if (!undef_hook_registered) { + register_undef_hook(&ssbs_emulation_hook); + undef_hook_registered = true; + } + raw_spin_unlock(&hook_lock); + + if (spectre_v4_mitigations_off()) { + sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); + asm volatile(SET_PSTATE_SSBS(1)); + return SPECTRE_VULNERABLE; + } + + /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ + asm volatile(SET_PSTATE_SSBS(0)); + return SPECTRE_MITIGATED; } /* - * prctl interface for SSBD + * Patch a branch over the Spectre-v4 mitigation code with a NOP so that + * we fallthrough and check whether firmware needs to be called on this CPU. + */ +void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); /* Branch -> NOP */ + + if (spectre_v4_mitigations_off()) + return; + + if (cpus_have_final_cap(ARM64_SSBS)) + return; + + if (spectre_v4_mitigations_dynamic()) + *updptr = cpu_to_le32(aarch64_insn_gen_nop()); +} + +/* + * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction + * to call into firmware to adjust the mitigation state. + */ +void __init spectre_v4_patch_fw_mitigation_conduit(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + u32 insn; + + BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */ + + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + insn = aarch64_insn_get_hvc_value(); + break; + case SMCCC_CONDUIT_SMC: + insn = aarch64_insn_get_smc_value(); + break; + default: + return; + } + + *updptr = cpu_to_le32(insn); +} + +static enum mitigation_state spectre_v4_enable_fw_mitigation(void) +{ + enum mitigation_state state; + + state = spectre_v4_get_cpu_fw_mitigation_state(); + if (state != SPECTRE_MITIGATED) + return state; + + if (spectre_v4_mitigations_off()) { + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL); + return SPECTRE_VULNERABLE; + } + + arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL); + + if (spectre_v4_mitigations_dynamic()) + __this_cpu_write(arm64_ssbd_callback_required, 1); + + return SPECTRE_MITIGATED; +} + +void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused) +{ + enum mitigation_state state; + + WARN_ON(preemptible()); + + state = spectre_v4_enable_hw_mitigation(); + if (state == SPECTRE_VULNERABLE) + state = spectre_v4_enable_fw_mitigation(); + + update_mitigation_state(&spectre_v4_state, state); +} + +static void __update_pstate_ssbs(struct pt_regs *regs, bool state) +{ + u64 bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT; + + if (state) + regs->pstate |= bit; + else + regs->pstate &= ~bit; +} + +void spectre_v4_enable_task_mitigation(struct task_struct *tsk) +{ + struct pt_regs *regs = task_pt_regs(tsk); + bool ssbs = false, kthread = tsk->flags & PF_KTHREAD; + + if (spectre_v4_mitigations_off()) + ssbs = true; + else if (spectre_v4_mitigations_dynamic() && !kthread) + ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD); + + __update_pstate_ssbs(regs, ssbs); +} + +/* + * The Spectre-v4 mitigation can be controlled via a prctl() from userspace. + * This is interesting because the "speculation disabled" behaviour can be + * configured so that it is preserved across exec(), which means that the + * prctl() may be necessary even when PSTATE.SSBS can be toggled directly + * from userspace. 
*/ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) { - int state = arm64_get_ssbd_state(); - - /* Unsupported */ - if (state == ARM64_SSBD_UNKNOWN) - return -ENODEV; - - /* Treat the unaffected/mitigated state separately */ - if (state == ARM64_SSBD_MITIGATED) { - switch (ctrl) { - case PR_SPEC_ENABLE: - return -EPERM; - case PR_SPEC_DISABLE: - case PR_SPEC_FORCE_DISABLE: - return 0; - } - } - - /* - * Things are a bit backward here: the arm64 internal API - * *enables the mitigation* when the userspace API *disables - * speculation*. So much fun. - */ switch (ctrl) { case PR_SPEC_ENABLE: - /* If speculation is force disabled, enable is not allowed */ - if (state == ARM64_SSBD_FORCE_ENABLE || - task_spec_ssb_force_disable(task)) + /* Enable speculation: disable mitigation */ + /* + * Force disabled speculation prevents it from being + * re-enabled. + */ + if (task_spec_ssb_force_disable(task)) return -EPERM; + + /* + * If the mitigation is forced on, then speculation is forced + * off and we again prevent it from being re-enabled. + */ + if (spectre_v4_mitigations_on()) + return -EPERM; + task_clear_spec_ssb_disable(task); clear_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_enable(task); - break; - case PR_SPEC_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) - return -EPERM; - task_set_spec_ssb_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); break; case PR_SPEC_FORCE_DISABLE: - if (state == ARM64_SSBD_FORCE_DISABLE) + /* Force disable speculation: force enable mitigation */ + /* + * If the mitigation is forced off, then speculation is forced + * on and we prevent it from being disabled. + */ + if (spectre_v4_mitigations_off()) return -EPERM; - task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + fallthrough; + case PR_SPEC_DISABLE: + /* Disable speculation: enable mitigation */ + /* Same as PR_SPEC_FORCE_DISABLE */ + if (spectre_v4_mitigations_off()) + return -EPERM; + + task_set_spec_ssb_disable(task); set_tsk_thread_flag(task, TIF_SSBD); - ssbd_ssbs_disable(task); break; default: return -ERANGE; } + spectre_v4_enable_task_mitigation(task); return 0; } @@ -409,22 +724,32 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, static int ssbd_prctl_get(struct task_struct *task) { - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_UNKNOWN: - return -ENODEV; - case ARM64_SSBD_FORCE_ENABLE: - return PR_SPEC_DISABLE; - case ARM64_SSBD_KERNEL: - if (task_spec_ssb_force_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; - if (task_spec_ssb_disable(task)) - return PR_SPEC_PRCTL | PR_SPEC_DISABLE; - return PR_SPEC_PRCTL | PR_SPEC_ENABLE; - case ARM64_SSBD_FORCE_DISABLE: - return PR_SPEC_ENABLE; - default: + switch (spectre_v4_state) { + case SPECTRE_UNAFFECTED: return PR_SPEC_NOT_AFFECTED; + case SPECTRE_MITIGATED: + if (spectre_v4_mitigations_on()) + return PR_SPEC_NOT_AFFECTED; + + if (spectre_v4_mitigations_dynamic()) + break; + + /* Mitigations are disabled, so we're vulnerable. 
*/ + fallthrough; + case SPECTRE_VULNERABLE: + fallthrough; + default: + return PR_SPEC_ENABLE; } + + /* Check the mitigation state for this task */ + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index c1dee9066ff9..584c14ce3c86 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -72,8 +72,7 @@ void notrace __cpu_suspend_exit(void) * have turned the mitigation on. If the user has forcefully * disabled it, make sure their wishes are obeyed. */ - if (arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) - arm64_set_ssbd_mitigation(false); + spectre_v4_enable_mitigation(NULL); } /* From 29e8910a566aad3ee72f729e45842858d51ced8d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Sep 2020 12:25:40 +0100 Subject: [PATCH 129/148] KVM: arm64: Simplify handling of ARCH_WORKAROUND_2 Owing to the fact that the host kernel is always mitigated, we can drastically simplify the WA2 handling by keeping the mitigation state ON when entering the guest. This means the guest is either unaffected or not mitigated. This results in a nice simplification of the mitigation space, and the removal of a lot of code that was never really used anyway. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_asm.h | 3 -- arch/arm64/include/asm/kvm_emulate.h | 14 ------ arch/arm64/include/asm/kvm_mmu.h | 17 ------- arch/arm64/include/uapi/asm/kvm.h | 9 ++++ arch/arm64/kernel/cpu_errata.c | 14 ------ arch/arm64/kernel/image-vars.h | 2 - arch/arm64/kvm/arm.c | 4 -- arch/arm64/kvm/hyp/hyp-entry.S | 27 ----------- arch/arm64/kvm/hyp/include/hyp/switch.h | 29 ------------ arch/arm64/kvm/hyp/nvhe/switch.c | 4 -- arch/arm64/kvm/hyp/vhe/switch.c | 4 -- arch/arm64/kvm/hypercalls.c | 14 +++--- arch/arm64/kvm/psci.c | 59 +++++++++++-------------- arch/arm64/kvm/reset.c | 4 -- 14 files changed, 41 insertions(+), 163 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index e9378cc8049d..abe02cf66880 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -9,9 +9,6 @@ #include -#define VCPU_WORKAROUND_2_FLAG_SHIFT 0 -#define VCPU_WORKAROUND_2_FLAG (_AC(1, UL) << VCPU_WORKAROUND_2_FLAG_SHIFT) - #define ARM_EXIT_WITH_SERROR_BIT 31 #define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_SERROR_BIT)) #define ARM_EXCEPTION_IS_TRAP(x) (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_TRAP) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 49a55be2b9a2..96eccb107ec2 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -383,20 +383,6 @@ static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; } -static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG; -} - -static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu, - bool flag) -{ - if (flag) - vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG; - else - vcpu->arch.workaround_flags &= ~VCPU_WORKAROUND_2_FLAG; -} - static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) { if 
(vcpu_mode_is_32bit(vcpu)) { diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 873e12430ac7..36606ef9e435 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -526,23 +526,6 @@ static inline int kvm_map_vectors(void) } #endif -DECLARE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); - -static inline int hyp_map_aux_data(void) -{ - int cpu, err; - - for_each_possible_cpu(cpu) { - u64 *ptr; - - ptr = per_cpu_ptr(&arm64_ssbd_callback_required, cpu); - err = create_hyp_mappings(ptr, ptr + 1, PAGE_HYP); - if (err) - return err; - } - return 0; -} - #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) /* diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index ba85bb23f060..7d804fd0a682 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -242,6 +242,15 @@ struct kvm_vcpu_events { #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 + +/* + * Only two states can be presented by the host kernel: + * - NOT_REQUIRED: the guest doesn't need to do anything + * - NOT_AVAIL: the guest isn't mitigated (it can still use SSBS if available) + * + * All the other values are deprecated. The host still accepts all + * values (they are ABI), but will narrow them to the above two. + */ #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 #define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7fc54c3d4285..7e9caef13db4 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -108,20 +108,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN; -void __init arm64_enable_wa2_handling(struct alt_instr *alt, - __le32 *origptr, __le32 *updptr, - int nr_inst) -{ - BUG_ON(nr_inst != 1); - /* - * Only allow mitigation on EL1 entry/exit and guest - * ARCH_WORKAROUND_2 handling if the SSBD state allows it to - * be flipped. - */ - if (arm64_get_ssbd_state() == ARM64_SSBD_KERNEL) - *updptr = cpu_to_le32(aarch64_insn_gen_nop()); -} - #ifdef CONFIG_ARM64_ERRATUM_1463225 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8982b68289b7..d0f3f35dd0d7 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -64,12 +64,10 @@ __efistub__ctype = _ctype; #define KVM_NVHE_ALIAS(sym) __kvm_nvhe_##sym = sym; /* Alternative callbacks for init-time patching of nVHE hyp code. */ -KVM_NVHE_ALIAS(arm64_enable_wa2_handling); KVM_NVHE_ALIAS(kvm_patch_vector_branch); KVM_NVHE_ALIAS(kvm_update_va_mask); /* Global kernel state accessed by nVHE hyp code. 
*/ -KVM_NVHE_ALIAS(arm64_ssbd_callback_required); KVM_NVHE_ALIAS(kvm_host_data); KVM_NVHE_ALIAS(kvm_vgic_global_state); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 46dc3d75cf13..b1bd2f4cd3e0 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1549,10 +1549,6 @@ static int init_hyp_mode(void) } } - err = hyp_map_aux_data(); - if (err) - kvm_err("Cannot map host auxiliary data: %d\n", err); - return 0; out_err: diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index db2dd7500617..3cca756e9e41 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -116,33 +116,6 @@ el1_hvc_guest: ARM_SMCCC_ARCH_WORKAROUND_2) cbnz w1, el1_trap -alternative_cb arm64_enable_wa2_handling - b wa2_end -alternative_cb_end - get_vcpu_ptr x2, x0 - ldr x0, [x2, #VCPU_WORKAROUND_FLAGS] - - // Sanitize the argument and update the guest flags - ldr x1, [sp, #8] // Guest's x1 - clz w1, w1 // Murphy's device: - lsr w1, w1, #5 // w1 = !!w1 without using - eor w1, w1, #1 // the flags... - bfi x0, x1, #VCPU_WORKAROUND_2_FLAG_SHIFT, #1 - str x0, [x2, #VCPU_WORKAROUND_FLAGS] - - /* Check that we actually need to perform the call */ - hyp_ldr_this_cpu x0, arm64_ssbd_callback_required, x2 - cbz x0, wa2_end - - mov w0, #ARM_SMCCC_ARCH_WORKAROUND_2 - smc #0 - - /* Don't leak data from the SMC call */ - mov x3, xzr -wa2_end: - mov x2, xzr - mov x1, xzr - wa_epilogue: mov x0, xzr add sp, sp, #16 diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 5dec5589625d..a6840823b60e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -479,35 +479,6 @@ exit: return false; } -static inline bool __needs_ssbd_off(struct kvm_vcpu *vcpu) -{ - if (!cpus_have_final_cap(ARM64_SPECTRE_V4)) - return false; - - return !(vcpu->arch.workaround_flags & VCPU_WORKAROUND_2_FLAG); -} - -static inline void __set_guest_arch_workaround_state(struct kvm_vcpu *vcpu) -{ - /* - * The host runs with the workaround always present. If the - * guest wants it disabled, so be it... - */ - if (__needs_ssbd_off(vcpu) && - __hyp_this_cpu_read(arm64_ssbd_callback_required)) - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 0, NULL); -} - -static inline void __set_host_arch_workaround_state(struct kvm_vcpu *vcpu) -{ - /* - * If the guest has disabled the workaround, bring it back on. - */ - if (__needs_ssbd_off(vcpu) && - __hyp_this_cpu_read(arm64_ssbd_callback_required)) - arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_2, 1, NULL); -} - static inline void __kvm_unexpected_el2_exception(void) { unsigned long addr, fixup; diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 0970442d2dbc..8d3dd4f47924 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -202,8 +202,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __debug_switch_to_guest(vcpu); - __set_guest_arch_workaround_state(vcpu); - do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu, host_ctxt); @@ -211,8 +209,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) /* And we're baaack! 
*/ } while (fixup_guest_exit(vcpu, &exit_code)); - __set_host_arch_workaround_state(vcpu); - __sysreg_save_state_nvhe(guest_ctxt); __sysreg32_save_state(vcpu); __timer_disable_traps(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c1da4f86ccac..ecf67e678203 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -131,8 +131,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); - __set_guest_arch_workaround_state(vcpu); - do { /* Jump in the fire! */ exit_code = __guest_enter(vcpu, host_ctxt); @@ -140,8 +138,6 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) /* And we're baaack! */ } while (fixup_guest_exit(vcpu, &exit_code)); - __set_host_arch_workaround_state(vcpu); - sysreg_save_guest_state_vhe(guest_ctxt); __deactivate_traps(vcpu); diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 413d46b9bc07..69e023dfafce 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -36,15 +36,13 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) } break; case ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - case KVM_SSBD_UNKNOWN: + switch (arm64_get_ssbd_state()) { + case ARM64_SSBD_FORCE_DISABLE: + case ARM64_SSBD_UNKNOWN: break; - case KVM_SSBD_KERNEL: - val = SMCCC_RET_SUCCESS; - break; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: + case ARM64_SSBD_KERNEL: + case ARM64_SSBD_FORCE_ENABLE: + case ARM64_SSBD_MITIGATED: val = SMCCC_RET_NOT_REQUIRED; break; } diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index fbdd6f3bea7f..87e6e3818fb5 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -435,17 +435,15 @@ static int get_kernel_wa_level(u64 regid) } return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - switch (kvm_arm_have_ssbd()) { - case KVM_SSBD_FORCE_DISABLE: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; - case KVM_SSBD_KERNEL: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL; - case KVM_SSBD_FORCE_ENABLE: - case KVM_SSBD_MITIGATED: + switch (arm64_get_ssbd_state()) { + case ARM64_SSBD_FORCE_ENABLE: + case ARM64_SSBD_MITIGATED: + case ARM64_SSBD_KERNEL: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - case KVM_SSBD_UNKNOWN: + case ARM64_SSBD_UNKNOWN: + case ARM64_SSBD_FORCE_DISABLE: default: - return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN; + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; } } @@ -462,14 +460,8 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) val = kvm_psci_version(vcpu, vcpu->kvm); break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; - - if (val == KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && - kvm_arm_get_vcpu_workaround_2_flag(vcpu)) - val |= KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED; break; default: return -ENOENT; @@ -527,34 +519,35 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED)) return -EINVAL; - wa_level = val & KVM_REG_FEATURE_LEVEL_MASK; - - if (get_kernel_wa_level(reg->id) < wa_level) - return -EINVAL; - /* The enabled bit must not be set unless the level is AVAIL. 
*/ - if (wa_level != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL && - wa_level != val) + if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) && + (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL) return -EINVAL; - /* Are we finished or do we need to check the enable bit ? */ - if (kvm_arm_have_ssbd() != KVM_SSBD_KERNEL) - return 0; - /* - * If this kernel supports the workaround to be switched on - * or off, make sure it matches the requested setting. + * Map all the possible incoming states to the only two we + * really want to deal with. */ - switch (wa_level) { + switch (val & KVM_REG_FEATURE_LEVEL_MASK) { + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: + case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: - kvm_arm_set_vcpu_workaround_2_flag(vcpu, - val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED); - break; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: - kvm_arm_set_vcpu_workaround_2_flag(vcpu, true); + wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; break; + default: + return -EINVAL; } + /* + * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the + * other way around. + */ + if (get_kernel_wa_level(reg->id) < wa_level) + return -EINVAL; + return 0; default: return -ENOENT; diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index ee33875c5c2a..f6e8b4a75cbb 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -319,10 +319,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.reset_state.reset = false; } - /* Default workaround setup is enabled (if supported) */ - if (kvm_arm_have_ssbd() == KVM_SSBD_KERNEL) - vcpu->arch.workaround_flags |= VCPU_WORKAROUND_2_FLAG; - /* Reset timer */ ret = kvm_timer_vcpu_reset(vcpu); out: From 7311467702710cc30ac4e3a6c6670a766e7667f9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Sep 2020 13:59:32 +0100 Subject: [PATCH 130/148] KVM: arm64: Get rid of kvm_arm_have_ssbd() kvm_arm_have_ssbd() is now completely unused, get rid of it. 
Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_host.h | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 59f6cff0f0cb..8785ebf9bc29 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -631,29 +631,6 @@ static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} #endif -#define KVM_SSBD_UNKNOWN -1 -#define KVM_SSBD_FORCE_DISABLE 0 -#define KVM_SSBD_KERNEL 1 -#define KVM_SSBD_FORCE_ENABLE 2 -#define KVM_SSBD_MITIGATED 3 - -static inline int kvm_arm_have_ssbd(void) -{ - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_DISABLE: - return KVM_SSBD_FORCE_DISABLE; - case ARM64_SSBD_KERNEL: - return KVM_SSBD_KERNEL; - case ARM64_SSBD_FORCE_ENABLE: - return KVM_SSBD_FORCE_ENABLE; - case ARM64_SSBD_MITIGATED: - return KVM_SSBD_MITIGATED; - case ARM64_SSBD_UNKNOWN: - default: - return KVM_SSBD_UNKNOWN; - } -} - void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu); From d63d975a71b332df36cc802e6e77a462af6b9fef Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Sep 2020 14:08:54 +0100 Subject: [PATCH 131/148] KVM: arm64: Convert ARCH_WORKAROUND_2 to arm64_get_spectre_v4_state() Convert the KVM WA2 code to using the Spectre infrastructure, making the code much more readable. It also allows us to take SSBS into account for the mitigation. Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/hypercalls.c | 23 +++++++++++++++++------ arch/arm64/kvm/psci.c | 19 ++++++++++++------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b1bd2f4cd3e0..9b90a701462c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1292,7 +1292,7 @@ static void cpu_init_hyp_mode(void) * at EL2. */ if (this_cpu_has_cap(ARM64_SSBS) && - arm64_get_ssbd_state() == ARM64_SSBD_FORCE_DISABLE) { + arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) { kvm_call_hyp_nvhe(__kvm_enable_ssbs); } } diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 69e023dfafce..9824025ccc5c 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -36,13 +36,24 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) } break; case ARM_SMCCC_ARCH_WORKAROUND_2: - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_DISABLE: - case ARM64_SSBD_UNKNOWN: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_VULNERABLE: break; - case ARM64_SSBD_KERNEL: - case ARM64_SSBD_FORCE_ENABLE: - case ARM64_SSBD_MITIGATED: + case SPECTRE_MITIGATED: + /* + * SSBS everywhere: Indicate no firmware + * support, as the SSBS support will be + * indicated to the guest and the default is + * safe. + * + * Otherwise, expose a permanent mitigation + * to the guest, and hide SSBS so that the + * guest stays protected. 
+ */ + if (cpus_have_final_cap(ARM64_SSBS)) + break; + fallthrough; + case SPECTRE_UNAFFECTED: val = SMCCC_RET_NOT_REQUIRED; break; } diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index 87e6e3818fb5..db4056ecccfd 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -435,14 +435,19 @@ static int get_kernel_wa_level(u64 regid) } return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL; case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: - switch (arm64_get_ssbd_state()) { - case ARM64_SSBD_FORCE_ENABLE: - case ARM64_SSBD_MITIGATED: - case ARM64_SSBD_KERNEL: + switch (arm64_get_spectre_v4_state()) { + case SPECTRE_MITIGATED: + /* + * As for the hypercall discovery, we pretend we + * don't have any FW mitigation if SSBS is there at + * all times. + */ + if (cpus_have_final_cap(ARM64_SSBS)) + return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; + fallthrough; + case SPECTRE_UNAFFECTED: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED; - case ARM64_SSBD_UNKNOWN: - case ARM64_SSBD_FORCE_DISABLE: - default: + case SPECTRE_VULNERABLE: return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; } } From 31c84d6c9cde673b3866972552ec52e4c522b4fa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Sep 2020 14:11:25 +0100 Subject: [PATCH 132/148] arm64: Get rid of arm64_ssbd_state Out with the old ghost, in with the new... Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpufeature.h | 14 -------------- arch/arm64/kernel/cpu_errata.c | 2 -- 2 files changed, 16 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 3b48aa121cee..fba6700b457b 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -698,20 +698,6 @@ static inline bool system_supports_tlb_range(void) cpus_have_const_cap(ARM64_HAS_TLB_RANGE); } -#define ARM64_SSBD_UNKNOWN -1 -#define ARM64_SSBD_FORCE_DISABLE 0 -#define ARM64_SSBD_KERNEL 1 -#define ARM64_SSBD_FORCE_ENABLE 2 -#define ARM64_SSBD_MITIGATED 3 - -static inline int arm64_get_ssbd_state(void) -{ - extern int ssbd_state; - return ssbd_state; -} - -void arm64_set_ssbd_mitigation(bool state); - extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 7e9caef13db4..6c8303559beb 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -106,8 +106,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *cap) sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0); } -int ssbd_state __read_mostly = ARM64_SSBD_UNKNOWN; - #ifdef CONFIG_ARM64_ERRATUM_1463225 DEFINE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa); From 9ef2b48be9bba70ded9372fc81e678e707306060 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 28 Sep 2020 11:45:24 +0100 Subject: [PATCH 133/148] KVM: arm64: Allow patching EL2 vectors even with KASLR is not enabled Patching the EL2 exception vectors is integral to the Spectre-v2 workaround, where it can be necessary to execute CPU-specific sequences to nobble the branch predictor before running the hypervisor text proper. Remove the dependency on CONFIG_RANDOMIZE_BASE and allow the EL2 vectors to be patched even when KASLR is not enabled. 
Fixes: 7a132017e7a5 ("KVM: arm64: Replace CONFIG_KVM_INDIRECT_VECTORS with CONFIG_RANDOMIZE_BASE") Reported-by: kernel test robot Link: https://lore.kernel.org/r/202009221053.Jv1XsQUZ%lkp@intel.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_asm.h | 2 -- arch/arm64/include/asm/kvm_mmu.h | 51 +------------------------------- arch/arm64/kernel/proton-pack.c | 2 -- arch/arm64/kvm/arm.c | 34 +++++++++++++++++++++ arch/arm64/kvm/hyp/Makefile | 3 +- arch/arm64/kvm/hyp/hyp-entry.S | 2 -- 6 files changed, 36 insertions(+), 58 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index abe02cf66880..7f7072f6cb45 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -99,11 +99,9 @@ DECLARE_KVM_HYP_SYM(__kvm_hyp_vector); #define __kvm_hyp_init CHOOSE_NVHE_SYM(__kvm_hyp_init) #define __kvm_hyp_vector CHOOSE_HYP_SYM(__kvm_hyp_vector) -#ifdef CONFIG_RANDOMIZE_BASE extern atomic_t arm64_el2_vector_last_slot; DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs); #define __bp_harden_hyp_vecs CHOOSE_HYP_SYM(__bp_harden_hyp_vecs) -#endif extern void __kvm_flush_vm_context(void); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 36606ef9e435..cff1cebc7590 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -9,6 +9,7 @@ #include #include +#include #include /* @@ -430,7 +431,6 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, return ret; } -#ifdef CONFIG_RANDOMIZE_BASE /* * EL2 vectors can be mapped and rerouted in a number of ways, * depending on the kernel configuration and CPU present: @@ -451,12 +451,9 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, * VHE, as we don't have hypervisor-specific mappings. If the system * is VHE and yet selects this capability, it will be ignored. */ -#include - extern void *__kvm_bp_vect_base; extern int __kvm_harden_el2_vector_slot; -/* This is called on both VHE and !VHE systems */ static inline void *kvm_get_hyp_vector(void) { struct bp_hardening_data *data = arm64_get_bp_hardening_data(); @@ -480,52 +477,6 @@ static inline void *kvm_get_hyp_vector(void) return vect; } -/* This is only called on a !VHE system */ -static inline int kvm_map_vectors(void) -{ - /* - * SV2 = ARM64_SPECTRE_V2 - * HEL2 = ARM64_HARDEN_EL2_VECTORS - * - * !SV2 + !HEL2 -> use direct vectors - * SV2 + !HEL2 -> use hardened vectors in place - * !SV2 + HEL2 -> allocate one vector slot and use exec mapping - * SV2 + HEL2 -> use hardened vertors and use exec mapping - */ - if (cpus_have_const_cap(ARM64_SPECTRE_V2)) { - __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs); - __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); - } - - if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { - phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs); - unsigned long size = __BP_HARDEN_HYP_VECS_SZ; - - /* - * Always allocate a spare vector slot, as we don't - * know yet which CPUs have a BP hardening slot that - * we can reuse. 
- */ - __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot); - BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS); - return create_hyp_exec_mappings(vect_pa, size, - &__kvm_bp_vect_base); - } - - return 0; -} -#else -static inline void *kvm_get_hyp_vector(void) -{ - return kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); -} - -static inline int kvm_map_vectors(void) -{ - return 0; -} -#endif - #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) /* diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index b1ea935fd948..1fbaa0240d4c 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -177,7 +177,6 @@ enum mitigation_state arm64_get_spectre_v2_state(void) } #ifdef CONFIG_KVM -#ifdef CONFIG_RANDOMIZE_BASE #include #include @@ -235,7 +234,6 @@ static void install_bp_hardening_cb(bp_hardening_cb_t fn) { __this_cpu_write(bp_hardening_data.fn, fn); } -#endif /* CONFIG_RANDOMIZE_BASE */ #endif /* CONFIG_KVM */ static void call_smc_arch_workaround_1(void) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 9b90a701462c..13e559ac7235 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1256,6 +1256,40 @@ long kvm_arch_vm_ioctl(struct file *filp, } } +static int kvm_map_vectors(void) +{ + /* + * SV2 = ARM64_SPECTRE_V2 + * HEL2 = ARM64_HARDEN_EL2_VECTORS + * + * !SV2 + !HEL2 -> use direct vectors + * SV2 + !HEL2 -> use hardened vectors in place + * !SV2 + HEL2 -> allocate one vector slot and use exec mapping + * SV2 + HEL2 -> use hardened vectors and use exec mapping + */ + if (cpus_have_const_cap(ARM64_SPECTRE_V2)) { + __kvm_bp_vect_base = kvm_ksym_ref(__bp_harden_hyp_vecs); + __kvm_bp_vect_base = kern_hyp_va(__kvm_bp_vect_base); + } + + if (cpus_have_const_cap(ARM64_HARDEN_EL2_VECTORS)) { + phys_addr_t vect_pa = __pa_symbol(__bp_harden_hyp_vecs); + unsigned long size = __BP_HARDEN_HYP_VECS_SZ; + + /* + * Always allocate a spare vector slot, as we don't + * know yet which CPUs have a BP hardening slot that + * we can reuse. 
+ */ + __kvm_harden_el2_vector_slot = atomic_inc_return(&arm64_el2_vector_last_slot); + BUG_ON(__kvm_harden_el2_vector_slot >= BP_HARDEN_EL2_SLOTS); + return create_hyp_exec_mappings(vect_pa, size, + &__kvm_bp_vect_base); + } + + return 0; +} + static void cpu_init_hyp_mode(void) { phys_addr_t pgd_ptr; diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 89d2bf73acb5..d898f0da5802 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -10,5 +10,4 @@ subdir-ccflags-y := -I$(incdir) \ -DDISABLE_BRANCH_PROFILING \ $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_KVM) += vhe/ nvhe/ -obj-$(CONFIG_RANDOMIZE_BASE) += smccc_wa.o +obj-$(CONFIG_KVM) += vhe/ nvhe/ smccc_wa.o diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 3cca756e9e41..7ea277b82967 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -259,7 +259,6 @@ SYM_CODE_START(__kvm_hyp_vector) valid_vect el1_error // Error 32-bit EL1 SYM_CODE_END(__kvm_hyp_vector) -#ifdef CONFIG_RANDOMIZE_BASE .macro hyp_ventry .align 7 1: esb @@ -309,4 +308,3 @@ SYM_CODE_START(__bp_harden_hyp_vecs) 1: .org __bp_harden_hyp_vecs + __BP_HARDEN_HYP_VECS_SZ .org 1b SYM_CODE_END(__bp_harden_hyp_vecs) -#endif From 5c8b0cbd9d6bac5f40943b5a7d8eac8cb86cbe7f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 28 Sep 2020 14:06:50 +0100 Subject: [PATCH 134/148] arm64: Pull in task_stack_page() to Spectre-v4 mitigation code The kbuild robot reports that we're relying on an implicit inclusion to get a definition of task_stack_page() in the Spectre-v4 mitigation code, which is not always in place for some configurations: | arch/arm64/kernel/proton-pack.c:329:2: error: implicit declaration of function 'task_stack_page' [-Werror,-Wimplicit-function-declaration] | task_pt_regs(task)->pstate |= val; | ^ | arch/arm64/include/asm/processor.h:268:36: note: expanded from macro 'task_pt_regs' | ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) | ^ | arch/arm64/kernel/proton-pack.c:329:2: note: did you mean 'task_spread_page'? Add the missing include to fix the build error. Fixes: a44acf477220 ("arm64: Move SSBD prctl() handler alongside other spectre mitigation code") Reported-by: Anthony Steinhauser Reported-by: kernel test robot Link: https://lore.kernel.org/r/202009260013.Ul7AD29w%lkp@intel.com Signed-off-by: Will Deacon --- arch/arm64/kernel/proton-pack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 1fbaa0240d4c..16c9a095a746 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include From 780c083a8f840ca9162c7a4090ff5e10d15152a2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 28 Sep 2020 14:03:00 +0100 Subject: [PATCH 135/148] arm64: Add support for PR_SPEC_DISABLE_NOEXEC prctl() option The PR_SPEC_DISABLE_NOEXEC option to the PR_SPEC_STORE_BYPASS prctl() allows the SSB mitigation to be enabled only until the next execve(), at which point the state will revert back to PR_SPEC_ENABLE and the mitigation will be disabled. Add support for PR_SPEC_DISABLE_NOEXEC on arm64. 
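
As an illustration only (not part of this patch), a task could request the
execve()-scoped mitigation through the generic speculation-control prctl().
The sketch below assumes the PR_SPEC_* and PR_{SET,GET}_SPECULATION_CTRL
definitions from the uapi <linux/prctl.h>; the function name and error
handling are purely illustrative:

  #include <errno.h>
  #include <stdio.h>
  #include <sys/prctl.h>
  #include <linux/prctl.h>

  /* Disable speculative store bypass for this task until the next execve() */
  static int ssb_disable_until_execve(void)
  {
          if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
                    PR_SPEC_DISABLE_NOEXEC, 0, 0)) {
                  perror("PR_SPEC_DISABLE_NOEXEC");
                  return -errno;
          }

          /* Reports PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC while armed */
          return prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, 0, 0, 0);
  }

With this patch applied, the reported state reverts to PR_SPEC_ENABLE once
the task calls execve(), since arch_setup_new_exec() re-enables speculation
for the new image.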
Reported-by: Anthony Steinhauser Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 6 ++++++ arch/arm64/kernel/proton-pack.c | 38 +++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 9dbd35b95253..085d8ca39e47 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -609,6 +610,11 @@ void arch_setup_new_exec(void) current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; ptrauth_thread_init_user(current); + + if (task_spec_ssb_noexec(current)) { + arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, + PR_SPEC_ENABLE); + } } #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 16c9a095a746..68b710f1b43f 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -660,6 +660,20 @@ void spectre_v4_enable_task_mitigation(struct task_struct *tsk) * prctl() may be necessary even when PSTATE.SSBS can be toggled directly * from userspace. */ +static void ssbd_prctl_enable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_set_spec_ssb_disable(task); + set_tsk_thread_flag(task, TIF_SSBD); +} + +static void ssbd_prctl_disable_mitigation(struct task_struct *task) +{ + task_clear_spec_ssb_noexec(task); + task_clear_spec_ssb_disable(task); + clear_tsk_thread_flag(task, TIF_SSBD); +} + static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { @@ -679,8 +693,7 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v4_mitigations_on()) return -EPERM; - task_clear_spec_ssb_disable(task); - clear_tsk_thread_flag(task, TIF_SSBD); + ssbd_prctl_disable_mitigation(task); break; case PR_SPEC_FORCE_DISABLE: /* Force disable speculation: force enable mitigation */ @@ -699,8 +712,22 @@ static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl) if (spectre_v4_mitigations_off()) return -EPERM; - task_set_spec_ssb_disable(task); - set_tsk_thread_flag(task, TIF_SSBD); + ssbd_prctl_enable_mitigation(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + /* Disable speculation until execve(): enable mitigation */ + /* + * If the mitigation state is forced one way or the other, then + * we must fail now before we try to toggle it on execve(). + */ + if (task_spec_ssb_force_disable(task) || + spectre_v4_mitigations_off() || + spectre_v4_mitigations_on()) { + return -EPERM; + } + + ssbd_prctl_enable_mitigation(task); + task_set_spec_ssb_noexec(task); break; default: return -ERANGE; @@ -745,6 +772,9 @@ static int ssbd_prctl_get(struct task_struct *task) if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; + if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; From 6a1bdb173f9967b2329aab0f25bcba963f54e06b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Sep 2020 13:20:40 +0100 Subject: [PATCH 136/148] arm64: mm: Make flush_tlb_fix_spurious_fault() a no-op Our use of broadcast TLB maintenance means that spurious page-faults that have been handled already by another CPU do not require additional TLB maintenance. Make flush_tlb_fix_spurious_fault() a no-op and rely on the existing TLB invalidation instead. 
Add an explicit flush_tlb_page() when making a page dirty, as the TLB is permitted to cache the old read-only entry. Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20200728092220.GA21800@willie-the-truck Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 8 ++++++++ arch/arm64/mm/fault.c | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bc68da9f5706..02ad3105c14c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -50,6 +50,14 @@ extern void __pgd_error(const char *file, int line, unsigned long val); __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* + * Outside of a few very special situations (e.g. hibernation), we always + * use broadcast TLB invalidation instructions, therefore a spurious page + * fault on one CPU which has been handled concurrently by another CPU + * does not need to perform additional invalidation. + */ +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f07333e86c2f..a696a7921da4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -218,7 +218,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); } while (pteval != old_pteval); - flush_tlb_fix_spurious_fault(vma, address); + /* Invalidate a stale read-only entry */ + if (dirty) + flush_tlb_page(vma, address); return 1; } From 80d6b466679c3dced3b359a6379c6d913de39afd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 1 Oct 2020 09:48:21 +0100 Subject: [PATCH 137/148] arm64: dbm: Invalidate local TLB when setting TCR_EL1.HD TCR_EL1.HD is permitted to be cached in a TLB, so invalidate the local TLB after setting the bit when detected support for the feature. Although this isn't strictly necessary, since we can happily operate with the bit effectively clear, the current code uses an ISB in a half-hearted attempt to make the change effective, so let's just fix that up. Link: https://lore.kernel.org/r/20201001110405.18617-1-will@kernel.org Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/kernel/cpufeature.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6424584be01e..a474a4f39c95 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1443,6 +1443,7 @@ static inline void __cpu_enable_hw_dbm(void) write_sysreg(tcr, tcr_el1); isb(); + local_flush_tlb_all(); } static bool cpu_has_broken_dbm(void) From d9ef632fab9ba81b708763bcbcfdbea9a55c95d2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 1 Oct 2020 11:54:54 +0100 Subject: [PATCH 138/148] perf: arm-cmn: Fix unsigned comparison to less than zero Ensure that the 'irq' field of 'struct arm_cmn_dtc' is a signed int so that it can be compared '< 0'. Link: https://lore.kernel.org/r/20200929170835.GA15956@embeddedor Addresses-Coverity-ID: 1497488 ("Unsigned compared against 0") Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Reported-by: Gustavo A. R. Silva Reviewed-by: Gustavo A. R. 
Silva Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index e824b5b83ea2..cd4da4c5dac0 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -217,7 +217,7 @@ struct arm_cmn_node { struct arm_cmn_dtc { void __iomem *base; - unsigned int irq; + int irq; int irq_friend; bool cc_active; From 887e2cff0f8dc4dac4c35e631b711325c6dd65b8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 1 Oct 2020 11:57:01 +0100 Subject: [PATCH 139/148] perf: arm-cmn: Fix conversion specifiers for node type The node type field is an enum type, so print it as a 32-bit quantity rather than as an unsigned short. Link: https://lore.kernel.org/r/202009302350.QIzfkx62-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Will Deacon --- drivers/perf/arm-cmn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index cd4da4c5dac0..a76ff594f3ca 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1324,7 +1324,7 @@ static void arm_cmn_init_node_info(struct arm_cmn *cmn, u32 offset, struct arm_c else level = 2; - dev_dbg(cmn->dev, "node%*c%#06hx%*ctype:%-#6hx id:%-4hd off:%#x\n", + dev_dbg(cmn->dev, "node%*c%#06hx%*ctype:%-#6x id:%-4hd off:%#x\n", (level * 2) + 1, ' ', node->id, 5 - (level * 2), ' ', node->type, node->logid, offset); } @@ -1430,7 +1430,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset) break; /* Something has gone horribly wrong */ default: - dev_err(cmn->dev, "invalid device node type: 0x%hx\n", dn->type); + dev_err(cmn->dev, "invalid device node type: 0x%x\n", dn->type); return -ENODEV; } } From e9b60476bea058d85f8029e6701d9476f7fdb92f Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:25 +0530 Subject: [PATCH 140/148] kselftest/arm64: Add utilities and a test to validate mte memory This test checks that the memory tag is present after mte allocation and the memory is accessible with those tags. This testcase verifies all sync, async and none mte error reporting mode. The allocated mte buffers are verified for Allocated range (no error expected while accessing buffer), Underflow range, and Overflow range. Different test scenarios covered here are, * Verify that mte memory are accessible at byte/block level. * Force underflow and overflow to occur and check the data consistency. * Check to/from between tagged and untagged memory. * Check that initial allocated memory to have 0 tag. This change also creates the necessary infrastructure to add mte test cases. MTE kselftests can use the several utility functions provided here to add wide variety of mte test scenarios. GCC compiler need flag '-march=armv8.5-a+memtag' so those flags are verified before compilation. 
The mte testcases can be launched with kselftest framework as, make TARGETS=arm64 ARM64_SUBTARGETS=mte kselftest or compiled as, make -C tools/testing/selftests TARGETS=arm64 ARM64_SUBTARGETS=mte CC='compiler' Co-developed-by: Gabor Kertesz Signed-off-by: Gabor Kertesz Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-2-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/Makefile | 2 +- tools/testing/selftests/arm64/mte/.gitignore | 1 + tools/testing/selftests/arm64/mte/Makefile | 29 ++ .../selftests/arm64/mte/check_buffer_fill.c | 475 ++++++++++++++++++ .../selftests/arm64/mte/mte_common_util.c | 341 +++++++++++++ .../selftests/arm64/mte/mte_common_util.h | 117 +++++ tools/testing/selftests/arm64/mte/mte_def.h | 60 +++ .../testing/selftests/arm64/mte/mte_helper.S | 114 +++++ 8 files changed, 1138 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/arm64/mte/.gitignore create mode 100644 tools/testing/selftests/arm64/mte/Makefile create mode 100644 tools/testing/selftests/arm64/mte/check_buffer_fill.c create mode 100644 tools/testing/selftests/arm64/mte/mte_common_util.c create mode 100644 tools/testing/selftests/arm64/mte/mte_common_util.h create mode 100644 tools/testing/selftests/arm64/mte/mte_def.h create mode 100644 tools/testing/selftests/arm64/mte/mte_helper.S diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 93b567d23c8b..3723d9dea11a 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -4,7 +4,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),aarch64 arm64)) -ARM64_SUBTARGETS ?= tags signal +ARM64_SUBTARGETS ?= tags signal mte else ARM64_SUBTARGETS := endif diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore new file mode 100644 index 000000000000..3f8c1f6c82b9 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -0,0 +1 @@ +check_buffer_fill diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile new file mode 100644 index 000000000000..2480226dfe57 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/Makefile @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020 ARM Limited + +CFLAGS += -std=gnu99 -I. +SRCS := $(filter-out mte_common_util.c,$(wildcard *.c)) +PROGS := $(patsubst %.c,%,$(SRCS)) + +#Add mte compiler option +ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep gcc),) +CFLAGS += -march=armv8.5-a+memtag +endif + +#check if the compiler works well +mte_cc_support := $(shell if ($(CC) $(CFLAGS) -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi) + +ifeq ($(mte_cc_support),1) +# Generated binaries to be installed by top KSFT script +TEST_GEN_PROGS := $(PROGS) + +# Get Kernel headers installed and use them. +KSFT_KHDR_INSTALL := 1 +endif + +# Include KSFT lib.mk. 
+include ../../lib.mk + +ifeq ($(mte_cc_support),1) +$(TEST_GEN_PROGS): mte_common_util.c mte_common_util.h mte_helper.S +endif diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c new file mode 100644 index 000000000000..242635d79035 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define OVERFLOW_RANGE MT_GRANULE_SIZE + +static int sizes[] = { + 1, 555, 1033, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE, + /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0 +}; + +enum mte_block_test_alloc { + UNTAGGED_TAGGED, + TAGGED_UNTAGGED, + TAGGED_TAGGED, + BLOCK_ALLOC_MAX, +}; + +static int check_buffer_by_byte(int mem_type, int mode) +{ + char *ptr; + int i, j, item; + bool err; + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + item = sizeof(sizes)/sizeof(int); + + for (i = 0; i < item; i++) { + ptr = (char *)mte_allocate_memory(sizes[i], mem_type, 0, true); + if (check_allocated_memory(ptr, sizes[i], mem_type, true) != KSFT_PASS) + return KSFT_FAIL; + mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[i]); + /* Set some value in tagged memory */ + for (j = 0; j < sizes[i]; j++) + ptr[j] = '1'; + mte_wait_after_trig(); + err = cur_mte_cxt.fault_valid; + /* Check the buffer whether it is filled. */ + for (j = 0; j < sizes[i] && !err; j++) { + if (ptr[j] != '1') + err = true; + } + mte_free_memory((void *)ptr, sizes[i], mem_type, true); + + if (err) + break; + } + if (!err) + return KSFT_PASS; + else + return KSFT_FAIL; +} + +static int check_buffer_underflow_by_byte(int mem_type, int mode, + int underflow_range) +{ + char *ptr; + int i, j, item, last_index; + bool err; + char *und_ptr = NULL; + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + item = sizeof(sizes)/sizeof(int); + for (i = 0; i < item; i++) { + ptr = (char *)mte_allocate_memory_tag_range(sizes[i], mem_type, 0, + underflow_range, 0); + if (check_allocated_memory_range(ptr, sizes[i], mem_type, + underflow_range, 0) != KSFT_PASS) + return KSFT_FAIL; + + mte_initialize_current_context(mode, (uintptr_t)ptr, -underflow_range); + last_index = 0; + /* Set some value in tagged memory and make the buffer underflow */ + for (j = sizes[i] - 1; (j >= -underflow_range) && + (cur_mte_cxt.fault_valid == false); j--) { + ptr[j] = '1'; + last_index = j; + } + mte_wait_after_trig(); + err = false; + /* Check whether the buffer is filled */ + for (j = 0; j < sizes[i]; j++) { + if (ptr[j] != '1') { + err = true; + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + j, ptr); + break; + } + } + if (err) + goto check_buffer_underflow_by_byte_err; + + switch (mode) { + case MTE_NONE_ERR: + if (cur_mte_cxt.fault_valid == true || last_index != -underflow_range) { + err = true; + break; + } + /* There were no fault so the underflow area should be filled */ + und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr - underflow_range); + for (j = 0 ; j < underflow_range; j++) { + if (und_ptr[j] != '1') { + err = true; + break; + } + } + break; + case MTE_ASYNC_ERR: + /* Imprecise fault should occur otherwise return error */ + if (cur_mte_cxt.fault_valid == false) { + err = true; + break; + } + /* + * The imprecise fault is checked after the write to the buffer, + * so the underflow area before the fault should be filled. 
+ */ + und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr); + for (j = last_index ; j < 0 ; j++) { + if (und_ptr[j] != '1') { + err = true; + break; + } + } + break; + case MTE_SYNC_ERR: + /* Precise fault should occur otherwise return error */ + if (!cur_mte_cxt.fault_valid || (last_index != (-1))) { + err = true; + break; + } + /* Underflow area should not be filled */ + und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr); + if (und_ptr[-1] == '1') + err = true; + break; + default: + err = true; + break; + } +check_buffer_underflow_by_byte_err: + mte_free_memory_tag_range((void *)ptr, sizes[i], mem_type, underflow_range, 0); + if (err) + break; + } + return (err ? KSFT_FAIL : KSFT_PASS); +} + +static int check_buffer_overflow_by_byte(int mem_type, int mode, + int overflow_range) +{ + char *ptr; + int i, j, item, last_index; + bool err; + size_t tagged_size, overflow_size; + char *over_ptr = NULL; + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + item = sizeof(sizes)/sizeof(int); + for (i = 0; i < item; i++) { + ptr = (char *)mte_allocate_memory_tag_range(sizes[i], mem_type, 0, + 0, overflow_range); + if (check_allocated_memory_range(ptr, sizes[i], mem_type, + 0, overflow_range) != KSFT_PASS) + return KSFT_FAIL; + + tagged_size = MT_ALIGN_UP(sizes[i]); + + mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[i] + overflow_range); + + /* Set some value in tagged memory and make the buffer underflow */ + for (j = 0, last_index = 0 ; (j < (sizes[i] + overflow_range)) && + (cur_mte_cxt.fault_valid == false); j++) { + ptr[j] = '1'; + last_index = j; + } + mte_wait_after_trig(); + err = false; + /* Check whether the buffer is filled */ + for (j = 0; j < sizes[i]; j++) { + if (ptr[j] != '1') { + err = true; + ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n", + j, ptr); + break; + } + } + if (err) + goto check_buffer_overflow_by_byte_err; + + overflow_size = overflow_range - (tagged_size - sizes[i]); + + switch (mode) { + case MTE_NONE_ERR: + if ((cur_mte_cxt.fault_valid == true) || + (last_index != (sizes[i] + overflow_range - 1))) { + err = true; + break; + } + /* There were no fault so the overflow area should be filled */ + over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr + tagged_size); + for (j = 0 ; j < overflow_size; j++) { + if (over_ptr[j] != '1') { + err = true; + break; + } + } + break; + case MTE_ASYNC_ERR: + /* Imprecise fault should occur otherwise return error */ + if (cur_mte_cxt.fault_valid == false) { + err = true; + break; + } + /* + * The imprecise fault is checked after the write to the buffer, + * so the overflow area should be filled before the fault. + */ + over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr); + for (j = tagged_size ; j < last_index; j++) { + if (over_ptr[j] != '1') { + err = true; + break; + } + } + break; + case MTE_SYNC_ERR: + /* Precise fault should occur otherwise return error */ + if (!cur_mte_cxt.fault_valid || (last_index != tagged_size)) { + err = true; + break; + } + /* Underflow area should not be filled */ + over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr + tagged_size); + for (j = 0 ; j < overflow_size; j++) { + if (over_ptr[j] == '1') + err = true; + } + break; + default: + err = true; + break; + } +check_buffer_overflow_by_byte_err: + mte_free_memory_tag_range((void *)ptr, sizes[i], mem_type, 0, overflow_range); + if (err) + break; + } + return (err ? 
KSFT_FAIL : KSFT_PASS); +} + +static int check_buffer_by_block_iterate(int mem_type, int mode, size_t size) +{ + char *src, *dst; + int j, result = KSFT_PASS; + enum mte_block_test_alloc alloc_type = UNTAGGED_TAGGED; + + for (alloc_type = UNTAGGED_TAGGED; alloc_type < (int) BLOCK_ALLOC_MAX; alloc_type++) { + switch (alloc_type) { + case UNTAGGED_TAGGED: + src = (char *)mte_allocate_memory(size, mem_type, 0, false); + if (check_allocated_memory(src, size, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + dst = (char *)mte_allocate_memory(size, mem_type, 0, true); + if (check_allocated_memory(dst, size, mem_type, true) != KSFT_PASS) { + mte_free_memory((void *)src, size, mem_type, false); + return KSFT_FAIL; + } + + break; + case TAGGED_UNTAGGED: + dst = (char *)mte_allocate_memory(size, mem_type, 0, false); + if (check_allocated_memory(dst, size, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + src = (char *)mte_allocate_memory(size, mem_type, 0, true); + if (check_allocated_memory(src, size, mem_type, true) != KSFT_PASS) { + mte_free_memory((void *)dst, size, mem_type, false); + return KSFT_FAIL; + } + break; + case TAGGED_TAGGED: + src = (char *)mte_allocate_memory(size, mem_type, 0, true); + if (check_allocated_memory(src, size, mem_type, true) != KSFT_PASS) + return KSFT_FAIL; + + dst = (char *)mte_allocate_memory(size, mem_type, 0, true); + if (check_allocated_memory(dst, size, mem_type, true) != KSFT_PASS) { + mte_free_memory((void *)src, size, mem_type, true); + return KSFT_FAIL; + } + break; + default: + return KSFT_FAIL; + } + + cur_mte_cxt.fault_valid = false; + result = KSFT_PASS; + mte_initialize_current_context(mode, (uintptr_t)dst, size); + /* Set some value in memory and copy*/ + memset((void *)src, (int)'1', size); + memcpy((void *)dst, (void *)src, size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid) { + result = KSFT_FAIL; + goto check_buffer_by_block_err; + } + /* Check the buffer whether it is filled. */ + for (j = 0; j < size; j++) { + if (src[j] != dst[j] || src[j] != '1') { + result = KSFT_FAIL; + break; + } + } +check_buffer_by_block_err: + mte_free_memory((void *)src, size, mem_type, + MT_FETCH_TAG((uintptr_t)src) ? true : false); + mte_free_memory((void *)dst, size, mem_type, + MT_FETCH_TAG((uintptr_t)dst) ? 
true : false); + if (result != KSFT_PASS) + return result; + } + return result; +} + +static int check_buffer_by_block(int mem_type, int mode) +{ + int i, item, result = KSFT_PASS; + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + item = sizeof(sizes)/sizeof(int); + cur_mte_cxt.fault_valid = false; + for (i = 0; i < item; i++) { + result = check_buffer_by_block_iterate(mem_type, mode, sizes[i]); + if (result != KSFT_PASS) + break; + } + return result; +} + +static int compare_memory_tags(char *ptr, size_t size, int tag) +{ + int i, new_tag; + + for (i = 0 ; i < size ; i += MT_GRANULE_SIZE) { + new_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i))); + if (tag != new_tag) { + ksft_print_msg("FAIL: child mte tag mismatch\n"); + return KSFT_FAIL; + } + } + return KSFT_PASS; +} + +static int check_memory_initial_tags(int mem_type, int mode, int mapping) +{ + char *ptr; + int run, fd; + int total = sizeof(sizes)/sizeof(int); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run < total; run++) { + /* check initial tags for anonymous mmap */ + ptr = (char *)mte_allocate_memory(sizes[run], mem_type, mapping, false); + if (check_allocated_memory(ptr, sizes[run], mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + if (compare_memory_tags(ptr, sizes[run], 0) != KSFT_PASS) { + mte_free_memory((void *)ptr, sizes[run], mem_type, false); + return KSFT_FAIL; + } + mte_free_memory((void *)ptr, sizes[run], mem_type, false); + + /* check initial tags for file mmap */ + fd = create_temp_file(); + if (fd == -1) + return KSFT_FAIL; + ptr = (char *)mte_allocate_file_memory(sizes[run], mem_type, mapping, false, fd); + if (check_allocated_memory(ptr, sizes[run], mem_type, false) != KSFT_PASS) { + close(fd); + return KSFT_FAIL; + } + if (compare_memory_tags(ptr, sizes[run], 0) != KSFT_PASS) { + mte_free_memory((void *)ptr, sizes[run], mem_type, false); + close(fd); + return KSFT_FAIL; + } + mte_free_memory((void *)ptr, sizes[run], mem_type, false); + close(fd); + } + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + size_t page_size = getpagesize(); + int item = sizeof(sizes)/sizeof(int); + + sizes[item - 3] = page_size - 1; + sizes[item - 2] = page_size; + sizes[item - 1] = page_size + 1; + + err = mte_default_setup(); + if (err) + return err; + + /* Register SIGSEGV handler */ + mte_register_signal(SIGSEGV, mte_default_handler); + + /* Buffer by byte tests */ + evaluate_test(check_buffer_by_byte(USE_MMAP, MTE_SYNC_ERR), + "Check buffer correctness by byte with sync err mode and mmap memory\n"); + evaluate_test(check_buffer_by_byte(USE_MMAP, MTE_ASYNC_ERR), + "Check buffer correctness by byte with async err mode and mmap memory\n"); + evaluate_test(check_buffer_by_byte(USE_MPROTECT, MTE_SYNC_ERR), + "Check buffer correctness by byte with sync err mode and mmap/mprotect memory\n"); + evaluate_test(check_buffer_by_byte(USE_MPROTECT, MTE_ASYNC_ERR), + "Check buffer correctness by byte with async err mode and mmap/mprotect memory\n"); + + /* Check buffer underflow with underflow size as 16 */ + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_SYNC_ERR, MT_GRANULE_SIZE), + "Check buffer write underflow by byte with sync mode and mmap memory\n"); + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, MT_GRANULE_SIZE), + "Check buffer write underflow by byte with async mode and mmap memory\n"); + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_NONE_ERR, MT_GRANULE_SIZE), + "Check buffer write underflow by byte with tag check 
fault ignore and mmap memory\n"); + + /* Check buffer underflow with underflow size as page size */ + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_SYNC_ERR, page_size), + "Check buffer write underflow by byte with sync mode and mmap memory\n"); + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, page_size), + "Check buffer write underflow by byte with async mode and mmap memory\n"); + evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_NONE_ERR, page_size), + "Check buffer write underflow by byte with tag check fault ignore and mmap memory\n"); + + /* Check buffer overflow with overflow size as 16 */ + evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_SYNC_ERR, MT_GRANULE_SIZE), + "Check buffer write overflow by byte with sync mode and mmap memory\n"); + evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, MT_GRANULE_SIZE), + "Check buffer write overflow by byte with async mode and mmap memory\n"); + evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_NONE_ERR, MT_GRANULE_SIZE), + "Check buffer write overflow by byte with tag fault ignore mode and mmap memory\n"); + + /* Buffer by block tests */ + evaluate_test(check_buffer_by_block(USE_MMAP, MTE_SYNC_ERR), + "Check buffer write correctness by block with sync mode and mmap memory\n"); + evaluate_test(check_buffer_by_block(USE_MMAP, MTE_ASYNC_ERR), + "Check buffer write correctness by block with async mode and mmap memory\n"); + evaluate_test(check_buffer_by_block(USE_MMAP, MTE_NONE_ERR), + "Check buffer write correctness by block with tag fault ignore and mmap memory\n"); + + /* Initial tags are supposed to be 0 */ + evaluate_test(check_memory_initial_tags(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check initial tags with private mapping, sync error mode and mmap memory\n"); + evaluate_test(check_memory_initial_tags(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE), + "Check initial tags with private mapping, sync error mode and mmap/mprotect memory\n"); + evaluate_test(check_memory_initial_tags(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED), + "Check initial tags with shared mapping, sync error mode and mmap memory\n"); + evaluate_test(check_memory_initial_tags(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED), + "Check initial tags with shared mapping, sync error mode and mmap/mprotect memory\n"); + + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? 
KSFT_PASS : KSFT_FAIL; +} diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c new file mode 100644 index 000000000000..39f8908988ea --- /dev/null +++ b/tools/testing/selftests/arm64/mte/mte_common_util.c @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define INIT_BUFFER_SIZE 256 + +struct mte_fault_cxt cur_mte_cxt; +static unsigned int mte_cur_mode; +static unsigned int mte_cur_pstate_tco; + +void mte_default_handler(int signum, siginfo_t *si, void *uc) +{ + unsigned long addr = (unsigned long)si->si_addr; + + if (signum == SIGSEGV) { +#ifdef DEBUG + ksft_print_msg("INFO: SIGSEGV signal at pc=%lx, fault addr=%lx, si_code=%lx\n", + ((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code); +#endif + if (si->si_code == SEGV_MTEAERR) { + if (cur_mte_cxt.trig_si_code == si->si_code) + cur_mte_cxt.fault_valid = true; + return; + } + /* Compare the context for precise error */ + else if (si->si_code == SEGV_MTESERR) { + if (cur_mte_cxt.trig_si_code == si->si_code && + ((cur_mte_cxt.trig_range >= 0 && + addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && + addr <= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range)) || + (cur_mte_cxt.trig_range < 0 && + addr <= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && + addr >= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range)))) { + cur_mte_cxt.fault_valid = true; + /* Adjust the pc by 4 */ + ((ucontext_t *)uc)->uc_mcontext.pc += 4; + } else { + ksft_print_msg("Invalid MTE synchronous exception caught!\n"); + exit(1); + } + } else { + ksft_print_msg("Unknown SIGSEGV exception caught!\n"); + exit(1); + } + } else if (signum == SIGBUS) { + ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%lx\n", + ((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code); + if ((cur_mte_cxt.trig_range >= 0 && + addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && + addr <= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range)) || + (cur_mte_cxt.trig_range < 0 && + addr <= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) && + addr >= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range))) { + cur_mte_cxt.fault_valid = true; + /* Adjust the pc by 4 */ + ((ucontext_t *)uc)->uc_mcontext.pc += 4; + } + } +} + +void mte_register_signal(int signal, void (*handler)(int, siginfo_t *, void *)) +{ + struct sigaction sa; + + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sigaction(signal, &sa, NULL); +} + +void mte_wait_after_trig(void) +{ + sched_yield(); +} + +void *mte_insert_tags(void *ptr, size_t size) +{ + void *tag_ptr; + int align_size; + + if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { + ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + return NULL; + } + align_size = MT_ALIGN_UP(size); + tag_ptr = mte_insert_random_tag(ptr); + mte_set_tag_address_range(tag_ptr, align_size); + return tag_ptr; +} + +void mte_clear_tags(void *ptr, size_t size) +{ + if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) { + ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr); + return; + } + size = MT_ALIGN_UP(size); + ptr = (void *)MT_CLEAR_TAG((unsigned long)ptr); + mte_clear_tag_address_range(ptr, size); +} + +static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping, + size_t range_before, size_t 
range_after, + bool tags, int fd) +{ + void *ptr; + int prot_flag, map_flag; + size_t entire_size = size + range_before + range_after; + + if (mem_type != USE_MALLOC && mem_type != USE_MMAP && + mem_type != USE_MPROTECT) { + ksft_print_msg("FAIL: Invalid allocate request\n"); + return NULL; + } + if (mem_type == USE_MALLOC) + return malloc(entire_size) + range_before; + + prot_flag = PROT_READ | PROT_WRITE; + if (mem_type == USE_MMAP) + prot_flag |= PROT_MTE; + + map_flag = mapping; + if (fd == -1) + map_flag = MAP_ANONYMOUS | map_flag; + if (!(mapping & MAP_SHARED)) + map_flag |= MAP_PRIVATE; + ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0); + if (ptr == MAP_FAILED) { + ksft_print_msg("FAIL: mmap allocation\n"); + return NULL; + } + if (mem_type == USE_MPROTECT) { + if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) { + munmap(ptr, size); + ksft_print_msg("FAIL: mprotect PROT_MTE property\n"); + return NULL; + } + } + if (tags) + ptr = mte_insert_tags(ptr + range_before, size); + return ptr; +} + +void *mte_allocate_memory_tag_range(size_t size, int mem_type, int mapping, + size_t range_before, size_t range_after) +{ + return __mte_allocate_memory_range(size, mem_type, mapping, range_before, + range_after, true, -1); +} + +void *mte_allocate_memory(size_t size, int mem_type, int mapping, bool tags) +{ + return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, -1); +} + +void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags, int fd) +{ + int index; + char buffer[INIT_BUFFER_SIZE]; + + if (mem_type != USE_MPROTECT && mem_type != USE_MMAP) { + ksft_print_msg("FAIL: Invalid mmap file request\n"); + return NULL; + } + /* Initialize the file for mappable size */ + lseek(fd, 0, SEEK_SET); + for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) + write(fd, buffer, INIT_BUFFER_SIZE); + index -= INIT_BUFFER_SIZE; + write(fd, buffer, size - index); + return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd); +} + +void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, + size_t range_before, size_t range_after, int fd) +{ + int index; + char buffer[INIT_BUFFER_SIZE]; + int map_size = size + range_before + range_after; + + if (mem_type != USE_MPROTECT && mem_type != USE_MMAP) { + ksft_print_msg("FAIL: Invalid mmap file request\n"); + return NULL; + } + /* Initialize the file for mappable size */ + lseek(fd, 0, SEEK_SET); + for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) + write(fd, buffer, INIT_BUFFER_SIZE); + index -= INIT_BUFFER_SIZE; + write(fd, buffer, map_size - index); + return __mte_allocate_memory_range(size, mem_type, mapping, range_before, + range_after, true, fd); +} + +static void __mte_free_memory_range(void *ptr, size_t size, int mem_type, + size_t range_before, size_t range_after, bool tags) +{ + switch (mem_type) { + case USE_MALLOC: + free(ptr - range_before); + break; + case USE_MMAP: + case USE_MPROTECT: + if (tags) + mte_clear_tags(ptr, size); + munmap(ptr - range_before, size + range_before + range_after); + break; + default: + ksft_print_msg("FAIL: Invalid free request\n"); + break; + } +} + +void mte_free_memory_tag_range(void *ptr, size_t size, int mem_type, + size_t range_before, size_t range_after) +{ + __mte_free_memory_range(ptr, size, mem_type, range_before, range_after, true); +} + +void mte_free_memory(void *ptr, size_t size, int mem_type, bool tags) +{ + __mte_free_memory_range(ptr, size, mem_type, 0, 0, tags); +} + +void 
mte_initialize_current_context(int mode, uintptr_t ptr, ssize_t range) +{ + cur_mte_cxt.fault_valid = false; + cur_mte_cxt.trig_addr = ptr; + cur_mte_cxt.trig_range = range; + if (mode == MTE_SYNC_ERR) + cur_mte_cxt.trig_si_code = SEGV_MTESERR; + else if (mode == MTE_ASYNC_ERR) + cur_mte_cxt.trig_si_code = SEGV_MTEAERR; + else + cur_mte_cxt.trig_si_code = 0; +} + +int mte_switch_mode(int mte_option, unsigned long incl_mask) +{ + unsigned long en = 0; + + if (!(mte_option == MTE_SYNC_ERR || mte_option == MTE_ASYNC_ERR || + mte_option == MTE_NONE_ERR || incl_mask <= MTE_ALLOW_NON_ZERO_TAG)) { + ksft_print_msg("FAIL: Invalid mte config option\n"); + return -EINVAL; + } + en = PR_TAGGED_ADDR_ENABLE; + if (mte_option == MTE_SYNC_ERR) + en |= PR_MTE_TCF_SYNC; + else if (mte_option == MTE_ASYNC_ERR) + en |= PR_MTE_TCF_ASYNC; + else if (mte_option == MTE_NONE_ERR) + en |= PR_MTE_TCF_NONE; + + en |= (incl_mask << PR_MTE_TAG_SHIFT); + /* Enable address tagging ABI, mte error reporting mode and tag inclusion mask. */ + if (!prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) == 0) { + ksft_print_msg("FAIL:prctl PR_SET_TAGGED_ADDR_CTRL for mte mode\n"); + return -EINVAL; + } + return 0; +} + +#define ID_AA64PFR1_MTE_SHIFT 8 +#define ID_AA64PFR1_MTE 2 + +int mte_default_setup(void) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + unsigned long en = 0; + int ret; + + if (!(hwcaps & HWCAP_CPUID)) { + ksft_print_msg("FAIL: CPUID registers unavailable\n"); + return KSFT_FAIL; + } + /* Read ID_AA64PFR1_EL1 register */ + asm volatile("mrs %0, id_aa64pfr1_el1" : "=r"(hwcaps) : : "memory"); + if (((hwcaps >> ID_AA64PFR1_MTE_SHIFT) & MT_TAG_MASK) != ID_AA64PFR1_MTE) { + ksft_print_msg("FAIL: MTE features unavailable\n"); + return KSFT_SKIP; + } + /* Get current mte mode */ + ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0); + if (ret < 0) { + ksft_print_msg("FAIL:prctl PR_GET_TAGGED_ADDR_CTRL with error =%d\n", ret); + return KSFT_FAIL; + } + if (ret & PR_MTE_TCF_SYNC) + mte_cur_mode = MTE_SYNC_ERR; + else if (ret & PR_MTE_TCF_ASYNC) + mte_cur_mode = MTE_ASYNC_ERR; + else if (ret & PR_MTE_TCF_NONE) + mte_cur_mode = MTE_NONE_ERR; + + mte_cur_pstate_tco = mte_get_pstate_tco(); + /* Disable PSTATE.TCO */ + mte_disable_pstate_tco(); + return 0; +} + +void mte_restore_setup(void) +{ + mte_switch_mode(mte_cur_mode, MTE_ALLOW_NON_ZERO_TAG); + if (mte_cur_pstate_tco == MT_PSTATE_TCO_EN) + mte_enable_pstate_tco(); + else if (mte_cur_pstate_tco == MT_PSTATE_TCO_DIS) + mte_disable_pstate_tco(); +} + +int create_temp_file(void) +{ + int fd; + char filename[] = "/dev/shm/tmp_XXXXXX"; + + /* Create a file in the tmpfs filesystem */ + fd = mkstemp(&filename[0]); + if (fd == -1) { + ksft_print_msg("FAIL: Unable to open temporary file\n"); + return 0; + } + unlink(&filename[0]); + return fd; +} diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h new file mode 100644 index 000000000000..45160e061a0e --- /dev/null +++ b/tools/testing/selftests/arm64/mte/mte_common_util.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ARM Limited */ + +#ifndef _MTE_COMMON_UTIL_H +#define _MTE_COMMON_UTIL_H + +#include +#include +#include +#include +#include +#include +#include "mte_def.h" +#include "kselftest.h" + +enum mte_mem_type { + USE_MALLOC, + USE_MMAP, + USE_MPROTECT, +}; + +enum mte_mode { + MTE_NONE_ERR, + MTE_SYNC_ERR, + MTE_ASYNC_ERR, +}; + +struct mte_fault_cxt { + /* Address start which triggers mte tag fault */ + unsigned long trig_addr; 
+ /* Address range for mte tag fault and negative value means underflow */ + ssize_t trig_range; + /* siginfo si code */ + unsigned long trig_si_code; + /* Flag to denote if correct fault caught */ + bool fault_valid; +}; + +extern struct mte_fault_cxt cur_mte_cxt; + +/* MTE utility functions */ +void mte_default_handler(int signum, siginfo_t *si, void *uc); +void mte_register_signal(int signal, void (*handler)(int, siginfo_t *, void *)); +void mte_wait_after_trig(void); +void *mte_allocate_memory(size_t size, int mem_type, int mapping, bool tags); +void *mte_allocate_memory_tag_range(size_t size, int mem_type, int mapping, + size_t range_before, size_t range_after); +void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, + bool tags, int fd); +void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping, + size_t range_before, size_t range_after, int fd); +void mte_free_memory(void *ptr, size_t size, int mem_type, bool tags); +void mte_free_memory_tag_range(void *ptr, size_t size, int mem_type, + size_t range_before, size_t range_after); +void *mte_insert_tags(void *ptr, size_t size); +void mte_clear_tags(void *ptr, size_t size); +int mte_default_setup(void); +void mte_restore_setup(void); +int mte_switch_mode(int mte_option, unsigned long incl_mask); +void mte_initialize_current_context(int mode, uintptr_t ptr, ssize_t range); + +/* Common utility functions */ +int create_temp_file(void); + +/* Assembly MTE utility functions */ +void *mte_insert_random_tag(void *ptr); +void *mte_get_tag_address(void *ptr); +void mte_set_tag_address_range(void *ptr, int range); +void mte_clear_tag_address_range(void *ptr, int range); +void mte_disable_pstate_tco(void); +void mte_enable_pstate_tco(void); +unsigned int mte_get_pstate_tco(void); + +/* Test framework static inline functions/macros */ +static inline void evaluate_test(int err, const char *msg) +{ + if (err == KSFT_PASS) + ksft_test_result_pass(msg); + else if (err == KSFT_FAIL) + ksft_test_result_fail(msg); +} + +static inline int check_allocated_memory(void *ptr, size_t size, + int mem_type, bool tags) +{ + if (ptr == NULL) { + ksft_print_msg("FAIL: memory allocation\n"); + return KSFT_FAIL; + } + + if (tags && !MT_FETCH_TAG((uintptr_t)ptr)) { + ksft_print_msg("FAIL: tag not found at addr(%p)\n", ptr); + mte_free_memory((void *)ptr, size, mem_type, false); + return KSFT_FAIL; + } + + return KSFT_PASS; +} + +static inline int check_allocated_memory_range(void *ptr, size_t size, int mem_type, + size_t range_before, size_t range_after) +{ + if (ptr == NULL) { + ksft_print_msg("FAIL: memory allocation\n"); + return KSFT_FAIL; + } + + if (!MT_FETCH_TAG((uintptr_t)ptr)) { + ksft_print_msg("FAIL: tag not found at addr(%p)\n", ptr); + mte_free_memory_tag_range((void *)ptr, size, mem_type, range_before, + range_after); + return KSFT_FAIL; + } + return KSFT_PASS; +} + +#endif /* _MTE_COMMON_UTIL_H */ diff --git a/tools/testing/selftests/arm64/mte/mte_def.h b/tools/testing/selftests/arm64/mte/mte_def.h new file mode 100644 index 000000000000..9b188254b61a --- /dev/null +++ b/tools/testing/selftests/arm64/mte/mte_def.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ARM Limited */ + +/* + * Below definitions may be found in kernel headers, However, they are + * redefined here to decouple the MTE selftests compilations from them. 
+ */ +#ifndef SEGV_MTEAERR +#define SEGV_MTEAERR 8 +#endif +#ifndef SEGV_MTESERR +#define SEGV_MTESERR 9 +#endif +#ifndef PROT_MTE +#define PROT_MTE 0x20 +#endif +#ifndef HWCAP2_MTE +#define HWCAP2_MTE (1 << 18) +#endif + +#ifndef PR_MTE_TCF_SHIFT +#define PR_MTE_TCF_SHIFT 1 +#endif +#ifndef PR_MTE_TCF_NONE +#define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT) +#endif +#ifndef PR_MTE_TCF_SYNC +#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT) +#endif +#ifndef PR_MTE_TCF_ASYNC +#define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT) +#endif +#ifndef PR_MTE_TAG_SHIFT +#define PR_MTE_TAG_SHIFT 3 +#endif + +/* MTE Hardware feature definitions below. */ +#define MT_TAG_SHIFT 56 +#define MT_TAG_MASK 0xFUL +#define MT_FREE_TAG 0x0UL +#define MT_GRANULE_SIZE 16 +#define MT_TAG_COUNT 16 +#define MT_INCLUDE_TAG_MASK 0xFFFF +#define MT_EXCLUDE_TAG_MASK 0x0 + +#define MT_ALIGN_GRANULE (MT_GRANULE_SIZE - 1) +#define MT_CLEAR_TAG(x) ((x) & ~(MT_TAG_MASK << MT_TAG_SHIFT)) +#define MT_SET_TAG(x, y) ((x) | (y << MT_TAG_SHIFT)) +#define MT_FETCH_TAG(x) ((x >> MT_TAG_SHIFT) & (MT_TAG_MASK)) +#define MT_ALIGN_UP(x) ((x + MT_ALIGN_GRANULE) & ~(MT_ALIGN_GRANULE)) + +#define MT_PSTATE_TCO_SHIFT 25 +#define MT_PSTATE_TCO_MASK ~(0x1 << MT_PSTATE_TCO_SHIFT) +#define MT_PSTATE_TCO_EN 1 +#define MT_PSTATE_TCO_DIS 0 + +#define MT_EXCLUDE_TAG(x) (1 << (x)) +#define MT_INCLUDE_VALID_TAG(x) (MT_INCLUDE_TAG_MASK ^ MT_EXCLUDE_TAG(x)) +#define MT_INCLUDE_VALID_TAGS(x) (MT_INCLUDE_TAG_MASK ^ (x)) +#define MTE_ALLOW_NON_ZERO_TAG MT_INCLUDE_VALID_TAG(0) diff --git a/tools/testing/selftests/arm64/mte/mte_helper.S b/tools/testing/selftests/arm64/mte/mte_helper.S new file mode 100644 index 000000000000..48e049fbad9a --- /dev/null +++ b/tools/testing/selftests/arm64/mte/mte_helper.S @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2020 ARM Limited */ + +#include "mte_def.h" + +#define ENTRY(name) \ + .globl name ;\ + .p2align 2;\ + .type name, @function ;\ +name: + +#define ENDPROC(name) \ + .size name, .-name ; + + .text +/* + * mte_insert_random_tag: Insert random tag and might be same as the source tag if + * the source pointer has it. + * Input: + * x0 - source pointer with a tag/no-tag + * Return: + * x0 - pointer with random tag + */ +ENTRY(mte_insert_random_tag) + irg x0, x0, xzr + ret +ENDPROC(mte_insert_random_tag) + +/* + * mte_get_tag_address: Get the tag from given address. 
+ * Input: + * x0 - source pointer + * Return: + * x0 - pointer with appended tag + */ +ENTRY(mte_get_tag_address) + ldg x0, [x0] + ret +ENDPROC(mte_get_tag_address) + +/* + * mte_set_tag_address_range: Set the tag range from the given address + * Input: + * x0 - source pointer with tag data + * x1 - range + * Return: + * none + */ +ENTRY(mte_set_tag_address_range) + cbz x1, 2f +1: + stg x0, [x0, #0x0] + add x0, x0, #MT_GRANULE_SIZE + sub x1, x1, #MT_GRANULE_SIZE + cbnz x1, 1b +2: + ret +ENDPROC(mte_set_tag_address_range) + +/* + * mt_clear_tag_address_range: Clear the tag range from the given address + * Input: + * x0 - source pointer with tag data + * x1 - range + * Return: + * none + */ +ENTRY(mte_clear_tag_address_range) + cbz x1, 2f +1: + stzg x0, [x0, #0x0] + add x0, x0, #MT_GRANULE_SIZE + sub x1, x1, #MT_GRANULE_SIZE + cbnz x1, 1b +2: + ret +ENDPROC(mte_clear_tag_address_range) + +/* + * mte_enable_pstate_tco: Enable PSTATE.TCO (tag check override) field + * Input: + * none + * Return: + * none + */ +ENTRY(mte_enable_pstate_tco) + msr tco, #MT_PSTATE_TCO_EN + ret +ENDPROC(mte_enable_pstate_tco) + +/* + * mte_disable_pstate_tco: Disable PSTATE.TCO (tag check override) field + * Input: + * none + * Return: + * none + */ +ENTRY(mte_disable_pstate_tco) + msr tco, #MT_PSTATE_TCO_DIS + ret +ENDPROC(mte_disable_pstate_tco) + +/* + * mte_get_pstate_tco: Get PSTATE.TCO (tag check override) field + * Input: + * none + * Return: + * x0 + */ +ENTRY(mte_get_pstate_tco) + mrs x0, tco + ubfx x0, x0, #MT_PSTATE_TCO_SHIFT, #1 + ret +ENDPROC(mte_get_pstate_tco) From f3b2a26ca78da2ef36cf76d5511e3c94baee96a1 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:26 +0530 Subject: [PATCH 141/148] kselftest/arm64: Verify mte tag inclusion via prctl This testcase verifies that the tag generated with "irg" instruction contains only included tags. This is done via prtcl call. This test covers 4 scenarios, * At least one included tag. * More than one included tags. * All included. * None included. 
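As a rough standalone illustration of the prctl interface this test drives (a sketch only, not part of the patch: it assumes an MTE-capable CPU and kernel, a toolchain that accepts the irg mnemonic, e.g. -march=armv8.5-a+memtag, and it mirrors the fallback prctl constants that mte_def.h provides), restricting IRG to a single tag could look like this:

/* Hypothetical standalone sketch, not the selftest itself. */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL	55
#endif
#ifndef PR_TAGGED_ADDR_ENABLE
#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
#endif
#ifndef PR_MTE_TCF_SYNC
#define PR_MTE_TCF_SYNC		(1UL << 1)
#endif
#ifndef PR_MTE_TAG_SHIFT
#define PR_MTE_TAG_SHIFT	3
#endif

int main(void)
{
	char buf[16] __attribute__((aligned(16)));
	void *tagged;
	unsigned long ctrl;
	int i;

	/* Include mask: only bit 3 set, so IRG may only generate tag 0x3. */
	ctrl = PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
	       ((1UL << 3) << PR_MTE_TAG_SHIFT);
	if (prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0)) {
		perror("PR_SET_TAGGED_ADDR_CTRL");
		return 1;
	}

	for (i = 0; i < 8; i++) {
		/* Generate an address tag; the pointer is never dereferenced. */
		asm volatile("irg %0, %1" : "=r"(tagged) : "r"(buf));
		printf("generated tag: 0x%lx\n",
		       ((unsigned long)tagged >> 56) & 0xfUL);
	}
	return 0;
}

With all 16 bits set in the include mask the printed tags vary across the whole 0x0-0xf range, and with an empty mask IRG can only produce tag 0, which is what the "none included" scenario checks.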
Co-developed-by: Gabor Kertesz Signed-off-by: Gabor Kertesz Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-3-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + .../arm64/mte/check_tags_inclusion.c | 185 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_tags_inclusion.c diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 3f8c1f6c82b9..c3fca255d3d6 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -1 +1,2 @@ check_buffer_fill +check_tags_inclusion diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c new file mode 100644 index 000000000000..94d245a0ed56 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define BUFFER_SIZE (5 * MT_GRANULE_SIZE) +#define RUNS (MT_TAG_COUNT * 2) +#define MTE_LAST_TAG_MASK (0x7FFF) + +static int verify_mte_pointer_validity(char *ptr, int mode) +{ + mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE); + /* Check the validity of the tagged pointer */ + memset((void *)ptr, '1', BUFFER_SIZE); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid) + return KSFT_FAIL; + /* Proceed further for nonzero tags */ + if (!MT_FETCH_TAG((uintptr_t)ptr)) + return KSFT_PASS; + mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE + 1); + /* Check the validity outside the range */ + ptr[BUFFER_SIZE] = '2'; + mte_wait_after_trig(); + if (!cur_mte_cxt.fault_valid) + return KSFT_FAIL; + else + return KSFT_PASS; +} + +static int check_single_included_tags(int mem_type, int mode) +{ + char *ptr; + int tag, run, result = KSFT_PASS; + + ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false); + if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE, + mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + for (tag = 0; (tag < MT_TAG_COUNT) && (result == KSFT_PASS); tag++) { + mte_switch_mode(mode, MT_INCLUDE_VALID_TAG(tag)); + /* Try to catch a excluded tag by a number of tries. 
*/ + for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) { + ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE); + /* Check tag value */ + if (MT_FETCH_TAG((uintptr_t)ptr) == tag) { + ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + MT_FETCH_TAG((uintptr_t)ptr), + MT_INCLUDE_VALID_TAG(tag)); + result = KSFT_FAIL; + break; + } + result = verify_mte_pointer_validity(ptr, mode); + } + } + mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE); + return result; +} + +static int check_multiple_included_tags(int mem_type, int mode) +{ + char *ptr; + int tag, run, result = KSFT_PASS; + unsigned long excl_mask = 0; + + ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false); + if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE, + mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + for (tag = 0; (tag < MT_TAG_COUNT - 1) && (result == KSFT_PASS); tag++) { + excl_mask |= 1 << tag; + mte_switch_mode(mode, MT_INCLUDE_VALID_TAGS(excl_mask)); + /* Try to catch a excluded tag by a number of tries. */ + for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) { + ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE); + /* Check tag value */ + if (MT_FETCH_TAG((uintptr_t)ptr) < tag) { + ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n", + MT_FETCH_TAG((uintptr_t)ptr), + MT_INCLUDE_VALID_TAGS(excl_mask)); + result = KSFT_FAIL; + break; + } + result = verify_mte_pointer_validity(ptr, mode); + } + } + mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE); + return result; +} + +static int check_all_included_tags(int mem_type, int mode) +{ + char *ptr; + int run, result = KSFT_PASS; + + ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false); + if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE, + mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + mte_switch_mode(mode, MT_INCLUDE_TAG_MASK); + /* Try to catch a excluded tag by a number of tries. */ + for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) { + ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE); + /* + * Here tag byte can be between 0x0 to 0xF (full allowed range) + * so no need to match so just verify if it is writable. + */ + result = verify_mte_pointer_validity(ptr, mode); + } + mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE); + return result; +} + +static int check_none_included_tags(int mem_type, int mode) +{ + char *ptr; + int run; + + ptr = (char *)mte_allocate_memory(BUFFER_SIZE, mem_type, 0, false); + if (check_allocated_memory(ptr, BUFFER_SIZE, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + mte_switch_mode(mode, MT_EXCLUDE_TAG_MASK); + /* Try to catch a excluded tag by a number of tries. 
*/ + for (run = 0; run < RUNS; run++) { + ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE); + /* Here all tags exluded so tag value generated should be 0 */ + if (MT_FETCH_TAG((uintptr_t)ptr)) { + ksft_print_msg("FAIL: included tag value found\n"); + mte_free_memory((void *)ptr, BUFFER_SIZE, mem_type, true); + return KSFT_FAIL; + } + mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE); + /* Check the write validity of the untagged pointer */ + memset((void *)ptr, '1', BUFFER_SIZE); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid) + break; + } + mte_free_memory((void *)ptr, BUFFER_SIZE, mem_type, false); + if (cur_mte_cxt.fault_valid) + return KSFT_FAIL; + else + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + + err = mte_default_setup(); + if (err) + return err; + + /* Register SIGSEGV handler */ + mte_register_signal(SIGSEGV, mte_default_handler); + + evaluate_test(check_single_included_tags(USE_MMAP, MTE_SYNC_ERR), + "Check an included tag value with sync mode\n"); + evaluate_test(check_multiple_included_tags(USE_MMAP, MTE_SYNC_ERR), + "Check different included tags value with sync mode\n"); + evaluate_test(check_none_included_tags(USE_MMAP, MTE_SYNC_ERR), + "Check none included tags value with sync mode\n"); + evaluate_test(check_all_included_tags(USE_MMAP, MTE_SYNC_ERR), + "Check all included tags value with sync mode\n"); + + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; +} From dfe537cf47186bfb59db3e57eed7417cd5efde17 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:27 +0530 Subject: [PATCH 142/148] kselftest/arm64: Check forked child mte memory accessibility This test covers the mte memory behaviour of the forked process with different mapping properties and flags. It checks that all bytes of forked child memory are accessible with the same tag as that of the parent and memory accessed outside the tag range causes fault to occur. 
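As a hedged sketch of the property being verified, the parent/child check boils down to the following (this is not the test itself; it assumes it is built together with the selftest sources so that the mte_common_util.h helpers added earlier in this series are available):

#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#include "kselftest.h"
#include "mte_common_util.h"
#include "mte_def.h"

static int demo_child_tag_inheritance(void)
{
	size_t len = 4 * MT_GRANULE_SIZE;
	char *ptr;
	int status;

	if (mte_default_setup())
		return KSFT_FAIL;
	mte_register_signal(SIGSEGV, mte_default_handler);
	mte_switch_mode(MTE_SYNC_ERR, MTE_ALLOW_NON_ZERO_TAG);

	/* PROT_MTE mapping carrying random, non-zero allocation tags */
	ptr = mte_allocate_memory(len, USE_MMAP, MAP_PRIVATE, true);
	if (check_allocated_memory(ptr, len, USE_MMAP, true) != KSFT_PASS)
		return KSFT_FAIL;

	if (fork() == 0) {
		/*
		 * The child's COW copy keeps the parent's allocation tags,
		 * so writing through the parent's tagged pointer must not
		 * raise a tag check fault.
		 */
		mte_initialize_current_context(MTE_SYNC_ERR, (uintptr_t)ptr, len);
		memset(ptr, 'c', len);
		mte_wait_after_trig();
		_exit(cur_mte_cxt.fault_valid ? KSFT_FAIL : KSFT_PASS);
	}
	wait(&status);
	mte_free_memory(ptr, len, USE_MMAP, true);
	mte_restore_setup();
	return WIFEXITED(status) ? WEXITSTATUS(status) : KSFT_FAIL;
}

The test in this patch additionally touches the underflow and overflow granules around the buffer from the child, expecting a fault there, and repeats the whole exercise for file-backed mappings.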
Co-developed-by: Gabor Kertesz Signed-off-by: Gabor Kertesz Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-4-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + .../selftests/arm64/mte/check_child_memory.c | 195 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_child_memory.c diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index c3fca255d3d6..b5fcc0fb4d97 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -1,2 +1,3 @@ check_buffer_fill check_tags_inclusion +check_child_memory diff --git a/tools/testing/selftests/arm64/mte/check_child_memory.c b/tools/testing/selftests/arm64/mte/check_child_memory.c new file mode 100644 index 000000000000..97bebdecd29e --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_child_memory.c @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define BUFFER_SIZE (5 * MT_GRANULE_SIZE) +#define RUNS (MT_TAG_COUNT) +#define UNDERFLOW MT_GRANULE_SIZE +#define OVERFLOW MT_GRANULE_SIZE + +static size_t page_size; +static int sizes[] = { + 1, 537, 989, 1269, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE, + /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0 +}; + +static int check_child_tag_inheritance(char *ptr, int size, int mode) +{ + int i, parent_tag, child_tag, fault, child_status; + pid_t child; + + parent_tag = MT_FETCH_TAG((uintptr_t)ptr); + fault = 0; + + child = fork(); + if (child == -1) { + ksft_print_msg("FAIL: child process creation\n"); + return KSFT_FAIL; + } else if (child == 0) { + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + /* Do copy on write */ + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) { + fault = 1; + goto check_child_tag_inheritance_err; + } + for (i = 0 ; i < size ; i += MT_GRANULE_SIZE) { + child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i))); + if (parent_tag != child_tag) { + ksft_print_msg("FAIL: child mte tag mismatch\n"); + fault = 1; + goto check_child_tag_inheritance_err; + } + } + mte_initialize_current_context(mode, (uintptr_t)ptr, -UNDERFLOW); + memset(ptr - UNDERFLOW, '2', UNDERFLOW); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == false) { + fault = 1; + goto check_child_tag_inheritance_err; + } + mte_initialize_current_context(mode, (uintptr_t)ptr, size + OVERFLOW); + memset(ptr + size, '3', OVERFLOW); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == false) { + fault = 1; + goto check_child_tag_inheritance_err; + } +check_child_tag_inheritance_err: + _exit(fault); + } + /* Wait for child process to terminate */ + wait(&child_status); + if (WIFEXITED(child_status)) + fault = WEXITSTATUS(child_status); + else + fault = 1; + return (fault) ? 
KSFT_FAIL : KSFT_PASS; +} + +static int check_child_memory_mapping(int mem_type, int mode, int mapping) +{ + char *ptr; + int run, result; + int item = sizeof(sizes)/sizeof(int); + + item = sizeof(sizes)/sizeof(int); + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run < item; run++) { + ptr = (char *)mte_allocate_memory_tag_range(sizes[run], mem_type, mapping, + UNDERFLOW, OVERFLOW); + if (check_allocated_memory_range(ptr, sizes[run], mem_type, + UNDERFLOW, OVERFLOW) != KSFT_PASS) + return KSFT_FAIL; + result = check_child_tag_inheritance(ptr, sizes[run], mode); + mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, UNDERFLOW, OVERFLOW); + if (result == KSFT_FAIL) + return result; + } + return KSFT_PASS; +} + +static int check_child_file_mapping(int mem_type, int mode, int mapping) +{ + char *ptr, *map_ptr; + int run, fd, map_size, result = KSFT_PASS; + int total = sizeof(sizes)/sizeof(int); + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run < total; run++) { + fd = create_temp_file(); + if (fd == -1) + return KSFT_FAIL; + + map_size = sizes[run] + OVERFLOW + UNDERFLOW; + map_ptr = (char *)mte_allocate_file_memory(map_size, mem_type, mapping, false, fd); + if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) { + close(fd); + return KSFT_FAIL; + } + ptr = map_ptr + UNDERFLOW; + mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]); + /* Only mte enabled memory will allow tag insertion */ + ptr = mte_insert_tags((void *)ptr, sizes[run]); + if (!ptr || cur_mte_cxt.fault_valid == true) { + ksft_print_msg("FAIL: Insert tags on file based memory\n"); + munmap((void *)map_ptr, map_size); + close(fd); + return KSFT_FAIL; + } + result = check_child_tag_inheritance(ptr, sizes[run], mode); + mte_clear_tags((void *)ptr, sizes[run]); + munmap((void *)map_ptr, map_size); + close(fd); + if (result != KSFT_PASS) + return KSFT_FAIL; + } + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + int item = sizeof(sizes)/sizeof(int); + + page_size = getpagesize(); + if (!page_size) { + ksft_print_msg("ERR: Unable to get page size\n"); + return KSFT_FAIL; + } + sizes[item - 3] = page_size - 1; + sizes[item - 2] = page_size; + sizes[item - 1] = page_size + 1; + + err = mte_default_setup(); + if (err) + return err; + + /* Register SIGSEGV handler */ + mte_register_signal(SIGSEGV, mte_default_handler); + mte_register_signal(SIGBUS, mte_default_handler); + + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check child anonymous memory with private mapping, precise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED), + "Check child anonymous memory with shared mapping, precise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE), + "Check child anonymous memory with private mapping, imprecise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED), + "Check child anonymous memory with shared mapping, imprecise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE), + "Check child anonymous memory with private mapping, precise mode and mmap/mprotect memory\n"); + evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED), + "Check child anonymous memory with shared mapping, precise mode and mmap/mprotect memory\n"); + + 
evaluate_test(check_child_file_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check child file memory with private mapping, precise mode and mmap memory\n"); + evaluate_test(check_child_file_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED), + "Check child file memory with shared mapping, precise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE), + "Check child file memory with private mapping, imprecise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED), + "Check child file memory with shared mapping, imprecise mode and mmap memory\n"); + evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE), + "Check child file memory with private mapping, precise mode and mmap/mprotect memory\n"); + evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED), + "Check child file memory with shared mapping, precise mode and mmap/mprotect memory\n"); + + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; +} From 53ec81d232134f61806dfd93025320caa1aaf559 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:28 +0530 Subject: [PATCH 143/148] kselftest/arm64: Verify all different mmap MTE options This testcase checks the different unsupported/supported options for mmap if used with PROT_MTE memory protection flag. These checks are, * Either pstate.tco enable or prctl PR_MTE_TCF_NONE option should not cause any tag mismatch faults. * Different combinations of anonymous/file memory mmap, mprotect, sync/async error mode and private/shared mappings should work. * mprotect should not be able to clear the PROT_MTE page property. Co-developed-by: Gabor Kertesz Signed-off-by: Gabor Kertesz Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-5-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + .../selftests/arm64/mte/check_mmap_options.c | 262 ++++++++++++++++++ 2 files changed, 263 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_mmap_options.c diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index b5fcc0fb4d97..79a215d3bbd0 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -1,3 +1,4 @@ check_buffer_fill check_tags_inclusion check_child_memory +check_mmap_options diff --git a/tools/testing/selftests/arm64/mte/check_mmap_options.c b/tools/testing/selftests/arm64/mte/check_mmap_options.c new file mode 100644 index 000000000000..33b13b86199b --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_mmap_options.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define RUNS (MT_TAG_COUNT) +#define UNDERFLOW MT_GRANULE_SIZE +#define OVERFLOW MT_GRANULE_SIZE +#define TAG_CHECK_ON 0 +#define TAG_CHECK_OFF 1 + +static size_t page_size; +static int sizes[] = { + 1, 537, 989, 1269, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE, + /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0 +}; + +static int check_mte_memory(char 
*ptr, int size, int mode, int tag_check) +{ + mte_initialize_current_context(mode, (uintptr_t)ptr, size); + memset(ptr, '1', size); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == true) + return KSFT_FAIL; + + mte_initialize_current_context(mode, (uintptr_t)ptr, -UNDERFLOW); + memset(ptr - UNDERFLOW, '2', UNDERFLOW); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == false && tag_check == TAG_CHECK_ON) + return KSFT_FAIL; + if (cur_mte_cxt.fault_valid == true && tag_check == TAG_CHECK_OFF) + return KSFT_FAIL; + + mte_initialize_current_context(mode, (uintptr_t)ptr, size + OVERFLOW); + memset(ptr + size, '3', OVERFLOW); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid == false && tag_check == TAG_CHECK_ON) + return KSFT_FAIL; + if (cur_mte_cxt.fault_valid == true && tag_check == TAG_CHECK_OFF) + return KSFT_FAIL; + + return KSFT_PASS; +} + +static int check_anonymous_memory_mapping(int mem_type, int mode, int mapping, int tag_check) +{ + char *ptr, *map_ptr; + int run, result, map_size; + int item = sizeof(sizes)/sizeof(int); + + item = sizeof(sizes)/sizeof(int); + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run < item; run++) { + map_size = sizes[run] + OVERFLOW + UNDERFLOW; + map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false); + if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + ptr = map_ptr + UNDERFLOW; + mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]); + /* Only mte enabled memory will allow tag insertion */ + ptr = mte_insert_tags((void *)ptr, sizes[run]); + if (!ptr || cur_mte_cxt.fault_valid == true) { + ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n"); + munmap((void *)map_ptr, map_size); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, sizes[run], mode, tag_check); + mte_clear_tags((void *)ptr, sizes[run]); + mte_free_memory((void *)map_ptr, map_size, mem_type, false); + if (result == KSFT_FAIL) + return KSFT_FAIL; + } + return KSFT_PASS; +} + +static int check_file_memory_mapping(int mem_type, int mode, int mapping, int tag_check) +{ + char *ptr, *map_ptr; + int run, fd, map_size; + int total = sizeof(sizes)/sizeof(int); + int result = KSFT_PASS; + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run < total; run++) { + fd = create_temp_file(); + if (fd == -1) + return KSFT_FAIL; + + map_size = sizes[run] + UNDERFLOW + OVERFLOW; + map_ptr = (char *)mte_allocate_file_memory(map_size, mem_type, mapping, false, fd); + if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) { + close(fd); + return KSFT_FAIL; + } + ptr = map_ptr + UNDERFLOW; + mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]); + /* Only mte enabled memory will allow tag insertion */ + ptr = mte_insert_tags((void *)ptr, sizes[run]); + if (!ptr || cur_mte_cxt.fault_valid == true) { + ksft_print_msg("FAIL: Insert tags on file based memory\n"); + munmap((void *)map_ptr, map_size); + close(fd); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, sizes[run], mode, tag_check); + mte_clear_tags((void *)ptr, sizes[run]); + munmap((void *)map_ptr, map_size); + close(fd); + if (result == KSFT_FAIL) + break; + } + return result; +} + +static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping) +{ + char *ptr, *map_ptr; + int run, prot_flag, result, fd, map_size; + int total = sizeof(sizes)/sizeof(int); + + prot_flag = PROT_READ | PROT_WRITE; + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + for (run = 0; run 
< total; run++) { + map_size = sizes[run] + OVERFLOW + UNDERFLOW; + ptr = (char *)mte_allocate_memory_tag_range(sizes[run], mem_type, mapping, + UNDERFLOW, OVERFLOW); + if (check_allocated_memory_range(ptr, sizes[run], mem_type, + UNDERFLOW, OVERFLOW) != KSFT_PASS) + return KSFT_FAIL; + map_ptr = ptr - UNDERFLOW; + /* Try to clear PROT_MTE property and verify it by tag checking */ + if (mprotect(map_ptr, map_size, prot_flag)) { + mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, + UNDERFLOW, OVERFLOW); + ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n"); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, sizes[run], mode, TAG_CHECK_ON); + mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, UNDERFLOW, OVERFLOW); + if (result != KSFT_PASS) + return KSFT_FAIL; + + fd = create_temp_file(); + if (fd == -1) + return KSFT_FAIL; + ptr = (char *)mte_allocate_file_memory_tag_range(sizes[run], mem_type, mapping, + UNDERFLOW, OVERFLOW, fd); + if (check_allocated_memory_range(ptr, sizes[run], mem_type, + UNDERFLOW, OVERFLOW) != KSFT_PASS) { + close(fd); + return KSFT_FAIL; + } + map_ptr = ptr - UNDERFLOW; + /* Try to clear PROT_MTE property and verify it by tag checking */ + if (mprotect(map_ptr, map_size, prot_flag)) { + ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n"); + mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, + UNDERFLOW, OVERFLOW); + close(fd); + return KSFT_FAIL; + } + result = check_mte_memory(ptr, sizes[run], mode, TAG_CHECK_ON); + mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, UNDERFLOW, OVERFLOW); + close(fd); + if (result != KSFT_PASS) + return KSFT_FAIL; + } + return KSFT_PASS; +} + +int main(int argc, char *argv[]) +{ + int err; + int item = sizeof(sizes)/sizeof(int); + + err = mte_default_setup(); + if (err) + return err; + page_size = getpagesize(); + if (!page_size) { + ksft_print_msg("ERR: Unable to get page size\n"); + return KSFT_FAIL; + } + sizes[item - 3] = page_size - 1; + sizes[item - 2] = page_size; + sizes[item - 1] = page_size + 1; + + /* Register signal handlers */ + mte_register_signal(SIGBUS, mte_default_handler); + mte_register_signal(SIGSEGV, mte_default_handler); + + mte_enable_pstate_tco(); + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_OFF), + "Check anonymous memory with private mapping, sync error mode, mmap memory and tag check off\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_OFF), + "Check file memory with private mapping, sync error mode, mmap/mprotect memory and tag check off\n"); + + mte_disable_pstate_tco(); + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE, TAG_CHECK_OFF), + "Check anonymous memory with private mapping, no error mode, mmap memory and tag check off\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_NONE_ERR, MAP_PRIVATE, TAG_CHECK_OFF), + "Check file memory with private mapping, no error mode, mmap/mprotect memory and tag check off\n"); + + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check anonymous memory with private mapping, sync error mode, mmap memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check anonymous memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, 
MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check anonymous memory with shared mapping, sync error mode, mmap memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check anonymous memory with shared mapping, sync error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check anonymous memory with private mapping, async error mode, mmap memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check anonymous memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check anonymous memory with shared mapping, async error mode, mmap memory and tag check on\n"); + evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check anonymous memory with shared mapping, async error mode, mmap/mprotect memory and tag check on\n"); + + evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check file memory with private mapping, sync error mode, mmap memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check file memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check file memory with shared mapping, sync error mode, mmap memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check file memory with shared mapping, sync error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check file memory with private mapping, async error mode, mmap memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON), + "Check file memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check file memory with shared mapping, async error mode, mmap memory and tag check on\n"); + evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON), + "Check file memory with shared mapping, async error mode, mmap/mprotect memory and tag check on\n"); + + evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n"); + evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE), + "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n"); + + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; +} From f981d8fa26469e19d2ce73006685ae88205e914a Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:29 +0530 Subject: [PATCH 144/148] kselftest/arm64: Verify KSM page merge for MTE pages Add a testcase to check that KSM should not merge pages containing same data with same/different MTE tag values. 
This testcase has one positive tests and passes if page merging happens according to the above rule. It also saves and restores any modified ksm sysfs entries. Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-6-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + .../selftests/arm64/mte/check_ksm_options.c | 159 ++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_ksm_options.c diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 79a215d3bbd0..44e9bfdaeca6 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -2,3 +2,4 @@ check_buffer_fill check_tags_inclusion check_child_memory check_mmap_options +check_ksm_options diff --git a/tools/testing/selftests/arm64/mte/check_ksm_options.c b/tools/testing/selftests/arm64/mte/check_ksm_options.c new file mode 100644 index 000000000000..bc41ae630c86 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_ksm_options.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +#define TEST_UNIT 10 +#define PATH_KSM "/sys/kernel/mm/ksm/" +#define MAX_LOOP 4 + +static size_t page_sz; +static unsigned long ksm_sysfs[5]; + +static unsigned long read_sysfs(char *str) +{ + FILE *f; + unsigned long val = 0; + + f = fopen(str, "r"); + if (!f) { + ksft_print_msg("ERR: missing %s\n", str); + return 0; + } + fscanf(f, "%lu", &val); + fclose(f); + return val; +} + +static void write_sysfs(char *str, unsigned long val) +{ + FILE *f; + + f = fopen(str, "w"); + if (!f) { + ksft_print_msg("ERR: missing %s\n", str); + return; + } + fprintf(f, "%lu", val); + fclose(f); +} + +static void mte_ksm_setup(void) +{ + ksm_sysfs[0] = read_sysfs(PATH_KSM "merge_across_nodes"); + write_sysfs(PATH_KSM "merge_across_nodes", 1); + ksm_sysfs[1] = read_sysfs(PATH_KSM "sleep_millisecs"); + write_sysfs(PATH_KSM "sleep_millisecs", 0); + ksm_sysfs[2] = read_sysfs(PATH_KSM "run"); + write_sysfs(PATH_KSM "run", 1); + ksm_sysfs[3] = read_sysfs(PATH_KSM "max_page_sharing"); + write_sysfs(PATH_KSM "max_page_sharing", ksm_sysfs[3] + TEST_UNIT); + ksm_sysfs[4] = read_sysfs(PATH_KSM "pages_to_scan"); + write_sysfs(PATH_KSM "pages_to_scan", ksm_sysfs[4] + TEST_UNIT); +} + +static void mte_ksm_restore(void) +{ + write_sysfs(PATH_KSM "merge_across_nodes", ksm_sysfs[0]); + write_sysfs(PATH_KSM "sleep_millisecs", ksm_sysfs[1]); + write_sysfs(PATH_KSM "run", ksm_sysfs[2]); + write_sysfs(PATH_KSM "max_page_sharing", ksm_sysfs[3]); + write_sysfs(PATH_KSM "pages_to_scan", ksm_sysfs[4]); +} + +static void mte_ksm_scan(void) +{ + int cur_count = read_sysfs(PATH_KSM "full_scans"); + int scan_count = cur_count + 1; + int max_loop_count = MAX_LOOP; + + while ((cur_count < scan_count) && max_loop_count) { + sleep(1); + cur_count = read_sysfs(PATH_KSM "full_scans"); + max_loop_count--; + } +#ifdef DEBUG + ksft_print_msg("INFO: pages_shared=%lu pages_sharing=%lu\n", + read_sysfs(PATH_KSM "pages_shared"), + read_sysfs(PATH_KSM "pages_sharing")); +#endif +} + +static int check_madvise_options(int mem_type, int mode, int mapping) +{ + 
char *ptr; + int err, ret; + + err = KSFT_FAIL; + if (access(PATH_KSM, F_OK) == -1) { + ksft_print_msg("ERR: Kernel KSM config not enabled\n"); + return err; + } + + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + ptr = mte_allocate_memory(TEST_UNIT * page_sz, mem_type, mapping, true); + if (check_allocated_memory(ptr, TEST_UNIT * page_sz, mem_type, false) != KSFT_PASS) + return KSFT_FAIL; + + /* Insert same data in all the pages */ + memset(ptr, 'A', TEST_UNIT * page_sz); + ret = madvise(ptr, TEST_UNIT * page_sz, MADV_MERGEABLE); + if (ret) { + ksft_print_msg("ERR: madvise failed to set MADV_UNMERGEABLE\n"); + goto madvise_err; + } + mte_ksm_scan(); + /* Tagged pages should not merge */ + if ((read_sysfs(PATH_KSM "pages_shared") < 1) || + (read_sysfs(PATH_KSM "pages_sharing") < (TEST_UNIT - 1))) + err = KSFT_PASS; +madvise_err: + mte_free_memory(ptr, TEST_UNIT * page_sz, mem_type, true); + return err; +} + +int main(int argc, char *argv[]) +{ + int err; + + err = mte_default_setup(); + if (err) + return err; + page_sz = getpagesize(); + if (!page_sz) { + ksft_print_msg("ERR: Unable to get page size\n"); + return KSFT_FAIL; + } + /* Register signal handlers */ + mte_register_signal(SIGBUS, mte_default_handler); + mte_register_signal(SIGSEGV, mte_default_handler); + /* Enable KSM */ + mte_ksm_setup(); + + evaluate_test(check_madvise_options(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check KSM mte page merge for private mapping, sync mode and mmap memory\n"); + evaluate_test(check_madvise_options(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE), + "Check KSM mte page merge for private mapping, async mode and mmap memory\n"); + evaluate_test(check_madvise_options(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED), + "Check KSM mte page merge for shared mapping, sync mode and mmap memory\n"); + evaluate_test(check_madvise_options(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED), + "Check KSM mte page merge for shared mapping, async mode and mmap memory\n"); + + mte_ksm_restore(); + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; +} From 4dafc08d0ba4768e8540f49ab40c3ea26e40d554 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 2 Oct 2020 17:26:30 +0530 Subject: [PATCH 145/148] kselftest/arm64: Check mte tagged user address in kernel Add a testcase to check that user address with valid/invalid mte tag works in kernel mode. This test verifies that the kernel API's __arch_copy_from_user/__arch_copy_to_user works by considering if the user pointer has valid/invalid allocation tags. In MTE sync mode, file memory read/write and other similar interfaces fails if a user memory with invalid tag is accessed in kernel. In async mode no such failure occurs. 
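To make the description above concrete, here is a minimal sketch of the core check, assuming the mte_common_util.h helpers and the mte_def.h tag macros from earlier in this series are available (a sketch under those assumptions, not the actual test code):

#include <signal.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

#include "mte_common_util.h"
#include "mte_def.h"

/* Returns what read() reported when the kernel copied into a buffer whose
 * logical tag does not match its allocation tag. */
static ssize_t demo_uaccess_with_bad_tag(void)
{
	size_t i, len = 2 * MT_GRANULE_SIZE;
	unsigned long new_tag;
	uintptr_t addr;
	char *ptr, *bad;
	ssize_t n;
	int fd;

	if (mte_default_setup())
		return -1;
	mte_register_signal(SIGSEGV, mte_default_handler);
	mte_switch_mode(MTE_SYNC_ERR, MTE_ALLOW_NON_ZERO_TAG);

	fd = create_temp_file();
	for (i = 0; i < len; i++)
		write(fd, "A", 1);
	lseek(fd, 0, SEEK_SET);

	ptr = mte_allocate_memory(len, USE_MMAP, MAP_PRIVATE, true);
	/* Forge a logical tag that differs from the allocation tag. */
	addr = (uintptr_t)ptr;
	new_tag = (MT_FETCH_TAG(addr) + 1) & MT_TAG_MASK;
	bad = (char *)MT_SET_TAG(MT_CLEAR_TAG(addr), new_tag);

	/*
	 * In sync mode the kernel's user copy hits a tag check fault, so
	 * read() is expected to fail or return a short count; in async
	 * mode the copy completes and no synchronous error is reported.
	 */
	n = read(fd, bad, len);

	mte_free_memory(ptr, len, USE_MMAP, true);
	close(fd);
	mte_restore_setup();
	return n;
}

The test added by this patch wraps this pattern for private and shared mappings in both sync and async mode, passing only when sync mode reports the failed copy and async mode completes the full-length read.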
Signed-off-by: Amit Daniel Kachhap Tested-by: Catalin Marinas Acked-by: Catalin Marinas Cc: Shuah Khan Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/r/20201002115630.24683-7-amit.kachhap@arm.com Signed-off-by: Will Deacon --- tools/testing/selftests/arm64/mte/.gitignore | 1 + .../selftests/arm64/mte/check_user_mem.c | 111 ++++++++++++++++++ .../selftests/arm64/mte/mte_common_util.h | 1 + .../testing/selftests/arm64/mte/mte_helper.S | 14 +++ 4 files changed, 127 insertions(+) create mode 100644 tools/testing/selftests/arm64/mte/check_user_mem.c diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore index 44e9bfdaeca6..bc3ac63f3314 100644 --- a/tools/testing/selftests/arm64/mte/.gitignore +++ b/tools/testing/selftests/arm64/mte/.gitignore @@ -3,3 +3,4 @@ check_tags_inclusion check_child_memory check_mmap_options check_ksm_options +check_user_mem diff --git a/tools/testing/selftests/arm64/mte/check_user_mem.c b/tools/testing/selftests/arm64/mte/check_user_mem.c new file mode 100644 index 000000000000..594e98e76880 --- /dev/null +++ b/tools/testing/selftests/arm64/mte/check_user_mem.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2020 ARM Limited + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kselftest.h" +#include "mte_common_util.h" +#include "mte_def.h" + +static size_t page_sz; + +static int check_usermem_access_fault(int mem_type, int mode, int mapping) +{ + int fd, i, err; + char val = 'A'; + size_t len, read_len; + void *ptr, *ptr_next; + + err = KSFT_FAIL; + len = 2 * page_sz; + mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG); + fd = create_temp_file(); + if (fd == -1) + return KSFT_FAIL; + for (i = 0; i < len; i++) + write(fd, &val, sizeof(val)); + lseek(fd, 0, 0); + ptr = mte_allocate_memory(len, mem_type, mapping, true); + if (check_allocated_memory(ptr, len, mem_type, true) != KSFT_PASS) { + close(fd); + return KSFT_FAIL; + } + mte_initialize_current_context(mode, (uintptr_t)ptr, len); + /* Copy from file into buffer with valid tag */ + read_len = read(fd, ptr, len); + mte_wait_after_trig(); + if (cur_mte_cxt.fault_valid || read_len < len) + goto usermem_acc_err; + /* Verify same pattern is read */ + for (i = 0; i < len; i++) + if (*(char *)(ptr + i) != val) + break; + if (i < len) + goto usermem_acc_err; + + /* Tag the next half of memory with different value */ + ptr_next = (void *)((unsigned long)ptr + page_sz); + ptr_next = mte_insert_new_tag(ptr_next); + mte_set_tag_address_range(ptr_next, page_sz); + + lseek(fd, 0, 0); + /* Copy from file into buffer with invalid tag */ + read_len = read(fd, ptr, len); + mte_wait_after_trig(); + /* + * Accessing user memory in kernel with invalid tag should fail in sync + * mode without fault but may not fail in async mode as per the + * implemented MTE userspace support in Arm64 kernel. 
+ */ + if (mode == MTE_SYNC_ERR && + !cur_mte_cxt.fault_valid && read_len < len) { + err = KSFT_PASS; + } else if (mode == MTE_ASYNC_ERR && + !cur_mte_cxt.fault_valid && read_len == len) { + err = KSFT_PASS; + } +usermem_acc_err: + mte_free_memory((void *)ptr, len, mem_type, true); + close(fd); + return err; +} + +int main(int argc, char *argv[]) +{ + int err; + + page_sz = getpagesize(); + if (!page_sz) { + ksft_print_msg("ERR: Unable to get page size\n"); + return KSFT_FAIL; + } + err = mte_default_setup(); + if (err) + return err; + /* Register signal handlers */ + mte_register_signal(SIGSEGV, mte_default_handler); + + evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE), + "Check memory access from kernel in sync mode, private mapping and mmap memory\n"); + evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED), + "Check memory access from kernel in sync mode, shared mapping and mmap memory\n"); + + evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE), + "Check memory access from kernel in async mode, private mapping and mmap memory\n"); + evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED), + "Check memory access from kernel in async mode, shared mapping and mmap memory\n"); + + mte_restore_setup(); + ksft_print_cnts(); + return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL; +} diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h index 45160e061a0e..195a7d1879e6 100644 --- a/tools/testing/selftests/arm64/mte/mte_common_util.h +++ b/tools/testing/selftests/arm64/mte/mte_common_util.h @@ -64,6 +64,7 @@ int create_temp_file(void); /* Assembly MTE utility functions */ void *mte_insert_random_tag(void *ptr); +void *mte_insert_new_tag(void *ptr); void *mte_get_tag_address(void *ptr); void mte_set_tag_address_range(void *ptr, int range); void mte_clear_tag_address_range(void *ptr, int range); diff --git a/tools/testing/selftests/arm64/mte/mte_helper.S b/tools/testing/selftests/arm64/mte/mte_helper.S index 48e049fbad9a..a02c04cd0aac 100644 --- a/tools/testing/selftests/arm64/mte/mte_helper.S +++ b/tools/testing/selftests/arm64/mte/mte_helper.S @@ -26,6 +26,20 @@ ENTRY(mte_insert_random_tag) ret ENDPROC(mte_insert_random_tag) +/* + * mte_insert_new_tag: Insert new tag and different from the source tag if + * source pointer has it. + * Input: + * x0 - source pointer with a tag/no-tag + * Return: + * x0 - pointer with random tag + */ +ENTRY(mte_insert_new_tag) + gmi x1, x0, xzr + irg x0, x0, x1 + ret +ENDPROC(mte_insert_new_tag) + /* * mte_get_tag_address: Get the tag from given address. * Input: From 353e228eb355be5a65a3c0996c774a0f46737fda Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 5 Oct 2020 17:43:03 +0100 Subject: [PATCH 146/148] arm64: initialize per-cpu offsets earlier The current initialization of the per-cpu offset register is difficult to follow and this initialization is not always early enough for upcoming instrumentation with KCSAN, where the instrumentation callbacks use the per-cpu offset. To make it possible to support KCSAN, and to simplify reasoning about early bringup code, let's initialize the per-cpu offset earlier, before we run any C code that may consume it. To do so, this patch adds a new init_this_cpu_offset() helper that's called before the usual primary/secondary start functions. 
For consistency, this is also used to re-initialize the per-cpu offset after the runtime per-cpu areas have been allocated (which can change CPU0's offset). So that init_this_cpu_offset() isn't subject to any instrumentation that might consume the per-cpu offset, it is marked with noinstr, preventing instrumentation. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201005164303.21389-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu.h | 2 ++ arch/arm64/kernel/head.S | 3 +++ arch/arm64/kernel/setup.c | 12 ++++++------ arch/arm64/kernel/smp.c | 13 ++++++++----- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 7faae6ff3ab4..d9d60b18e811 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -68,4 +68,6 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info); void update_cpu_features(int cpu, struct cpuinfo_arm64 *info, struct cpuinfo_arm64 *boot); +void init_this_cpu_offset(void); + #endif /* __ASM_CPU_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 037421c66b14..2720e6ec6814 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -452,6 +452,8 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW + bl init_this_cpu_offset + #ifdef CONFIG_KASAN bl kasan_early_init #endif @@ -758,6 +760,7 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ptrauth_keys_init_cpu x2, x3, x4, x5 #endif + bl init_this_cpu_offset b secondary_start_kernel SYM_FUNC_END(__secondary_switched) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 77c4c9bad1b8..005171972764 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -87,12 +87,6 @@ void __init smp_setup_processor_id(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); - /* - * clear __my_cpu_offset on boot CPU to avoid hang caused by - * using percpu variable early, for example, lockdep will - * access percpu variable inside lock_release - */ - set_my_cpu_offset(0); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } @@ -282,6 +276,12 @@ u64 cpu_logical_map(int cpu) } EXPORT_SYMBOL_GPL(cpu_logical_map); +void noinstr init_this_cpu_offset(void) +{ + unsigned int cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); +} + void __init __no_sanitize_address setup_arch(char **cmdline_p) { init_mm.start_code = (unsigned long) _text; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 355ee9eed4dd..7714310fba22 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -192,10 +192,7 @@ asmlinkage notrace void secondary_start_kernel(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; const struct cpu_operations *ops; - unsigned int cpu; - - cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); + unsigned int cpu = smp_processor_id(); /* * All kernel threads share the same mm context; grab a @@ -435,7 +432,13 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); + /* + * Now that setup_per_cpu_areas() has allocated the runtime per-cpu + * areas it is only safe to read the CPU0 boot-time area, and we must + * reinitialize the offset to point to the runtime area. 
+ */ + init_this_cpu_offset(); + cpuinfo_store_boot_cpu(); /* From d433ab42fdc2c8a32e5df7d53833310f0ab9822c Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 6 Oct 2020 20:44:53 +0100 Subject: [PATCH 147/148] arm64: random: Remove no longer needed prototypes Commit 9bceb80b3cc4 ("arm64: kaslr: Use standard early random function") removed the direct calls of the __arm64_rndr() and __early_cpu_has_rndr() functions, but left the dummy prototypes in the #else branch of the #ifdef CONFIG_ARCH_RANDOM guard. Remove the redundant prototypes, as they have no users outside of this header file. Signed-off-by: Andre Przywara Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20201006194453.36519-1-andre.przywara@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/archrandom.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index 44209f6146aa..ffb1a40d5475 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -79,10 +79,5 @@ arch_get_random_seed_long_early(unsigned long *v) } #define arch_get_random_seed_long_early arch_get_random_seed_long_early -#else - -static inline bool __arm64_rndr(unsigned long *v) { return false; } -static inline bool __init __early_cpu_has_rndr(void) { return false; } - #endif /* CONFIG_ARCH_RANDOM */ #endif /* _ASM_ARCHRANDOM_H */ From d13027bb35e089bc1bb9f19c4976decf32a09b97 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 9 Oct 2020 11:24:17 +0100 Subject: [PATCH 148/148] Revert "arm64: initialize per-cpu offsets earlier" This reverts commit 353e228eb355be5a65a3c0996c774a0f46737fda. Qian Cai reports that TX2 no longer boots with his .config as it appears that task_cpu() gets instrumented and used before KASAN has been initialised. Although Mark has a proposed fix, let's take the safe option of reverting this for now and sorting it out properly later. 
Link: https://lore.kernel.org/r/711bc57a314d8d646b41307008db2845b7537b3d.camel@redhat.com Reported-by: Qian Cai Tested-by: Mark Rutland Signed-off-by: Will Deacon --- arch/arm64/include/asm/cpu.h | 2 -- arch/arm64/kernel/head.S | 3 --- arch/arm64/kernel/setup.c | 12 ++++++------ arch/arm64/kernel/smp.c | 13 +++++-------- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index d9d60b18e811..7faae6ff3ab4 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -68,6 +68,4 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info); void update_cpu_features(int cpu, struct cpuinfo_arm64 *info, struct cpuinfo_arm64 *boot); -void init_this_cpu_offset(void); - #endif /* __ASM_CPU_H */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e28c9d4e5278..d8d9caf02834 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -448,8 +448,6 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW - bl init_this_cpu_offset - #ifdef CONFIG_KASAN bl kasan_early_init #endif @@ -756,7 +754,6 @@ SYM_FUNC_START_LOCAL(__secondary_switched) ptrauth_keys_init_cpu x2, x3, x4, x5 #endif - bl init_this_cpu_offset b secondary_start_kernel SYM_FUNC_END(__secondary_switched) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 005171972764..77c4c9bad1b8 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -87,6 +87,12 @@ void __init smp_setup_processor_id(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; set_cpu_logical_map(0, mpidr); + /* + * clear __my_cpu_offset on boot CPU to avoid hang caused by + * using percpu variable early, for example, lockdep will + * access percpu variable inside lock_release + */ + set_my_cpu_offset(0); pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n", (unsigned long)mpidr, read_cpuid_id()); } @@ -276,12 +282,6 @@ u64 cpu_logical_map(int cpu) } EXPORT_SYMBOL_GPL(cpu_logical_map); -void noinstr init_this_cpu_offset(void) -{ - unsigned int cpu = task_cpu(current); - set_my_cpu_offset(per_cpu_offset(cpu)); -} - void __init __no_sanitize_address setup_arch(char **cmdline_p) { init_mm.start_code = (unsigned long) _text; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 7714310fba22..355ee9eed4dd 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -192,7 +192,10 @@ asmlinkage notrace void secondary_start_kernel(void) u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; const struct cpu_operations *ops; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -432,13 +435,7 @@ void __init smp_cpus_done(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { - /* - * Now that setup_per_cpu_areas() has allocated the runtime per-cpu - * areas it is only safe to read the CPU0 boot-time area, and we must - * reinitialize the offset to point to the runtime area. - */ - init_this_cpu_offset(); - + set_my_cpu_offset(per_cpu_offset(smp_processor_id())); cpuinfo_store_boot_cpu(); /*