From 6569fc12e442ea973d96db39e542aa19a7bc3a79 Mon Sep 17 00:00:00 2001 From: Hsieh-Tseng Shen Date: Tue, 25 Apr 2023 18:28:28 +0800 Subject: [PATCH 1/4] riscv: mm: Ensure prot of VM_WRITE and VM_EXEC must be readable Commit 8aeb7b17f04e ("RISC-V: Make mmap() with PROT_WRITE imply PROT_READ") allows riscv to use mmap with PROT_WRITE only, and meanwhile mmap with w+x is also permitted. However, when userspace tries to access this page with PROT_WRITE|PROT_EXEC, which causes infinite loop at load page fault as well as it triggers soft lockup. According to riscv privileged spec, "Writable pages must also be marked readable". The fix to drop the `PAGE_COPY_READ_EXEC` and then `PAGE_COPY_EXEC` would be just used instead. This aligns the other arches (i.e arm64) for protection_map. Fixes: 8aeb7b17f04e ("RISC-V: Make mmap() with PROT_WRITE imply PROT_READ") Signed-off-by: Hsieh-Tseng Shen Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230425102828.1616812-1-woodrow.shen@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 3 +-- arch/riscv/mm/init.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2258b27173b0..75970ee2bda2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -165,8 +165,7 @@ extern struct pt_alloc_ops pt_ops __initdata; _PAGE_EXEC | _PAGE_WRITE) #define PAGE_COPY PAGE_READ -#define PAGE_COPY_EXEC PAGE_EXEC -#define PAGE_COPY_READ_EXEC PAGE_READ_EXEC +#define PAGE_COPY_EXEC PAGE_READ_EXEC #define PAGE_SHARED PAGE_WRITE #define PAGE_SHARED_EXEC PAGE_WRITE_EXEC diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c6bb966e4123..0fe75c9713f8 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -293,7 +293,7 @@ static const pgprot_t protection_map[16] = { [VM_EXEC] = PAGE_EXEC, [VM_EXEC | VM_READ] = PAGE_READ_EXEC, [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, - [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_READ_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, [VM_SHARED] = PAGE_NONE, [VM_SHARED | VM_READ] = PAGE_READ, [VM_SHARED | VM_WRITE] = PAGE_SHARED, From 25abe0db92437fde463204c3e4eb5a71b62d6e5d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 6 Jun 2023 15:04:44 +0200 Subject: [PATCH 2/4] riscv: Fix kfence now that the linear mapping can be backed by PUD/P4D/PGD RISC-V Kfence implementation used to rely on the fact the linear mapping was backed by at most PMD hugepages, which is not true anymore since commit 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping"). Instead of splitting PUD/P4D/PGD mappings afterwards, directly map the kfence pool region using PTE mappings by allocating this region before setup_vm_final(). Reported-by: syzbot+a74d57bddabbedd75135@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a74d57bddabbedd75135 Fixes: 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230606130444.25090-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kfence.h | 33 ------------------------------- arch/riscv/mm/init.c | 35 ++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 38 deletions(-) diff --git a/arch/riscv/include/asm/kfence.h b/arch/riscv/include/asm/kfence.h index d887a54042aa..0bbffd528096 100644 --- a/arch/riscv/include/asm/kfence.h +++ b/arch/riscv/include/asm/kfence.h @@ -8,41 +8,8 @@ #include #include -static inline int split_pmd_page(unsigned long addr) -{ - int i; - unsigned long pfn = PFN_DOWN(__pa((addr & PMD_MASK))); - pmd_t *pmd = pmd_off_k(addr); - pte_t *pte = pte_alloc_one_kernel(&init_mm); - - if (!pte) - return -ENOMEM; - - for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(pte + i, pfn_pte(pfn + i, PAGE_KERNEL)); - set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(pte)), PAGE_TABLE)); - - flush_tlb_kernel_range(addr, addr + PMD_SIZE); - return 0; -} - static inline bool arch_kfence_init_pool(void) { - int ret; - unsigned long addr; - pmd_t *pmd; - - for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr); - addr += PAGE_SIZE) { - pmd = pmd_off_k(addr); - - if (pmd_leaf(*pmd)) { - ret = split_pmd_page(addr); - if (ret) - return false; - } - } - return true; } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0fe75c9713f8..38c4b4d6b64f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -23,6 +23,7 @@ #ifdef CONFIG_RELOCATABLE #include #endif +#include #include #include @@ -1167,14 +1168,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) } static void __init create_linear_mapping_range(phys_addr_t start, - phys_addr_t end) + phys_addr_t end, + uintptr_t fixed_map_size) { phys_addr_t pa; uintptr_t va, map_size; for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); - map_size = best_map_size(pa, end - pa); + map_size = fixed_map_size ? fixed_map_size : + best_map_size(pa, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, pgprot_from_va(va)); @@ -1184,6 +1187,7 @@ static void __init create_linear_mapping_range(phys_addr_t start, static void __init create_linear_mapping_page_table(void) { phys_addr_t start, end; + phys_addr_t kfence_pool __maybe_unused; u64 i; #ifdef CONFIG_STRICT_KERNEL_RWX @@ -1197,6 +1201,19 @@ static void __init create_linear_mapping_page_table(void) memblock_mark_nomap(krodata_start, krodata_size); #endif +#ifdef CONFIG_KFENCE + /* + * kfence pool must be backed by PAGE_SIZE mappings, so allocate it + * before we setup the linear mapping so that we avoid using hugepages + * for this region. + */ + kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + BUG_ON(!kfence_pool); + + memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = __va(kfence_pool); +#endif + /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { if (start >= end) @@ -1207,17 +1224,25 @@ static void __init create_linear_mapping_page_table(void) if (end >= __pa(PAGE_OFFSET) + memory_limit) end = __pa(PAGE_OFFSET) + memory_limit; - create_linear_mapping_range(start, end); + create_linear_mapping_range(start, end, 0); } #ifdef CONFIG_STRICT_KERNEL_RWX - create_linear_mapping_range(ktext_start, ktext_start + ktext_size); + create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0); create_linear_mapping_range(krodata_start, - krodata_start + krodata_size); + krodata_start + krodata_size, 0); memblock_clear_nomap(ktext_start, ktext_size); memblock_clear_nomap(krodata_start, krodata_size); #endif + +#ifdef CONFIG_KFENCE + create_linear_mapping_range(kfence_pool, + kfence_pool + KFENCE_POOL_SIZE, + PAGE_SIZE); + + memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); +#endif } static void __init setup_vm_final(void) From 49a0a3731596fc004db6eec3fc674d92a09ef383 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 7 Jun 2023 14:58:51 +0200 Subject: [PATCH 3/4] riscv: Check the virtual alignment before choosing a map size We used to only check the alignment of the physical address to decide which mapping would fit for a certain region of the linear mapping, but it is not enough since the virtual address must also be aligned, so check that too. Fixes: 3335068f8721 ("riscv: Use PUD/P4D/PGD pages for the linear mapping") Reported-by: Song Shuai Link: https://lore.kernel.org/linux-riscv/tencent_7C3B580B47C1B17C16488EC1@qq.com/ Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230607125851.63370-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 38c4b4d6b64f..4fa420faa780 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -660,18 +660,19 @@ void __init create_pgd_mapping(pgd_t *pgdp, create_pgd_next_mapping(nextp, va, pa, sz, prot); } -static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) +static uintptr_t __init best_map_size(phys_addr_t pa, uintptr_t va, + phys_addr_t size) { - if (!(base & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE) + if (!(pa & (PGDIR_SIZE - 1)) && !(va & (PGDIR_SIZE - 1)) && size >= PGDIR_SIZE) return PGDIR_SIZE; - if (!(base & (P4D_SIZE - 1)) && size >= P4D_SIZE) + if (!(pa & (P4D_SIZE - 1)) && !(va & (P4D_SIZE - 1)) && size >= P4D_SIZE) return P4D_SIZE; - if (!(base & (PUD_SIZE - 1)) && size >= PUD_SIZE) + if (!(pa & (PUD_SIZE - 1)) && !(va & (PUD_SIZE - 1)) && size >= PUD_SIZE) return PUD_SIZE; - if (!(base & (PMD_SIZE - 1)) && size >= PMD_SIZE) + if (!(pa & (PMD_SIZE - 1)) && !(va & (PMD_SIZE - 1)) && size >= PMD_SIZE) return PMD_SIZE; return PAGE_SIZE; @@ -1177,7 +1178,7 @@ static void __init create_linear_mapping_range(phys_addr_t start, for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); map_size = fixed_map_size ? fixed_map_size : - best_map_size(pa, end - pa); + best_map_size(pa, va, end - pa); create_pgd_mapping(swapper_pg_dir, va, pa, map_size, pgprot_from_va(va)); From 99a670b2069c725a7b50318aa681d9cae8f89325 Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Thu, 4 May 2023 15:29:10 +0800 Subject: [PATCH 4/4] riscv: fix kprobe __user string arg print fault issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On riscv qemu platform, when add kprobe event on do_sys_open() to show filename string arg, it just print fault as follow: echo 'p:myprobe do_sys_open dfd=$arg1 filename=+0($arg2):string flags=$arg3 mode=$arg4' > kprobe_events bash-166 [000] ...1. 360.195367: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename=(fault) flags=0x8241 mode=0x1b6 bash-166 [000] ...1. 360.219369: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename=(fault) flags=0x8241 mode=0x1b6 bash-191 [000] ...1. 360.378827: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename=(fault) flags=0x98800 mode=0x0 As riscv do not select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, the +0($arg2) addr is processed as a kernel address though it is a userspace address, cause the above filename=(fault) print. So select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE to avoid the issue, after that the kprobe trace is ok as below: bash-166 [000] ...1. 96.767641: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename="/dev/null" flags=0x8241 mode=0x1b6 bash-166 [000] ...1. 96.793751: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename="/dev/null" flags=0x8241 mode=0x1b6 bash-177 [000] ...1. 96.962354: myprobe: (do_sys_open+0x0/0x84) dfd=0xffffffffffffff9c filename="/sys/kernel/debug/tracing/events/kprobes/" flags=0x98800 mode=0x0 Signed-off-by: Ruan Jinjie Acked-by: Björn Töpel Fixes: 0ebeea8ca8a4 ("bpf: Restrict bpf_probe_read{, str}() only to archs where they work") Link: https://lore.kernel.org/r/20230504072910.3742842-1-ruanjinjie@huawei.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 2bb0c38419ff..5966ad97c30c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -26,6 +26,7 @@ config RISCV select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_MMIOWB + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU