d8d55f5616
The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free(). Actually, It increase the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is holding through vmemmap_remap_free(), removing mmap write lock usage to make it does not affect other users of mmap read lock. It is not making anything worse and always a win to move. Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which use a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd. Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song <songmuchun@bytedance.com> Cc: Barry Song <song.bao.hua@hisilicon.com> Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com> Cc: Chen Huang <chenhuang5@huawei.com> Cc: David Hildenbrand <david@redhat.com> Cc: Fam Zheng <fam.zheng@bytedance.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Xiongchun Duan <duanxiongchun@bytedance.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
166 lines
4.2 KiB
C
166 lines
4.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/pagewalk.h>
|
|
#include <linux/ptdump.h>
|
|
#include <linux/kasan.h>
|
|
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
/*
|
|
* This is an optimization for KASAN=y case. Since all kasan page tables
|
|
* eventually point to the kasan_early_shadow_page we could call note_page()
|
|
* right away without walking through lower level page tables. This saves
|
|
* us dozens of seconds (minutes for 5-level config) while checking for
|
|
* W+X mapping or reading kernel_page_tables debugfs file.
|
|
*/
|
|
static inline int note_kasan_page_table(struct mm_walk *walk,
|
|
unsigned long addr)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
|
|
st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
|
|
|
|
walk->action = ACTION_CONTINUE;
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pgd_t val = READ_ONCE(*pgd);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 4 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 0, pgd_val(val));
|
|
|
|
if (pgd_leaf(val)) {
|
|
st->note_page(st, addr, 0, pgd_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
p4d_t val = READ_ONCE(*p4d);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 3 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 1, p4d_val(val));
|
|
|
|
if (p4d_leaf(val)) {
|
|
st->note_page(st, addr, 1, p4d_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pud_t val = READ_ONCE(*pud);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 2 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 2, pud_val(val));
|
|
|
|
if (pud_leaf(val)) {
|
|
st->note_page(st, addr, 2, pud_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pmd_t val = READ_ONCE(*pmd);
|
|
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 3, pmd_val(val));
|
|
if (pmd_leaf(val)) {
|
|
st->note_page(st, addr, 3, pmd_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pte_t val = ptep_get(pte);
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 4, pte_val(val));
|
|
|
|
st->note_page(st, addr, 4, pte_val(val));
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_hole(unsigned long addr, unsigned long next,
|
|
int depth, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
|
|
st->note_page(st, addr, depth, 0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct mm_walk_ops ptdump_ops = {
|
|
.pgd_entry = ptdump_pgd_entry,
|
|
.p4d_entry = ptdump_p4d_entry,
|
|
.pud_entry = ptdump_pud_entry,
|
|
.pmd_entry = ptdump_pmd_entry,
|
|
.pte_entry = ptdump_pte_entry,
|
|
.pte_hole = ptdump_hole,
|
|
};
|
|
|
|
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
|
|
{
|
|
const struct ptdump_range *range = st->range;
|
|
|
|
mmap_read_lock(mm);
|
|
while (range->start != range->end) {
|
|
walk_page_range_novma(mm, range->start, range->end,
|
|
&ptdump_ops, pgd, st);
|
|
range++;
|
|
}
|
|
mmap_read_unlock(mm);
|
|
|
|
/* Flush out the last page */
|
|
st->note_page(st, 0, -1, 0);
|
|
}
|