426931e7e5
Patch series "Encapsulate PTE contents from non-arch code", v3. A series to improve the encapsulation of pte entries by disallowing non-arch code from directly dereferencing pte_t pointers. This means that by default, the accesses change from a C dereference to a READ_ONCE(). This is technically the correct thing to do since where pgtables are modified by HW (for access/dirty) they are volatile and therefore we should always ensure READ_ONCE() semantics. But more importantly, by always using the helper, it can be overridden by the architecture to fully encapsulate the contents of the pte. Arch code is deliberately not converted, as the arch code knows best. It is intended that arch code (arm64) will override the default with its own implementation that can (e.g.) hide certain bits from the core code, or determine young/dirty status by mixing in state from another source. This patch (of 3): The page table dumper uses walk_page_range_novma() to walk the page tables, which does not lock the PTL before calling the pte_entry() callback. Therefore, the page table dumper's callback must use ptep_get_lockless() rather than ptep_get() to ensure that the pte it reads is not torn or otherwise corrupt when racing with writers. Link: https://lkml.kernel.org/r/20230612151545.3317766-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20230612151545.3317766-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts <ryan.roberts@arm.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexander Potapenko <glider@google.com> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Alex Williamson <alex.williamson@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Andrey Konovalov <andreyknvl@gmail.com> Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com> Cc: Christian Brauner <brauner@kernel.org> Cc: Christoph Hellwig <hch@infradead.org> Cc: Daniel Vetter <daniel@ffwll.ch> Cc: Dave Airlie <airlied@gmail.com> Cc: Dimitri Sivanich <dimitri.sivanich@hpe.com> Cc: Dmitry Vyukov <dvyukov@google.com> Cc: Ian Rogers <irogers@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Lorenzo Stoakes <lstoakes@gmail.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Muchun Song <muchun.song@linux.dev> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi@nec.com> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: SeongJae Park <sj@kernel.org> Cc: Shakeel Butt <shakeelb@google.com> Cc: Uladzislau Rezki (Sony) <urezki@gmail.com> Cc: Vincenzo Frascino <vincenzo.frascino@arm.com> Cc: Yu Zhao <yuzhao@google.com> Cc: kernel test robot <lkp@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
166 lines
4.2 KiB
C
166 lines
4.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/pagewalk.h>
|
|
#include <linux/ptdump.h>
|
|
#include <linux/kasan.h>
|
|
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
/*
|
|
* This is an optimization for KASAN=y case. Since all kasan page tables
|
|
* eventually point to the kasan_early_shadow_page we could call note_page()
|
|
* right away without walking through lower level page tables. This saves
|
|
* us dozens of seconds (minutes for 5-level config) while checking for
|
|
* W+X mapping or reading kernel_page_tables debugfs file.
|
|
*/
|
|
static inline int note_kasan_page_table(struct mm_walk *walk,
|
|
unsigned long addr)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
|
|
st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
|
|
|
|
walk->action = ACTION_CONTINUE;
|
|
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pgd_t val = READ_ONCE(*pgd);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 4 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 0, pgd_val(val));
|
|
|
|
if (pgd_leaf(val)) {
|
|
st->note_page(st, addr, 0, pgd_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
p4d_t val = READ_ONCE(*p4d);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 3 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 1, p4d_val(val));
|
|
|
|
if (p4d_leaf(val)) {
|
|
st->note_page(st, addr, 1, p4d_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pud_t val = READ_ONCE(*pud);
|
|
|
|
#if CONFIG_PGTABLE_LEVELS > 2 && \
|
|
(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
|
|
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 2, pud_val(val));
|
|
|
|
if (pud_leaf(val)) {
|
|
st->note_page(st, addr, 2, pud_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pmd_t val = READ_ONCE(*pmd);
|
|
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
|
|
return note_kasan_page_table(walk, addr);
|
|
#endif
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 3, pmd_val(val));
|
|
if (pmd_leaf(val)) {
|
|
st->note_page(st, addr, 3, pmd_val(val));
|
|
walk->action = ACTION_CONTINUE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
|
|
unsigned long next, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
pte_t val = ptep_get_lockless(pte);
|
|
|
|
if (st->effective_prot)
|
|
st->effective_prot(st, 4, pte_val(val));
|
|
|
|
st->note_page(st, addr, 4, pte_val(val));
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int ptdump_hole(unsigned long addr, unsigned long next,
|
|
int depth, struct mm_walk *walk)
|
|
{
|
|
struct ptdump_state *st = walk->private;
|
|
|
|
st->note_page(st, addr, depth, 0);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static const struct mm_walk_ops ptdump_ops = {
|
|
.pgd_entry = ptdump_pgd_entry,
|
|
.p4d_entry = ptdump_p4d_entry,
|
|
.pud_entry = ptdump_pud_entry,
|
|
.pmd_entry = ptdump_pmd_entry,
|
|
.pte_entry = ptdump_pte_entry,
|
|
.pte_hole = ptdump_hole,
|
|
};
|
|
|
|
void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
|
|
{
|
|
const struct ptdump_range *range = st->range;
|
|
|
|
mmap_write_lock(mm);
|
|
while (range->start != range->end) {
|
|
walk_page_range_novma(mm, range->start, range->end,
|
|
&ptdump_ops, pgd, st);
|
|
range++;
|
|
}
|
|
mmap_write_unlock(mm);
|
|
|
|
/* Flush out the last page */
|
|
st->note_page(st, 0, -1, 0);
|
|
}
|