mm/gup: handle hugepd for follow_page()
Hugepd is only used in PowerPC so far on 4K page size kernels where hash mmu is used.  follow_page_mask() used to leverage hugetlb APIs to access hugepd entries.  Teach follow_page_mask() itself to handle hugepd entries.

With the previous refactors on the fast-gup gup_huge_pd(), most of the code can be leveraged.  There are a few things not needed for follow page; for example, gup_hugepte() tries to detect pgtable entry changes, which will never happen with slow gup (which holds the pgtable lock), but it does no harm to keep the check.

Since follow_page() only ever fetches one page, setting the end to "address + PAGE_SIZE" should suffice.  We will still do the pgtable walk once for each hugetlb page by setting ctx->page_mask properly.

One thing worth mentioning is that some levels of pgtable _bad() helpers will report is_hugepd() entries as TRUE on Power8 hash MMUs.  It at least applies to the PUD level on Power8 with 4K pgsize, meaning that feeding a hugepd entry to pud_bad() will report a false positive.  Let's leave that for now because it can be arch-specific and I am reluctant to touch it.  In this patch it's not a problem as long as hugepd is detected before any bad pgtable entries.

To allow slow gup like follow_*_page() to access the hugepd helpers, the hugepd code is moved to the top of the file.  Besides that, the helper record_subpages() is now used by either hugepd or fast-gup, so to avoid "unused function" warnings we unfortunately have to wrap it in an "#ifdef".

Link: https://lkml.kernel.org/r/20240327152332.950956-13-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 4418c522f6
commit a12083d721

 mm/gup.c | 269
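As the commit message notes, follow_hugepd() only walks "address + PAGE_SIZE" but reports the whole huge page back through ctx->page_mask.  A minimal user-space sketch of that subpage/page_mask arithmetic follows; the 16M hugepd leaf size, the sample address, and the standalone main() are illustrative assumptions, not kernel code:

#include <stdio.h>

#define PAGE_SHIFT	12

int main(void)
{
	unsigned long sz = 16UL << 20;		/* assumed 16M hugepd leaf size */
	unsigned long addr = 0x10123000UL;	/* assumed faulting user address */

	/* record_subpages(): index of the 4K subpage inside the huge page */
	unsigned long subpage = (addr & (sz - 1)) >> PAGE_SHIFT;

	/*
	 * follow_hugepd() sets ctx->page_mask = (1U << huge_page_order(h)) - 1;
	 * here the order is derived from sz for illustration.  The caller can
	 * then reuse the result for the remaining subpages of the same huge
	 * page without another page table walk.
	 */
	unsigned int page_mask = (1U << (__builtin_ctzl(sz) - PAGE_SHIFT)) - 1;

	printf("subpage index %lu, page_mask %#x\n", subpage, page_mask);
	return 0;
}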
@@ -500,6 +500,149 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 }
 
 #ifdef CONFIG_MMU
+
+#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP)
+static int record_subpages(struct page *page, unsigned long sz,
+			   unsigned long addr, unsigned long end,
+			   struct page **pages)
+{
+	struct page *start_page;
+	int nr;
+
+	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
+	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+		pages[nr] = nth_page(start_page, nr);
+
+	return nr;
+}
+#endif	/* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, unsigned int flags,
+		       struct page **pages, int *nr)
+{
+	unsigned long pte_end;
+	struct page *page;
+	struct folio *folio;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;
+
+	pte = huge_ptep_get(ptep);
+
+	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	page = pte_page(pte);
+	refs = record_subpages(page, sz, addr, end, pages + *nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;
+	folio_set_referenced(folio);
+	return 1;
+}
+
+/*
+ * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
+ * systems on Power, which does not have issue with folio writeback against
+ * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
+ * even anonymous memory, we need to do extra check as what we do with most
+ * of the other folios. See writable_file_mapping_allowed() and
+ * gup_fast_folio_allowed() for more information.
+ */
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		       unsigned int pdshift, unsigned long end, unsigned int flags,
+		       struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+				  unsigned long addr, unsigned int pdshift,
+				  unsigned int flags,
+				  struct follow_page_context *ctx)
+{
+	struct page *page;
+	struct hstate *h;
+	spinlock_t *ptl;
+	int nr = 0, ret;
+	pte_t *ptep;
+
+	/* Only hugetlb supports hugepd */
+	if (WARN_ON_ONCE(!is_vm_hugetlb_page(vma)))
+		return ERR_PTR(-EFAULT);
+
+	h = hstate_vma(vma);
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	ptl = huge_pte_lock(h, vma->vm_mm, ptep);
+	ret = gup_huge_pd(hugepd, addr, pdshift, addr + PAGE_SIZE,
+			  flags, &page, &nr);
+	spin_unlock(ptl);
+
+	if (ret) {
+		WARN_ON_ONCE(nr != 1);
+		ctx->page_mask = (1U << huge_page_order(h)) - 1;
+		return page;
+	}
+
+	return NULL;
+}
+#else /* CONFIG_ARCH_HAS_HUGEPD */
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, unsigned int flags,
+		struct page **pages, int *nr)
+{
+	return 0;
+}
+
+static struct page *follow_hugepd(struct vm_area_struct *vma, hugepd_t hugepd,
+				  unsigned long addr, unsigned int pdshift,
+				  unsigned int flags,
+				  struct follow_page_context *ctx)
+{
+	return NULL;
+}
+#endif	/* CONFIG_ARCH_HAS_HUGEPD */
+
+
 static struct page *no_page_table(struct vm_area_struct *vma,
 		unsigned int flags, unsigned long address)
 {
@@ -868,6 +1011,9 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 		return no_page_table(vma, flags, address);
 	if (!pmd_present(pmdval))
 		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pmd_val(pmdval)))))
+		return follow_hugepd(vma, __hugepd(pmd_val(pmdval)),
+				     address, PMD_SHIFT, flags, ctx);
 	if (pmd_devmap(pmdval)) {
 		ptl = pmd_lock(mm, pmd);
 		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
@@ -918,6 +1064,9 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	pud = READ_ONCE(*pudp);
 	if (!pud_present(pud))
 		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pud_val(pud)))))
+		return follow_hugepd(vma, __hugepd(pud_val(pud)),
+				     address, PUD_SHIFT, flags, ctx);
 	if (pud_leaf(pud)) {
 		ptl = pud_lock(mm, pudp);
 		page = follow_huge_pud(vma, address, pudp, flags, ctx);
@@ -941,10 +1090,13 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 
 	p4dp = p4d_offset(pgdp, address);
 	p4d = READ_ONCE(*p4dp);
-	if (!p4d_present(p4d))
-		return no_page_table(vma, flags, address);
 	BUILD_BUG_ON(p4d_leaf(p4d));
-	if (unlikely(p4d_bad(p4d)))
+
+	if (unlikely(is_hugepd(__hugepd(p4d_val(p4d)))))
+		return follow_hugepd(vma, __hugepd(p4d_val(p4d)),
+				     address, P4D_SHIFT, flags, ctx);
+
+	if (!p4d_present(p4d) || p4d_bad(p4d))
 		return no_page_table(vma, flags, address);
 
 	return follow_pud_mask(vma, address, p4dp, flags, ctx);
@@ -994,10 +1146,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 
 	pgd = pgd_offset(mm, address);
 
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return no_page_table(vma, flags, address);
+	if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
+		page = follow_hugepd(vma, __hugepd(pgd_val(*pgd)),
+				     address, PGDIR_SHIFT, flags, ctx);
+	else if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		page = no_page_table(vma, flags, address);
+	else
+		page = follow_p4d_mask(vma, address, pgd, flags, ctx);
 
-	return follow_p4d_mask(vma, address, pgd, flags, ctx);
+	return page;
 }
 
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -2954,106 +3111,6 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
 }
 #endif
 
-static int record_subpages(struct page *page, unsigned long sz,
-			   unsigned long addr, unsigned long end,
-			   struct page **pages)
-{
-	struct page *start_page;
-	int nr;
-
-	start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
-	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
-		pages[nr] = nth_page(start_page, nr);
-
-	return nr;
-}
-
-#ifdef CONFIG_ARCH_HAS_HUGEPD
-static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
-				      unsigned long sz)
-{
-	unsigned long __boundary = (addr + sz) & ~(sz-1);
-	return (__boundary - 1 < end - 1) ? __boundary : end;
-}
-
-static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
-		       unsigned long end, unsigned int flags,
-		       struct page **pages, int *nr)
-{
-	unsigned long pte_end;
-	struct page *page;
-	struct folio *folio;
-	pte_t pte;
-	int refs;
-
-	pte_end = (addr + sz) & ~(sz-1);
-	if (pte_end < end)
-		end = pte_end;
-
-	pte = huge_ptep_get(ptep);
-
-	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
-		return 0;
-
-	/* hugepages are never "special" */
-	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-	page = pte_page(pte);
-	refs = record_subpages(page, sz, addr, end, pages + *nr);
-
-	folio = try_grab_folio(page, refs, flags);
-	if (!folio)
-		return 0;
-
-	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
-		gup_put_folio(folio, refs, flags);
-		return 0;
-	}
-
-	if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
-		gup_put_folio(folio, refs, flags);
-		return 0;
-	}
-
-	*nr += refs;
-	folio_set_referenced(folio);
-	return 1;
-}
-
-/*
- * NOTE: currently GUP for a hugepd is only possible on hugetlbfs file
- * systems on Power, which does not have issue with folio writeback against
- * GUP updates.  When hugepd will be extended to support non-hugetlbfs or
- * even anonymous memory, we need to do extra check as what we do with most
- * of the other folios. See writable_file_mapping_allowed() and
- * gup_fast_folio_allowed() for more information.
- */
-static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-		unsigned int pdshift, unsigned long end, unsigned int flags,
-		struct page **pages, int *nr)
-{
-	pte_t *ptep;
-	unsigned long sz = 1UL << hugepd_shift(hugepd);
-	unsigned long next;
-
-	ptep = hugepte_offset(hugepd, addr, pdshift);
-	do {
-		next = hugepte_addr_end(addr, end, sz);
-		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
-			return 0;
-	} while (ptep++, addr = next, addr != end);
-
-	return 1;
-}
-#else
-static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
-		unsigned int pdshift, unsigned long end, unsigned int flags,
-		struct page **pages, int *nr)
-{
-	return 0;
-}
-#endif /* CONFIG_ARCH_HAS_HUGEPD */
-
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 			unsigned long end, unsigned int flags,
 			struct page **pages, int *nr)
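For reference, the boundary clamping done by hugepte_addr_end() and the gup_huge_pd() loop can be exercised stand-alone.  This sketch mirrors the helper in the diff above; the 16M leaf size and the sample addresses are illustrative assumptions only:

#include <stdio.h>

/*
 * Same arithmetic as the kernel helper: advance to the next sz-aligned
 * boundary, or stop at 'end' if that comes first.  The "- 1" keeps the
 * comparison correct if addr + sz wraps past the top of the address space.
 */
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz - 1);

	return (__boundary - 1 < end - 1) ? __boundary : end;
}

int main(void)
{
	unsigned long sz = 16UL << 20;		/* assumed 16M hugepd leaf */
	unsigned long addr = 0x10001000UL;	/* somewhere inside that leaf */
	unsigned long end = addr + 4096;	/* follow_page(): one PAGE_SIZE */

	/*
	 * With end capped at addr + PAGE_SIZE, the do/while loop in
	 * gup_huge_pd() runs exactly once and records a single subpage.
	 */
	printf("walk stops at %#lx\n", hugepte_addr_end(addr, end, sz));
	return 0;
}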