linux/arch/powerpc/mm/book3s64/hugetlbpage.c
Aneesh Kumar K.V 2354ad252b powerpc/mm: Update default hugetlb size early
commit: d9c234005227 ("Do not depend on MAX_ORDER when grouping pages by mobility")
introduced pageblock_order which will be used to group pages better.
The kernel now groups pages based on the value of HPAGE_SHIFT. Hence HPAGE_SHIFT
should be set before we call set_pageblock_order.

set_pageblock_order happens early in the boot and default hugetlb page size
should be initialized before that to compute the right pageblock_order value.

Currently, default hugetlbe page size is set via arch_initcalls which happens
late in the boot as shown via the below callstack:

[c000000007383b10] [c000000001289328] hugetlbpage_init+0x2b8/0x2f8
[c000000007383bc0] [c0000000012749e4] do_one_initcall+0x14c/0x320
[c000000007383c90] [c00000000127505c] kernel_init_freeable+0x410/0x4e8
[c000000007383da0] [c000000000012664] kernel_init+0x30/0x15c
[c000000007383e10] [c00000000000cf14] ret_from_kernel_thread+0x5c/0x64

and the pageblock_order initialization is done early during the boot.

[c0000000018bfc80] [c0000000012ae120] set_pageblock_order+0x50/0x64
[c0000000018bfca0] [c0000000012b3d94] sparse_init+0x188/0x268
[c0000000018bfd60] [c000000001288bfc] initmem_init+0x28c/0x328
[c0000000018bfe50] [c00000000127b370] setup_arch+0x410/0x480
[c0000000018bfed0] [c00000000127401c] start_kernel+0xb8/0x934
[c0000000018bff90] [c00000000000d984] start_here_common+0x1c/0x98

delaying default hugetlb page size initialization implies the kernel will
initialize pageblock_order to (MAX_ORDER - 1) which is not an optimal
value for mobility grouping. IIUC we always had this issue. But it was not
a problem for hash translation mode because (MAX_ORDER - 1) is the same as
HUGETLB_PAGE_ORDER (8) in the case of hash (16MB). With radix,
HUGETLB_PAGE_ORDER will be 5 (2M size) and hence pageblock_order should be
5 instead of 8.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220211065215.101767-1-aneesh.kumar@linux.ibm.com
2022-02-12 22:47:44 +11:00

165 lines
4.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
*
* Copyright (C) 2003 David Gibson, IBM Corporation.
*
* Based on the IA-32 version:
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <asm/cacheflush.h>
#include <asm/machdep.h>
unsigned int hpage_shift;
EXPORT_SYMBOL(hpage_shift);
#ifdef CONFIG_PPC_64S_HASH_MMU
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
pte_t *ptep, unsigned long trap, unsigned long flags,
int ssize, unsigned int shift, unsigned int mmu_psize)
{
real_pte_t rpte;
unsigned long vpn;
unsigned long old_pte, new_pte;
unsigned long rflags, pa;
long slot, offset;
BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
/* Search the Linux page table for a match with va */
vpn = hpt_vpn(ea, vsid, ssize);
/*
* At this point, we have a pte (old_pte) which can be used to build
* or update an HPTE. There are 2 cases:
*
* 1. There is a valid (present) pte with no associated HPTE (this is
* the most common case)
* 2. There is a valid (present) pte with an associated HPTE. The
* current values of the pp bits in the HPTE prevent access
* because we are doing software DIRTY bit management and the
* page is currently not DIRTY.
*/
do {
old_pte = pte_val(*ptep);
/* If PTE busy, retry the access */
if (unlikely(old_pte & H_PAGE_BUSY))
return 0;
/* If PTE permissions don't match, take page fault */
if (unlikely(!check_pte_access(access, old_pte)))
return 1;
/*
* Try to lock the PTE, add ACCESSED and DIRTY if it was
* a write access
*/
new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
if (access & _PAGE_WRITE)
new_pte |= _PAGE_DIRTY;
} while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
/* Make sure this is a hugetlb entry */
if (old_pte & (H_PAGE_THP_HUGE | _PAGE_DEVMAP))
return 0;
rflags = htab_convert_pte_flags(new_pte, flags);
if (unlikely(mmu_psize == MMU_PAGE_16G))
offset = PTRS_PER_PUD;
else
offset = PTRS_PER_PMD;
rpte = __real_pte(__pte(old_pte), ptep, offset);
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
/*
* No CPU has hugepages but lacks no execute, so we
* don't need to worry about that case
*/
rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
/* Check if pte already has an hpte (case 2) */
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
/* There MIGHT be an HPTE for this pte */
unsigned long gslot;
gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
mmu_psize, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
if (likely(!(old_pte & H_PAGE_HASHPTE))) {
unsigned long hash = hpt_hash(vpn, shift, ssize);
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
/* clear HPTE slot informations in new PTE */
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
mmu_psize, ssize);
/*
* Hypervisor failure. Restore old pte and return -1
* similar to __hash_page_*
*/
if (unlikely(slot == -2)) {
*ptep = __pte(old_pte);
hash_failure_debug(ea, access, vsid, trap, ssize,
mmu_psize, mmu_psize, old_pte);
return -1;
}
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, offset);
}
/*
* No need to use ldarx/stdcx here
*/
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
}
#endif
pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
unsigned long pte_val;
/*
* Clear the _PAGE_PRESENT so that no hardware parallel update is
* possible. Also keep the pte_present true so that we don't take
* wrong fault.
*/
pte_val = pte_update(vma->vm_mm, addr, ptep,
_PAGE_PRESENT, _PAGE_INVALID, 1);
return __pte(pte_val);
}
void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
if (radix_enabled())
return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
old_pte, pte);
set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
}
void __init hugetlbpage_init_defaultsize(void)
{
/* Set default large page size. Currently, we pick 16M or 1M
* depending on what is available
*/
if (mmu_psize_defs[MMU_PAGE_16M].shift)
hpage_shift = mmu_psize_defs[MMU_PAGE_16M].shift;
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
hpage_shift = mmu_psize_defs[MMU_PAGE_1M].shift;
else if (mmu_psize_defs[MMU_PAGE_2M].shift)
hpage_shift = mmu_psize_defs[MMU_PAGE_2M].shift;
}