4ad0a4c234
- Add HOTPLUG_SMT support (/sys/devices/system/cpu/smt) and honour the configured SMT state when hotplugging CPUs into the system. - Combine final TLB flush and lazy TLB mm shootdown IPIs when using the Radix MMU to avoid a broadcast TLBIE flush on exit. - Drop the exclusion between ptrace/perf watchpoints, and drop the now unused associated arch hooks. - Add support for the "nohlt" command line option to disable CPU idle. - Add support for -fpatchable-function-entry for ftrace, with GCC >= 13.1. - Rework memory block size determination, and support 256MB size on systems with GPUs that have hotpluggable memory. - Various other small features and fixes. Thanks to: Andrew Donnellan, Aneesh Kumar K.V, Arnd Bergmann, Athira Rajeev, Benjamin Gray, Christophe Leroy, Frederic Barrat, Gautam Menghani, Geoff Levand, Hari Bathini, Immad Mir, Jialin Zhang, Joel Stanley, Jordan Niethe, Justin Stitt, Kajol Jain, Kees Cook, Krzysztof Kozlowski, Laurent Dufour, Liang He, Linus Walleij, Mahesh Salgaonkar, Masahiro Yamada, Michal Suchanek, Nageswara R Sastry, Nathan Chancellor, Nathan Lynch, Naveen N Rao, Nicholas Piggin, Nick Desaulniers, Omar Sandoval, Randy Dunlap, Reza Arbab, Rob Herring, Russell Currey, Sourabh Jain, Thomas Gleixner, Trevor Woerner, Uwe Kleine-König, Vaibhav Jain, Xiongfeng Wang, Yuan Tan, Zhang Rui, Zheng Zengkai. 
-----BEGIN PGP SIGNATURE----- iQJHBAABCAAxFiEEJFGtCPCthwEv2Y/bUevqPMjhpYAFAmTwgbwTHG1wZUBlbGxl cm1hbi5pZC5hdQAKCRBR6+o8yOGlgFmpD/432vipeoqvkAYsyK0xi/Y3GcY0wcyd WJApLXXadEbtKQrgXQ6sowWqalg5thYnQCRarg/tXKK/po3KfgwkPjGDpOL+cIdr 12QVN2XJm9VmJ1wYJxzk+yXx4F43AdmMdr94qWAGufbTHezwb4UpzVR1NxtFrOE/ X5TNsC2+2mdZY/ZaNHS5vsTIFv3EhQfqgjZPlIAdLn6CGc8xWT514Q/uHA8+ytM/ HL7Hqs33DoPSvgTa5TT/2E0d0k5nO3P5KObzAjpYlireTPaBi51mpKGewcrtm0o2 v3cBlbfx3C7pe9ZhKBK9BH8cjynfiqsVZ9/lCw/7eBNdm9tHuzG0jeS7Db9tCZXS fM7G2R7SoIusPTqxlBmkU5DpYslwrHiVgCyy3ijxkoA/fakVwh/GgTcMsRt73IY6 n6DsUvWwuYHCIeIiHmHQJqCqCRtV+aMzU3AbbBHOjtdIanhlW16M686dEsgCirh7 akRVRD5VqKaqXs34PpkRL89Xv3wZRjl6XZ3hZFfCjSYXfpXDXhgSToIskpHYhKL8 gpY7WtG9YQP05Xz5HRCx6EluaZVeKe0lZi6fezX7Mi9AygJQO8FfXqP1mHBlEq40 ThWtvL9D89RV6lADqqFN20XepgvKNOyAXcE4szvsnIZYUSPmZQZSPxx+DHtROaLP jX3ifxtxJp92pQ== =5g7K -----END PGP SIGNATURE----- Merge tag 'powerpc-6.6-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux Pull powerpc updates from Michael Ellerman: - Add HOTPLUG_SMT support (/sys/devices/system/cpu/smt) and honour the configured SMT state when hotplugging CPUs into the system - Combine final TLB flush and lazy TLB mm shootdown IPIs when using the Radix MMU to avoid a broadcast TLBIE flush on exit - Drop the exclusion between ptrace/perf watchpoints, and drop the now unused associated arch hooks - Add support for the "nohlt" command line option to disable CPU idle - Add support for -fpatchable-function-entry for ftrace, with GCC >= 13.1 - Rework memory block size determination, and support 256MB size on systems with GPUs that have hotpluggable memory - Various other small features and fixes Thanks to Andrew Donnellan, Aneesh Kumar K.V, Arnd Bergmann, Athira Rajeev, Benjamin Gray, Christophe Leroy, Frederic Barrat, Gautam Menghani, Geoff Levand, Hari Bathini, Immad Mir, Jialin Zhang, Joel Stanley, Jordan Niethe, Justin Stitt, Kajol Jain, Kees Cook, Krzysztof Kozlowski, Laurent Dufour, Liang He, Linus Walleij, Mahesh Salgaonkar, Masahiro 
Yamada, Michal Suchanek, Nageswara R Sastry, Nathan Chancellor, Nathan Lynch, Naveen N Rao, Nicholas Piggin, Nick Desaulniers, Omar Sandoval, Randy Dunlap, Reza Arbab, Rob Herring, Russell Currey, Sourabh Jain, Thomas Gleixner, Trevor Woerner, Uwe Kleine-König, Vaibhav Jain, Xiongfeng Wang, Yuan Tan, Zhang Rui, and Zheng Zengkai. * tag 'powerpc-6.6-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (135 commits) macintosh/ams: linux/platform_device.h is needed powerpc/xmon: Reapply "Relax frame size for clang" powerpc/mm/book3s64: Use 256M as the upper limit with coherent device memory attached powerpc/mm/book3s64: Fix build error with SPARSEMEM disabled powerpc/iommu: Fix notifiers being shared by PCI and VIO buses powerpc/mpc5xxx: Add missing fwnode_handle_put() powerpc/config: Disable SLAB_DEBUG_ON in skiroot powerpc/pseries: Remove unused hcall tracing instruction powerpc/pseries: Fix hcall tracepoints with JUMP_LABEL=n powerpc: dts: add missing space before { powerpc/eeh: Use pci_dev_id() to simplify the code powerpc/64s: Move CPU -mtune options into Kconfig powerpc/powermac: Fix unused function warning powerpc/pseries: Rework lppaca_shared_proc() to avoid DEBUG_PREEMPT powerpc: Don't include lppaca.h in paca.h powerpc/pseries: Move hcall_vphn() prototype into vphn.h powerpc/pseries: Move VPHN constants into vphn.h cxl: Drop unused detach_spa() powerpc: Drop zalloc_maybe_bootmem() powerpc/powernv: Use struct opal_prd_msg in more places ...
677 lines
18 KiB
C
677 lines
18 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* PowerPC version
|
|
* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
|
|
*
|
|
* Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
|
|
* and Cort Dougan (PReP) (cort@cs.nmt.edu)
|
|
* Copyright (C) 1996 Paul Mackerras
|
|
*
|
|
* Derived from "arch/i386/mm/init.c"
|
|
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
|
|
*
|
|
* Dave Engebretsen <engebret@us.ibm.com>
|
|
* Rework for PPC64 port.
|
|
*/
|
|
|
|
#undef DEBUG
|
|
|
|
#include <linux/signal.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/stddef.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/init.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/idr.h>
|
|
#include <linux/nodemask.h>
|
|
#include <linux/module.h>
|
|
#include <linux/poison.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/of_fdt.h>
|
|
#include <linux/libfdt.h>
|
|
#include <linux/memremap.h>
|
|
#include <linux/memory.h>
|
|
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/page.h>
|
|
#include <asm/prom.h>
|
|
#include <asm/rtas.h>
|
|
#include <asm/io.h>
|
|
#include <asm/mmu_context.h>
|
|
#include <asm/mmu.h>
|
|
#include <linux/uaccess.h>
|
|
#include <asm/smp.h>
|
|
#include <asm/machdep.h>
|
|
#include <asm/tlb.h>
|
|
#include <asm/eeh.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/mmzone.h>
|
|
#include <asm/cputable.h>
|
|
#include <asm/sections.h>
|
|
#include <asm/iommu.h>
|
|
#include <asm/vdso.h>
|
|
#include <asm/hugetlb.h>
|
|
|
|
#include <mm/mmu_decl.h>
|
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
/*
|
|
* Given an address within the vmemmap, determine the page that
|
|
* represents the start of the subsection it is within. Note that we have to
|
|
* do this by hand as the proffered address may not be correctly aligned.
|
|
* Subtraction of non-aligned pointers produces undefined results.
|
|
*/
|
|
static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_addr)
{
	/* Index of the struct page slot that vmemmap_addr falls inside. */
	unsigned long pfn = (vmemmap_addr - (unsigned long)vmemmap) / sizeof(struct page);

	/* Mask down to the first pfn of the subsection containing it. */
	return pfn_to_page(pfn & PAGE_SUBSECTION_MASK);
}
|
|
|
|
/*
|
|
* Since memory is added in sub-section chunks, before creating a new vmemmap
|
|
* mapping, the kernel should check whether there is an existing memmap mapping
|
|
* covering the new subsection added. This is needed because kernel can map
|
|
* vmemmap area using 16MB pages which will cover a memory range of 16G. Such
|
|
* a range covers multiple subsections (2M)
|
|
*
|
|
* If any subsection in the 16G range mapped by vmemmap is valid we consider the
|
|
* vmemmap populated (There is a page table entry already present). We can't do
|
|
* a page table lookup here because with the hash translation we don't keep
|
|
* vmemmap details in linux page table.
|
|
*/
|
|
int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
{
	unsigned long limit = vmemmap_addr + vmemmap_map_size;
	struct page *pg = vmemmap_subsection_start(vmemmap_addr);

	/*
	 * Step through the range one subsection at a time. pfn_valid() is
	 * used here only to find out whether any subsection covered by this
	 * vmemmap range has already been initialized.
	 */
	while ((unsigned long)pg < limit) {
		if (pfn_valid(page_to_pfn(pg)))
			return 1;
		pg += PAGES_PER_SUBSECTION;
	}

	/* No subsection in the range is valid yet. */
	return 0;
}
|
|
|
|
/*
 * vmemmap virtual address space management does not have a traditional page
 * table to track which virtual struct pages are backed by a physical mapping.
 * The virtual to physical mappings are tracked in a simple linked list
 * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
 * all times, whereas the 'next' list maintains the available
 * vmemmap_backing structures which have been deleted from the
 * 'vmemmap_list' list during system runtime (memory hotplug remove
 * operation). The freed 'vmemmap_backing' structures are reused later when
 * new requests come in without allocating fresh memory. This pointer also
 * tracks the allocated 'vmemmap_backing' structures as we allocate one
 * full page of memory at a time when we don't have any.
 */
|
|
/* Head of the list of vmemmap_backing entries currently in use. */
struct vmemmap_backing *vmemmap_list;

/*
 * 'next' does double duty: during boot it walks the individual chunks of
 * the page most recently allocated for vmemmap_backing entries, and at
 * runtime it heads the chain of entries that have been freed. Sharing one
 * pointer for both is racy in principle, but the two uses are separated by
 * the boot process so they do not collide. It would become a problem if a
 * memory hotplug operation ever happened during boot.
 */
static struct vmemmap_backing *next;
static int num_left;	/* entries not yet handed out from the current page */
static int num_freed;	/* freed entries available for reuse via 'next' */
|
|
|
|
static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
|
|
{
|
|
struct vmemmap_backing *vmem_back;
|
|
/* get from freed entries first */
|
|
if (num_freed) {
|
|
num_freed--;
|
|
vmem_back = next;
|
|
next = next->list;
|
|
|
|
return vmem_back;
|
|
}
|
|
|
|
/* allocate a page when required and hand out chunks */
|
|
if (!num_left) {
|
|
next = vmemmap_alloc_block(PAGE_SIZE, node);
|
|
if (unlikely(!next)) {
|
|
WARN_ON(1);
|
|
return NULL;
|
|
}
|
|
num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
|
|
}
|
|
|
|
num_left--;
|
|
|
|
return next++;
|
|
}
|
|
|
|
static __meminit int vmemmap_list_populate(unsigned long phys,
|
|
unsigned long start,
|
|
int node)
|
|
{
|
|
struct vmemmap_backing *vmem_back;
|
|
|
|
vmem_back = vmemmap_list_alloc(node);
|
|
if (unlikely(!vmem_back)) {
|
|
pr_debug("vmemap list allocation failed\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
vmem_back->phys = phys;
|
|
vmem_back->virt_addr = start;
|
|
vmem_back->list = vmemmap_list;
|
|
|
|
vmemmap_list = vmem_back;
|
|
return 0;
|
|
}
|
|
|
|
bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
|
|
unsigned long page_size)
|
|
{
|
|
unsigned long nr_pfn = page_size / sizeof(struct page);
|
|
unsigned long start_pfn = page_to_pfn((struct page *)start);
|
|
|
|
if ((start_pfn + nr_pfn - 1) > altmap->end_pfn)
|
|
return true;
|
|
|
|
if (start_pfn < altmap->base_pfn)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
 * Populate the vmemmap for [start, end) in units of the vmemmap MMU page
 * size, recording every chunk in vmemmap_list.
 *
 * Returns 0 on success, -ENOMEM when backing memory or a list entry cannot
 * be allocated, and -EFAULT when vmemmap_create_mapping() fails.
 */
static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
					struct vmem_altmap *altmap)
{
	/* 1UL so the shift is done in unsigned long, not int, arithmetic. */
	unsigned long page_size = 1UL << mmu_psize_defs[mmu_vmemmap_psize].shift;

	/* Align to the page size of the linear mapping. */
	start = ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);

	for (; start < end; start += page_size) {
		/* Set only when this iteration's block came from the altmap. */
		bool altmap_alloc = false;
		void *p = NULL;
		int rc;

		/*
		 * This vmemmap range is backing different subsections. If any
		 * of that subsection is marked valid, that means we already
		 * have initialized a page table covering this range and hence
		 * the vmemmap range is populated.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		/*
		 * Allocate from the altmap first if we have one. This may
		 * fail due to alignment issues when using 16MB hugepages, so
		 * fall back to system memory if the altmap allocation fail.
		 */
		if (altmap && !altmap_cross_boundary(altmap, start, page_size)) {
			p = vmemmap_alloc_block_buf(page_size, node, altmap);
			if (p)
				altmap_alloc = true;
			else
				pr_debug("altmap block allocation failed, falling back to system memory\n");
		}
		if (!p)
			p = vmemmap_alloc_block_buf(page_size, node, NULL);
		if (!p)
			return -ENOMEM;

		if (vmemmap_list_populate(__pa(p), start, node)) {
			/*
			 * If we don't populate vmemap list, we don't have
			 * the ability to free the allocated vmemmap
			 * pages in section_deactivate. Hence free them
			 * here.
			 */
			int nr_pfns = page_size >> PAGE_SHIFT;
			unsigned long page_order = get_order(page_size);

			if (altmap_alloc)
				vmem_altmap_free(altmap, nr_pfns);
			else
				free_pages((unsigned long)p, page_order);
			return -ENOMEM;
		}

		pr_debug(" * %016lx..%016lx allocated at %p\n",
			 start, start + page_size, p);

		rc = vmemmap_create_mapping(start, page_size, __pa(p));
		if (rc < 0) {
			pr_warn("%s: Unable to create vmemmap mapping: %d\n",
				__func__, rc);
			return -EFAULT;
		}
	}

	return 0;
}
|
|
|
|
/*
 * Entry point for populating the vmemmap: hands off to the radix
 * implementation when radix is enabled, otherwise uses the list-tracked
 * __vmemmap_populate().
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (radix_enabled())
		return radix__vmemmap_populate(start, end, node, altmap);
#endif
	return __vmemmap_populate(start, end, node, altmap);
}
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
static unsigned long vmemmap_list_free(unsigned long start)
|
|
{
|
|
struct vmemmap_backing *vmem_back, *vmem_back_prev;
|
|
|
|
vmem_back_prev = vmem_back = vmemmap_list;
|
|
|
|
/* look for it with prev pointer recorded */
|
|
for (; vmem_back; vmem_back = vmem_back->list) {
|
|
if (vmem_back->virt_addr == start)
|
|
break;
|
|
vmem_back_prev = vmem_back;
|
|
}
|
|
|
|
if (unlikely(!vmem_back))
|
|
return 0;
|
|
|
|
/* remove it from vmemmap_list */
|
|
if (vmem_back == vmemmap_list) /* remove head */
|
|
vmemmap_list = vmem_back->list;
|
|
else
|
|
vmem_back_prev->list = vmem_back->list;
|
|
|
|
/* next point to this freed entry */
|
|
vmem_back->list = next;
|
|
next = vmem_back;
|
|
num_freed++;
|
|
|
|
return vmem_back->phys;
|
|
}
|
|
|
|
/*
 * Tear down the list-tracked vmemmap for [start, end) in units of the
 * vmemmap MMU page size: each chunk whose subsections are all invalid is
 * dropped from vmemmap_list, its backing memory is released to wherever it
 * came from (altmap, reserved/bootmem pages or the page allocator) and the
 * mapping is removed.
 */
static void __ref __vmemmap_free(unsigned long start, unsigned long end,
				 struct vmem_altmap *altmap)
{
	/* 1UL so the shift is done in unsigned long, not int, arithmetic. */
	unsigned long page_size = 1UL << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long page_order = get_order(page_size);
	/* With no altmap the range stays empty, so nothing matches it below. */
	unsigned long alt_start = ~0UL, alt_end = ~0UL;
	unsigned long base_pfn;

	start = ALIGN_DOWN(start, page_size);
	if (altmap) {
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
	}

	pr_debug("vmemmap_free %lx...%lx\n", start, end);

	for (; start < end; start += page_size) {
		unsigned long nr_pages, addr;
		struct page *page;

		/*
		 * We have already marked the subsection we are trying to remove
		 * invalid. So if we want to remove the vmemmap range, we
		 * need to make sure there is no subsection marked valid
		 * in this range.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		/* Skip chunks that have no entry in vmemmap_list. */
		addr = vmemmap_list_free(start);
		if (!addr)
			continue;

		page = pfn_to_page(addr >> PAGE_SHIFT);
		nr_pages = 1UL << page_order;
		base_pfn = PHYS_PFN(addr);

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
		} else if (PageReserved(page)) {
			/* allocated from bootmem */
			if (page_size < PAGE_SIZE) {
				/*
				 * this shouldn't happen, but if it is
				 * the case, leave the memory there
				 */
				WARN_ON_ONCE(1);
			} else {
				while (nr_pages--)
					free_reserved_page(page++);
			}
		} else {
			free_pages((unsigned long)(__va(addr)), page_order);
		}

		vmemmap_remove_mapping(start, page_size);
	}
}
|
|
|
|
/*
 * Entry point for freeing the vmemmap: hands off to the radix
 * implementation when radix is enabled, otherwise uses the list-tracked
 * __vmemmap_free().
 */
void __ref vmemmap_free(unsigned long start, unsigned long end,
			struct vmem_altmap *altmap)
{
#ifdef CONFIG_PPC_BOOK3S_64
	if (radix_enabled()) {
		radix__vmemmap_free(start, end, altmap);
		return;
	}
#endif
	__vmemmap_free(start, end, altmap);
}
|
|
|
|
#endif
|
|
/* Intentionally empty: this file does nothing for this hook. */
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long size)
{
}
|
|
|
|
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
|
|
|
#ifdef CONFIG_PPC_BOOK3S_64
|
|
unsigned int mmu_lpid_bits;
|
|
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
|
|
EXPORT_SYMBOL_GPL(mmu_lpid_bits);
|
|
#endif
|
|
unsigned int mmu_pid_bits;
|
|
|
|
static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
|
|
|
|
static int __init parse_disable_radix(char *p)
|
|
{
|
|
bool val;
|
|
|
|
if (!p)
|
|
val = true;
|
|
else if (kstrtobool(p, &val))
|
|
return -EINVAL;
|
|
|
|
disable_radix = val;
|
|
|
|
return 0;
|
|
}
|
|
early_param("disable_radix", parse_disable_radix);
|
|
|
|
/*
|
|
* If we're running under a hypervisor, we need to check the contents of
|
|
* /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
|
|
* radix. If not, we clear the radix feature bit so we fall back to hash.
|
|
*/
|
|
static void __init early_check_vec5(void)
{
	unsigned long root, chosen;
	const u8 *vec5;
	u8 mmu_supported;
	int size;

	root = of_get_flat_dt_root();
	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
	if (chosen == -FDT_ERR_NOTFOUND)
		goto no_radix;

	/* The property must exist and be long enough to hold the MMU byte. */
	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
	if (!vec5 || size <= OV5_INDX(OV5_MMU_SUPPORT))
		goto no_radix;

	/* Check for supported configuration */
	mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
			OV5_FEAT(OV5_MMU_SUPPORT);

	if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
		/* Hypervisor only supports radix - check enabled && GTSE */
		if (!early_radix_enabled())
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");

		if (vec5[OV5_INDX(OV5_RADIX_GTSE)] & OV5_FEAT(OV5_RADIX_GTSE))
			cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
		else
			cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;

		/* Do radix anyway - the hypervisor said we had to */
		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
		/* Hypervisor only supports hash - disable radix */
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
	}

	return;

no_radix:
	/* Node or property missing or too short: fall back to hash. */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
}
|
|
|
|
/*
 * Flat device tree scan callback: read "ibm,mmu-lpid-bits" and
 * "ibm,mmu-pid-bits" from a "cpu" node into mmu_lpid_bits / mmu_pid_bits.
 * Returns 1 once at least one of the two is non-zero, 0 otherwise.
 */
static int __init dt_scan_mmu_pid_width(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
	const __be32 *prop;
	int size = 0;

	/* We are scanning "cpu" nodes only */
	if (!type || strcmp(type, "cpu"))
		return 0;

	/* Find MMU LPID, PID register size; each must be a single cell. */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
	if (prop && size == 4)
		mmu_lpid_bits = be32_to_cpup(prop);

	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	return (mmu_pid_bits || mmu_lpid_bits) ? 1 : 0;
}
|
|
|
|
/*
|
|
* Outside hotplug the kernel uses this value to map the kernel direct map
|
|
* with radix. To be compatible with older kernels, let's keep this value
|
|
* as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
|
|
* things with 1GB size in the case where we don't support hotplug.
|
|
*/
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
#define DEFAULT_MEMORY_BLOCK_SIZE	MIN_MEMORY_BLOCK_SIZE
#else
#define DEFAULT_MEMORY_BLOCK_SIZE	SZ_16M
#endif
|
|
|
|
/*
 * Shrink *block_size until a memory region of @mem_size bytes is a whole
 * multiple of it, never going below DEFAULT_MEMORY_BLOCK_SIZE.
 *
 * @block_size: in/out candidate block size; expected to be a power of two.
 * @mem_size:   size in bytes of the memory region being considered.
 *
 * The candidate is divided by four on each step. For example, starting
 * from 1G a 128MB region settles on 64MB.
 */
static void update_memory_block_size(unsigned long *block_size, unsigned long mem_size)
{
	unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;

	for (; *block_size > min_memory_block_size; *block_size >>= 2) {
		/*
		 * Test all bits below the block size, not just the block-size
		 * bit itself; otherwise a region smaller than the block size
		 * would wrongly be treated as mappable with it.
		 */
		if ((mem_size & (*block_size - 1)) == 0)
			break;
	}
}
|
|
|
|
/*
 * Flat device tree scan callback used to work out the memory block size.
 *
 * @data points at the unsigned long block size being refined. Only nodes at
 * depth 1 are examined. Returns 1 when the final value has been found (the
 * "ibm,dynamic-reconfiguration-memory" node), 0 otherwise.
 */
static int __init probe_memory_block_size(unsigned long node, const char *uname, int
					  depth, void *data)
{
	const char *type;
	unsigned long *block_size = (unsigned long *)data;
	const __be32 *reg, *endp;
	int l;

	if (depth != 1)
		return 0;
	/*
	 * If we have dynamic-reconfiguration-memory node, use the
	 * lmb value.
	 */
	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {

		const __be32 *prop;

		prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &l);

		if (!prop || l < dt_root_size_cells * sizeof(__be32))
			/*
			 * Nothing in the device tree
			 */
			*block_size = DEFAULT_MEMORY_BLOCK_SIZE;
		else
			*block_size = of_read_number(prop, dt_root_size_cells);
		/*
		 * We have found the final value. Don't probe further.
		 */
		return 1;
	}
	/*
	 * Find all the device tree nodes of memory type and make sure
	 * the area can be mapped using the memory block size value
	 * we end up using. We start with 1G value and keep reducing
	 * it such that we can map the entire area using memory_block_size.
	 * This will be used on powernv and older pseries that don't
	 * have ibm,lmb-size node.
	 * For ex: with P5 we can end up with
	 * memory@0 -> 128MB
	 * memory@128M -> 64M
	 * This will end up using 64MB memory block size value.
	 */
	type = of_get_flat_dt_prop(node, "device_type", NULL);
	if (type == NULL || strcmp(type, "memory") != 0)
		return 0;

	reg = of_get_flat_dt_prop(node, "linux,usable-memory", &l);
	if (!reg)
		reg = of_get_flat_dt_prop(node, "reg", &l);
	if (!reg)
		return 0;

	/* Walk every (address, size) pair in the property. */
	endp = reg + (l / sizeof(__be32));
	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
		const char *compatible;
		u64 size;

		/* Skip the address cells; only the size is of interest. */
		dt_mem_next_cell(dt_root_addr_cells, &reg);
		size = dt_mem_next_cell(dt_root_size_cells, &reg);

		if (size) {
			update_memory_block_size(block_size, size);
			continue;
		}
		/*
		 * ibm,coherent-device-memory with linux,usable-memory = 0
		 * Force 256MiB block size. Work around for GPUs on P9 PowerNV
		 * linux,usable-memory == 0 implies driver managed memory and
		 * we can't use large memory block size due to hotplug/unplug
		 * limitations.
		 */
		compatible = of_get_flat_dt_prop(node, "compatible", NULL);
		if (compatible && !strcmp(compatible, "ibm,coherent-device-memory")) {
			if (*block_size > SZ_256M)
				*block_size = SZ_256M;
			/*
			 * We keep 256M as the upper limit with GPU present.
			 */
			return 0;
		}
	}
	/* continue looking for other memory device types */
	return 0;
}
|
|
|
|
/*
|
|
* start with 1G memory block size. Early init will
|
|
* fix this with correct value.
|
|
*/
|
|
unsigned long memory_block_size __ro_after_init = 1UL << 30;
|
|
static void __init early_init_memory_block_size(void)
|
|
{
|
|
/*
|
|
* We need to do memory_block_size probe early so that
|
|
* radix__early_init_mmu() can use this as limit for
|
|
* mapping page size.
|
|
*/
|
|
of_scan_flat_dt(probe_memory_block_size, &memory_block_size);
|
|
}
|
|
|
|
/*
 * Early, device-tree driven MMU setup: settle on radix vs hash, establish
 * the LPID/PID widths and the memory block size, then run the early init
 * for the chosen translation mode. Panics if neither MMU type is usable.
 */
void __init mmu_early_init_devtree(void)
{
	bool hvmode = (mfmsr() & MSR_HV) != 0;

	/* Disable radix mode based on kernel command line. */
	if (disable_radix) {
		if (!IS_ENABLED(CONFIG_PPC_64S_HASH_MMU))
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
		else
			cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
	}

	/* Take the widths from the device tree, else apply the defaults. */
	of_scan_flat_dt(dt_scan_mmu_pid_width, NULL);
	if (hvmode && !mmu_lpid_bits) {
		/* 12: POWER8-10, 10: POWER7 */
		mmu_lpid_bits = early_cpu_has_feature(CPU_FTR_ARCH_207S) ? 12 : 10;
	}
	if (!mmu_pid_bits && early_cpu_has_feature(CPU_FTR_ARCH_300))
		mmu_pid_bits = 20; /* POWER9-10 */

	/*
	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
	 * When running bare-metal, we can use radix if we like
	 * even though the ibm,architecture-vec-5 property created by
	 * skiboot doesn't have the necessary bits set.
	 */
	if (!hvmode)
		early_check_vec5();

	early_init_memory_block_size();

	if (!early_radix_enabled()) {
		hash__early_init_devtree();
	} else {
		radix__early_init_devtree();

		/*
		 * We have finalized the translation we are going to use by now.
		 * Radix mode is not limited by RMA / VRMA addressing.
		 * Hence don't limit memblock allocations.
		 */
		ppc64_rma_size = ULONG_MAX;
		memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
	}

	if (IS_ENABLED(CONFIG_HUGETLB_PAGE_SIZE_VARIABLE))
		hugetlbpage_init_defaultsize();

	/* Neither hash nor radix is left enabled: nothing we can run with. */
	if (!(cur_cpu_spec->mmu_features & (MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_RADIX)))
		panic("kernel does not support any MMU type offered by platform");
}
|
|
#endif /* CONFIG_PPC_BOOK3S_64 */
|