powerpc/fadump: Reservationless firmware assisted dump
One of the primary issues with Firmware Assisted Dump (fadump) on Power is that it needs a large amount of memory to be reserved. On large systems with TeraBytes of memory, this reservation can be quite significant. In some cases, fadump fails if the memory reserved is insufficient, or if the reserved memory was DLPAR hot-removed. In the normal case, post reboot, the preserved memory is filtered to extract only relevant areas of interest using the makedumpfile tool. While the tool provides flexibility to determine what needs to be part of the dump and what memory to filter out, all supported distributions default this to "Capture only kernel data and nothing else". We take advantage of this default and the Linux kernel's Contiguous Memory Allocator (CMA) to fundamentally change the memory reservation model for fadump. Instead of setting aside a significant chunk of memory nobody can use, this patch uses CMA instead, to reserve a significant chunk of memory that the kernel is prevented from using (due to MIGRATE_CMA), but applications are free to use it. With this fadump will still be able to capture all of the kernel memory and most of the user space memory except the user pages that were present in CMA region. Essentially, on a P9 LPAR with 2 cores, 8GB RAM and current upstream: [root@zzxx-yy10 ~]# free -m total used free shared buff/cache available Mem: 7557 193 6822 12 541 6725 Swap: 4095 0 4095 With this patch: [root@zzxx-yy10 ~]# free -m total used free shared buff/cache available Mem: 8133 194 7464 12 475 7338 Swap: 4095 0 4095 Changes made here are completely transparent to how fadump has traditionally worked. Thanks to Aneesh Kumar and Anshuman Khandual for helping us understand CMA and its usage. TODO: - Handle case where CMA reservation spans nodes. 
Signed-off-by: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
parent
08fb726df1
commit
a4e92ce8e4
@ -113,7 +113,15 @@ header, is usually reserved at an offset greater than boot memory
|
|||||||
size (see Fig. 1). This area is *not* released: this region will
|
size (see Fig. 1). This area is *not* released: this region will
|
||||||
be kept permanently reserved, so that it can act as a receptacle
|
be kept permanently reserved, so that it can act as a receptacle
|
||||||
for a copy of the boot memory content in addition to CPU state
|
for a copy of the boot memory content in addition to CPU state
|
||||||
and HPTE region, in the case a crash does occur.
|
and HPTE region, in the case a crash does occur. Since this reserved
|
||||||
|
memory area is used only after the system crash, there is no point in
|
||||||
|
blocking this significant chunk of memory from the production kernel.
|
||||||
|
Hence, the implementation uses the Linux kernel's Contiguous Memory
|
||||||
|
Allocator (CMA) for memory reservation if CMA is configured for the kernel.
|
||||||
|
With CMA reservation this memory will be available for applications to
|
||||||
|
use, while the kernel is prevented from using it. With this, fadump will
|
||||||
|
still be able to capture all of the kernel memory and most of the user
|
||||||
|
space memory except the user pages that were present in CMA region.
|
||||||
|
|
||||||
o Memory Reservation during first kernel
|
o Memory Reservation during first kernel
|
||||||
|
|
||||||
@ -162,6 +170,9 @@ How to enable firmware-assisted dump (fadump):
|
|||||||
|
|
||||||
1. Set config option CONFIG_FA_DUMP=y and build kernel.
|
1. Set config option CONFIG_FA_DUMP=y and build kernel.
|
||||||
2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
|
2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
|
||||||
|
By default, fadump reserved memory will be initialized as CMA area.
|
||||||
|
Alternatively, user can boot linux kernel with 'fadump=nocma' to
|
||||||
|
prevent fadump from using CMA.
|
||||||
3. Optionally, user can also set 'crashkernel=' kernel cmdline
|
3. Optionally, user can also set 'crashkernel=' kernel cmdline
|
||||||
to specify size of the memory to reserve for boot memory dump
|
to specify size of the memory to reserve for boot memory dump
|
||||||
preservation.
|
preservation.
|
||||||
@ -172,6 +183,10 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been deprecated. Instead
|
|||||||
2. If firmware-assisted dump fails to reserve memory then it
|
2. If firmware-assisted dump fails to reserve memory then it
|
||||||
will fallback to existing kdump mechanism if 'crashkernel='
|
will fallback to existing kdump mechanism if 'crashkernel='
|
||||||
option is set at kernel cmdline.
|
option is set at kernel cmdline.
|
||||||
|
3. If the user wants to capture all of user space memory and is ok with
|
||||||
|
reserved memory not available to production system, then
|
||||||
|
'fadump=nocma' kernel parameter can be used to fall back to
|
||||||
|
old behaviour.
|
||||||
|
|
||||||
Sysfs/debugfs files:
|
Sysfs/debugfs files:
|
||||||
------------
|
------------
|
||||||
|
@ -48,6 +48,10 @@
|
|||||||
|
|
||||||
#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)
|
#define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt)
|
||||||
|
|
||||||
|
/* Alignment per CMA requirement. */
|
||||||
|
#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \
|
||||||
|
max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
|
||||||
|
|
||||||
/* Firmware provided dump sections */
|
/* Firmware provided dump sections */
|
||||||
#define FADUMP_CPU_STATE_DATA 0x0001
|
#define FADUMP_CPU_STATE_DATA 0x0001
|
||||||
#define FADUMP_HPTE_REGION 0x0002
|
#define FADUMP_HPTE_REGION 0x0002
|
||||||
@ -141,6 +145,7 @@ struct fw_dump {
|
|||||||
unsigned long fadump_supported:1;
|
unsigned long fadump_supported:1;
|
||||||
unsigned long dump_active:1;
|
unsigned long dump_active:1;
|
||||||
unsigned long dump_registered:1;
|
unsigned long dump_registered:1;
|
||||||
|
unsigned long nocma:1;
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
#include <linux/kobject.h>
|
#include <linux/kobject.h>
|
||||||
#include <linux/sysfs.h>
|
#include <linux/sysfs.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
|
#include <linux/cma.h>
|
||||||
|
|
||||||
#include <asm/debugfs.h>
|
#include <asm/debugfs.h>
|
||||||
#include <asm/page.h>
|
#include <asm/page.h>
|
||||||
@ -46,6 +47,9 @@
|
|||||||
static struct fw_dump fw_dump;
|
static struct fw_dump fw_dump;
|
||||||
static struct fadump_mem_struct fdm;
|
static struct fadump_mem_struct fdm;
|
||||||
static const struct fadump_mem_struct *fdm_active;
|
static const struct fadump_mem_struct *fdm_active;
|
||||||
|
#ifdef CONFIG_CMA
|
||||||
|
static struct cma *fadump_cma;
|
||||||
|
#endif
|
||||||
|
|
||||||
static DEFINE_MUTEX(fadump_mutex);
|
static DEFINE_MUTEX(fadump_mutex);
|
||||||
struct fad_crash_memory_ranges *crash_memory_ranges;
|
struct fad_crash_memory_ranges *crash_memory_ranges;
|
||||||
@ -53,6 +57,67 @@ int crash_memory_ranges_size;
|
|||||||
int crash_mem_ranges;
|
int crash_mem_ranges;
|
||||||
int max_crash_mem_ranges;
|
int max_crash_mem_ranges;
|
||||||
|
|
||||||
|
#ifdef CONFIG_CMA
|
||||||
|
/*
|
||||||
|
* fadump_cma_init() - Initialize CMA area from a fadump reserved memory
|
||||||
|
*
|
||||||
|
* This function initializes CMA area from fadump reserved memory.
|
||||||
|
* The total size of fadump reserved memory covers for boot memory size
|
||||||
|
* + cpu data size + hpte size and metadata.
|
||||||
|
* Initialize only the area equivalent to boot memory size for CMA use.
|
||||||
|
* The reamining portion of fadump reserved memory will be not given
|
||||||
|
* to CMA and pages for thoes will stay reserved. boot memory size is
|
||||||
|
* aligned per CMA requirement to satisy cma_init_reserved_mem() call.
|
||||||
|
* But for some reason even if it fails we still have the memory reservation
|
||||||
|
* with us and we can still continue doing fadump.
|
||||||
|
*/
|
||||||
|
int __init fadump_cma_init(void)
|
||||||
|
{
|
||||||
|
unsigned long long base, size;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
if (!fw_dump.fadump_enabled)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Do not use CMA if user has provided fadump=nocma kernel parameter.
|
||||||
|
* Return 1 to continue with fadump old behaviour.
|
||||||
|
*/
|
||||||
|
if (fw_dump.nocma)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
base = fw_dump.reserve_dump_area_start;
|
||||||
|
size = fw_dump.boot_memory_size;
|
||||||
|
|
||||||
|
if (!size)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
rc = cma_init_reserved_mem(base, size, 0, "fadump_cma", &fadump_cma);
|
||||||
|
if (rc) {
|
||||||
|
pr_err("Failed to init cma area for firmware-assisted dump,%d\n", rc);
|
||||||
|
/*
|
||||||
|
* Though the CMA init has failed we still have memory
|
||||||
|
* reservation with us. The reserved memory will be
|
||||||
|
* blocked from production system usage. Hence return 1,
|
||||||
|
* so that we can continue with fadump.
|
||||||
|
*/
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* So we now have successfully initialized cma area for fadump.
|
||||||
|
*/
|
||||||
|
pr_info("Initialized 0x%lx bytes cma area at %ldMB from 0x%lx "
|
||||||
|
"bytes of memory reserved for firmware-assisted dump\n",
|
||||||
|
cma_get_size(fadump_cma),
|
||||||
|
(unsigned long)cma_get_base(fadump_cma) >> 20,
|
||||||
|
fw_dump.reserve_dump_area_size);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static int __init fadump_cma_init(void) { return 1; }
|
||||||
|
#endif /* CONFIG_CMA */
|
||||||
|
|
||||||
/* Scan the Firmware Assisted dump configuration details. */
|
/* Scan the Firmware Assisted dump configuration details. */
|
||||||
int __init early_init_dt_scan_fw_dump(unsigned long node,
|
int __init early_init_dt_scan_fw_dump(unsigned long node,
|
||||||
const char *uname, int depth, void *data)
|
const char *uname, int depth, void *data)
|
||||||
@ -378,8 +443,15 @@ int __init fadump_reserve_mem(void)
|
|||||||
*/
|
*/
|
||||||
if (fdm_active)
|
if (fdm_active)
|
||||||
fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
|
fw_dump.boot_memory_size = be64_to_cpu(fdm_active->rmr_region.source_len);
|
||||||
else
|
else {
|
||||||
fw_dump.boot_memory_size = fadump_calculate_reserve_size();
|
fw_dump.boot_memory_size = fadump_calculate_reserve_size();
|
||||||
|
#ifdef CONFIG_CMA
|
||||||
|
if (!fw_dump.nocma)
|
||||||
|
fw_dump.boot_memory_size =
|
||||||
|
ALIGN(fw_dump.boot_memory_size,
|
||||||
|
FADUMP_CMA_ALIGNMENT);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Calculate the memory boundary.
|
* Calculate the memory boundary.
|
||||||
@ -426,8 +498,9 @@ int __init fadump_reserve_mem(void)
|
|||||||
fw_dump.fadumphdr_addr =
|
fw_dump.fadumphdr_addr =
|
||||||
be64_to_cpu(fdm_active->rmr_region.destination_address) +
|
be64_to_cpu(fdm_active->rmr_region.destination_address) +
|
||||||
be64_to_cpu(fdm_active->rmr_region.source_len);
|
be64_to_cpu(fdm_active->rmr_region.source_len);
|
||||||
pr_debug("fadumphdr_addr = %p\n",
|
pr_debug("fadumphdr_addr = %pa\n", &fw_dump.fadumphdr_addr);
|
||||||
(void *) fw_dump.fadumphdr_addr);
|
fw_dump.reserve_dump_area_start = base;
|
||||||
|
fw_dump.reserve_dump_area_size = size;
|
||||||
} else {
|
} else {
|
||||||
size = get_fadump_area_size();
|
size = get_fadump_area_size();
|
||||||
|
|
||||||
@ -455,10 +528,11 @@ int __init fadump_reserve_mem(void)
|
|||||||
(unsigned long)(size >> 20),
|
(unsigned long)(size >> 20),
|
||||||
(unsigned long)(base >> 20),
|
(unsigned long)(base >> 20),
|
||||||
(unsigned long)(memblock_phys_mem_size() >> 20));
|
(unsigned long)(memblock_phys_mem_size() >> 20));
|
||||||
}
|
|
||||||
|
|
||||||
fw_dump.reserve_dump_area_start = base;
|
fw_dump.reserve_dump_area_start = base;
|
||||||
fw_dump.reserve_dump_area_size = size;
|
fw_dump.reserve_dump_area_size = size;
|
||||||
|
return fadump_cma_init();
|
||||||
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -477,6 +551,10 @@ static int __init early_fadump_param(char *p)
|
|||||||
fw_dump.fadump_enabled = 1;
|
fw_dump.fadump_enabled = 1;
|
||||||
else if (strncmp(p, "off", 3) == 0)
|
else if (strncmp(p, "off", 3) == 0)
|
||||||
fw_dump.fadump_enabled = 0;
|
fw_dump.fadump_enabled = 0;
|
||||||
|
else if (strncmp(p, "nocma", 5) == 0) {
|
||||||
|
fw_dump.fadump_enabled = 1;
|
||||||
|
fw_dump.nocma = 1;
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -1229,7 +1307,7 @@ static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
|
static int fadump_invalidate_dump(const struct fadump_mem_struct *fdm)
|
||||||
{
|
{
|
||||||
int rc = 0;
|
int rc = 0;
|
||||||
unsigned int wait_time;
|
unsigned int wait_time;
|
||||||
@ -1260,9 +1338,8 @@ void fadump_cleanup(void)
|
|||||||
{
|
{
|
||||||
/* Invalidate the registration only if dump is active. */
|
/* Invalidate the registration only if dump is active. */
|
||||||
if (fw_dump.dump_active) {
|
if (fw_dump.dump_active) {
|
||||||
init_fadump_mem_struct(&fdm,
|
/* pass the same memory dump structure provided by platform */
|
||||||
be64_to_cpu(fdm_active->cpu_state_data.destination_address));
|
fadump_invalidate_dump(fdm_active);
|
||||||
fadump_invalidate_dump(&fdm);
|
|
||||||
} else if (fw_dump.dump_registered) {
|
} else if (fw_dump.dump_registered) {
|
||||||
/* Un-register Firmware-assisted dump if it was registered. */
|
/* Un-register Firmware-assisted dump if it was registered. */
|
||||||
fadump_unregister_dump(&fdm);
|
fadump_unregister_dump(&fdm);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user