// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) IBM Corporation, 2014, 2017
 * Anton Blanchard, Rashmica Gupta.
*/

#define pr_fmt(fmt) "memtrace: " fmt

#include <linux/bitops.h>
#include <linux/string.h>
#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/numa.h>
#include <asm/machdep.h>
#include <asm/debugfs.h>

/* This enables us to keep track of the memory removed from each node. */
struct memtrace_entry {
	void *mem;
	u64 start;
	u64 size;
	u32 nid;
	struct dentry *dir;
	char name[16];
};

static u64 memtrace_size;
static struct memtrace_entry *memtrace_array;
static unsigned int memtrace_array_nr;

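/* debugfs read handler: copy out the contents of one removed chunk. */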
static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
			     size_t count, loff_t *ppos)
{
	struct memtrace_entry *ent = filp->private_data;

	return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
}

static const struct file_operations memtrace_fops = {
	.llseek = default_llseek,
	.read	= memtrace_read,
	.open	= simple_open,
};

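/* Callbacks for walk_memory_blocks(), used to check and flip block states. */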
static int check_memblock_online(struct memory_block *mem, void *arg)
{
	if (mem->state != MEM_ONLINE)
		return -1;

	return 0;
}

static int change_memblock_state(struct memory_block *mem, void *arg)
{
	unsigned long state = (unsigned long)arg;

	mem->state = state;

	return 0;
}

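/*
 * Zero the removed range while it is still mapped. This keeps stale kernel
 * data from becoming readable later through the per-node debugfs "trace"
 * file.
 */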
static void memtrace_clear_range(unsigned long start_pfn,
				 unsigned long nr_pages)
{
	unsigned long pfn;

	/*
	 * As pages are offline, we cannot trust the memmap anymore. As HIGHMEM
	 * does not apply, avoid passing around "struct page" and use
	 * clear_page() instead directly.
	 */
	for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
		if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
			cond_resched();
		clear_page(__va(PFN_PHYS(pfn)));
	}
}

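/*
 * Try to offline the given PFN range and mark its memory blocks MEM_OFFLINE.
 * Bails out if any block in the range is not currently online; if
 * offline_pages() fails, the blocks are flipped back to MEM_ONLINE.
 */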
/* called with device_hotplug_lock held */
static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
{
	const unsigned long start = PFN_PHYS(start_pfn);
	const unsigned long size = PFN_PHYS(nr_pages);

	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
		return false;

	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
			   change_memblock_state);

	if (offline_pages(start_pfn, nr_pages)) {
		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
				   change_memblock_state);
		return false;
	}

	walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
			   change_memblock_state);

	return true;
}

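/*
 * Search the node from the top of its memory downwards for a size-aligned
 * chunk that can be offlined and removed from the kernel. Returns the
 * physical base address of the chunk, or 0 if no suitable chunk was found.
 */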
static u64 memtrace_alloc_node(u32 nid, u64 size)
{
	u64 start_pfn, end_pfn, nr_pages, pfn;
	u64 base_pfn;
	u64 bytes = memory_block_size_bytes();

	if (!node_spanned_pages(nid))
		return 0;

	start_pfn = node_start_pfn(nid);
	end_pfn = node_end_pfn(nid);
	nr_pages = size >> PAGE_SHIFT;

	/* Trace memory needs to be aligned to the size */
	end_pfn = round_down(end_pfn - nr_pages, nr_pages);

	lock_device_hotplug();
	for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
		if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
			/*
			 * Clear the range while we still have a linear
			 * mapping.
			 */
			memtrace_clear_range(base_pfn, nr_pages);
			/*
			 * Remove memory in memory block size chunks so that
			 * iomem resources are always split to the same size and
			 * we never try to remove memory that spans two iomem
			 * resources.
			 */
			end_pfn = base_pfn + nr_pages;
			for (pfn = base_pfn; pfn < end_pfn; pfn += bytes >> PAGE_SHIFT) {
				__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
			}
			unlock_device_hotplug();
			return base_pfn << PAGE_SHIFT;
		}
	}
	unlock_device_hotplug();

	return 0;
}

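/* Carve one chunk of the requested size out of each online node. */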
static int memtrace_init_regions_runtime(u64 size)
{
	u32 nid;
	u64 m;

	memtrace_array = kcalloc(num_online_nodes(),
				 sizeof(struct memtrace_entry), GFP_KERNEL);
	if (!memtrace_array) {
		pr_err("Failed to allocate memtrace_array\n");
		return -EINVAL;
	}

	for_each_online_node(nid) {
		m = memtrace_alloc_node(nid, size);

		/*
		 * A node might not have any local memory, so warn but
		 * continue on.
		 */
		if (!m) {
			pr_err("Failed to allocate trace memory on node %d\n", nid);
			continue;
		}

		pr_info("Allocated trace memory on node %d at 0x%016llx\n", nid, m);

		memtrace_array[memtrace_array_nr].start = m;
		memtrace_array[memtrace_array_nr].size = size;
		memtrace_array[memtrace_array_nr].nid = nid;
		memtrace_array_nr++;
	}

	return 0;
}

static struct dentry *memtrace_debugfs_dir;

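/*
 * Expose each removed chunk under debugfs: <nid>/trace gives the raw
 * contents (via an ioremap() of the removed range), while <nid>/start and
 * <nid>/size give its physical location.
 */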
static int memtrace_init_debugfs(void)
{
	int ret = 0;
	int i;

	for (i = 0; i < memtrace_array_nr; i++) {
		struct dentry *dir;
		struct memtrace_entry *ent = &memtrace_array[i];

		ent->mem = ioremap(ent->start, ent->size);
		/* Warn but continue on */
		if (!ent->mem) {
			pr_err("Failed to map trace memory at 0x%llx\n",
			       ent->start);
			ret = -1;
			continue;
		}

		snprintf(ent->name, 16, "%08x", ent->nid);
		dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);

		ent->dir = dir;
		debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
		debugfs_create_x64("start", 0400, dir, &ent->start);
		debugfs_create_x64("size", 0400, dir, &ent->size);
	}

	return ret;
}

static int online_mem_block(struct memory_block *mem, void *arg)
{
	return device_online(&mem->dev);
}

/*
 * Iterate through the chunks of memory we have removed from the kernel
 * and attempt to add them back to the kernel.
*/
static int memtrace_online(void)
{
	int i, ret = 0;
	struct memtrace_entry *ent;

	for (i = memtrace_array_nr - 1; i >= 0; i--) {
		ent = &memtrace_array[i];

		/* We have onlined this chunk previously */
		if (ent->nid == NUMA_NO_NODE)
			continue;

		/* Remove from io mappings */
		if (ent->mem) {
			iounmap(ent->mem);
			ent->mem = 0;
		}

		if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
			pr_err("Failed to add trace memory to node %d\n",
				ent->nid);
			ret += 1;
			continue;
		}

		lock_device_hotplug();
		walk_memory_blocks(ent->start, ent->size, NULL,
				   online_mem_block);
		unlock_device_hotplug();

		/*
		 * Memory was added successfully so clean up references to it
		 * so on reentry we can tell that this chunk was added.
		 */
		debugfs_remove_recursive(ent->dir);
		pr_info("Added trace memory back to node %d\n", ent->nid);
		ent->size = ent->start = ent->nid = NUMA_NO_NODE;
	}
	if (ret)
		return ret;

	/* If all chunks of memory were added successfully, reset globals */
	kfree(memtrace_array);
	memtrace_array = NULL;
	memtrace_size = 0;
	memtrace_array_nr = 0;
	return 0;
}

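/*
 * Handler for writes to the debugfs "enable" file. Writing a memory-block-
 * aligned size removes that much memory from each node; writing 0 hands any
 * previously removed memory back. With debugfs mounted in the usual place,
 * the file is /sys/kernel/debug/powerpc/memtrace/enable, e.g.:
 *
 *	echo 0x10000000 > /sys/kernel/debug/powerpc/memtrace/enable
 */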
static int memtrace_enable_set(void *data, u64 val)
{
	u64 bytes;

	/*
	 * Don't attempt to do anything if size isn't aligned to a memory
	 * block or equal to zero.
	 */
	bytes = memory_block_size_bytes();
	if (val & (bytes - 1)) {
		pr_err("Value must be aligned with 0x%llx\n", bytes);
		return -EINVAL;
	}

	/* Re-add/online previously removed/offlined memory */
	if (memtrace_size) {
		if (memtrace_online())
			return -EAGAIN;
	}

	if (!val)
		return 0;

	/* Offline and remove memory */
	if (memtrace_init_regions_runtime(val))
		return -EINVAL;

	if (memtrace_init_debugfs())
		return -EINVAL;

	memtrace_size = val;

	return 0;
}

static int memtrace_enable_get(void *data, u64 *val)
{
	*val = memtrace_size;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
			memtrace_enable_set, "0x%016llx\n");

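/*
 * Create the "memtrace" debugfs directory and its "enable" control file at
 * boot. No memory is removed until a size is written to "enable".
 */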
static int memtrace_init(void)
{
	memtrace_debugfs_dir = debugfs_create_dir("memtrace",
						  powerpc_debugfs_root);

	debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
			    NULL, &memtrace_init_fops);

	return 0;
}
machine_device_initcall(powernv, memtrace_init);