2018-03-29 19:07:13 -07:00
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
2015-08-10 23:07:07 -04:00
# include <linux/device.h>
2015-08-10 23:07:06 -04:00
# include <linux/io.h>
2018-08-17 15:47:04 -07:00
# include <linux/kasan.h>
2015-08-17 16:00:35 +02:00
# include <linux/memory_hotplug.h>
2018-08-15 14:22:16 -04:00
# include <linux/mm.h>
# include <linux/pfn_t.h>
2017-09-08 16:11:43 -07:00
# include <linux/swap.h>
2020-01-30 12:06:07 -08:00
# include <linux/mmzone.h>
2017-09-08 16:11:43 -07:00
# include <linux/swapops.h>
2018-08-15 14:22:16 -04:00
# include <linux/types.h>
2018-05-16 11:46:08 -07:00
# include <linux/wait_bit.h>
2018-08-15 14:22:16 -04:00
# include <linux/xarray.h>
2015-08-10 23:07:06 -04:00
2018-08-15 14:22:16 -04:00
/*
 * File-local XArray keyed by pfn: pgmap_array_delete() clears the pfn span
 * PHYS_PFN(res->start)..PHYS_PFN(res->end) of a resource from it.
 * NOTE(review): presumably each entry points at the struct dev_pagemap that
 * owns the pfn -- the store side is not visible in this part of the file.
 */
static DEFINE_XARRAY ( pgmap_array ) ;
2016-01-15 16:56:19 -08:00
2020-01-30 12:06:07 -08:00
/*
* The memremap ( ) and memremap_pages ( ) interfaces are alternately used
* to map persistent memory namespaces . These interfaces place different
* constraints on the alignment and size of the mapping ( namespace ) .
* memremap ( ) can map individual PAGE_SIZE pages . memremap_pages ( ) can
* only map subsections ( 2 MB ) , and on at least one architecture ( PowerPC )
* the minimum mapping granularity of memremap_pages ( ) is 16 MB .
*
* The role of memremap_compat_align ( ) is to communicate the minimum
* arch supported alignment of a namespace such that it can freely
* switch modes without violating the arch constraint . Namely , do not
* allow a namespace to be PAGE_SIZE aligned since that namespace may be
* reconfigured into a mode that requires SUBSECTION_SIZE alignment .
*/
#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
/*
 * Generic fallback, built only when the architecture does not define
 * CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN: the reported minimum alignment
 * is SUBSECTION_SIZE.
 */
unsigned long memremap_compat_align(void)
{
	return SUBSECTION_SIZE;
}
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif /* !CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN */
2019-06-26 14:27:10 +02:00
# ifdef CONFIG_DEV_PAGEMAP_OPS
/*
 * Static key, initially false. devmap_managed_enable_get() switches it on
 * when the enable count goes from zero to one, and
 * devmap_managed_enable_put() switches it off when the count drops back to
 * zero.
 */
DEFINE_STATIC_KEY_FALSE ( devmap_managed_key ) ;
EXPORT_SYMBOL ( devmap_managed_key ) ;
/* Count of devmap_managed_enable_get() calls not yet balanced by a put. */
static atomic_t devmap_managed_enable ;
2019-08-18 11:05:56 +02:00
static void devmap_managed_enable_put ( void )
2019-06-26 14:27:10 +02:00
{
if ( atomic_dec_and_test ( & devmap_managed_enable ) )
static_branch_disable ( & devmap_managed_key ) ;
}
2019-08-18 11:05:56 +02:00
static int devmap_managed_enable_get ( struct dev_pagemap * pgmap )
2019-06-26 14:27:10 +02:00
{
2020-01-30 22:12:24 -08:00
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE & &
( ! pgmap - > ops | | ! pgmap - > ops - > page_free ) ) {
2019-06-26 14:27:10 +02:00
WARN ( 1 , " Missing page_free method \n " ) ;
return - EINVAL ;
}
if ( atomic_inc_return ( & devmap_managed_enable ) = = 1 )
static_branch_enable ( & devmap_managed_key ) ;
2019-08-18 11:05:56 +02:00
return 0 ;
2019-06-26 14:27:10 +02:00
}
# else
2019-08-18 11:05:56 +02:00
static int devmap_managed_enable_get ( struct dev_pagemap * pgmap )
2019-06-26 14:27:10 +02:00
{
return - EINVAL ;
}
2019-08-18 11:05:56 +02:00
/* Variant built when CONFIG_DEV_PAGEMAP_OPS is not defined: does nothing. */
static void devmap_managed_enable_put(void)
{
}
2019-06-26 14:27:10 +02:00
# endif /* CONFIG_DEV_PAGEMAP_OPS */
2018-08-15 14:22:16 -04:00
static void pgmap_array_delete ( struct resource * res )
2017-09-06 16:24:13 -07:00
{
2018-08-15 14:22:16 -04:00
xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) , PHYS_PFN ( res - > end ) ,
NULL , GFP_KERNEL ) ;
2017-09-06 16:24:13 -07:00
synchronize_rcu ( ) ;
2016-01-15 16:56:19 -08:00
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_first ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2019-07-18 15:58:33 -07:00
return PHYS_PFN ( pgmap - > res . start ) +
2019-06-26 14:27:13 +02:00
vmem_altmap_offset ( pgmap_altmap ( pgmap ) ) ;
2016-01-15 16:56:49 -08:00
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_end ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2017-12-29 08:54:04 +01:00
const struct resource * res = & pgmap - > res ;
2016-01-15 16:56:49 -08:00
return ( res - > start + resource_size ( res ) ) > > PAGE_SHIFT ;
}
2018-02-06 19:34:11 -08:00
/*
 * Step to the pfn after @pfn. Whenever @pfn is a multiple of 1024,
 * cond_resched() is called first.
 */
static unsigned long pfn_next(unsigned long pfn)
{
	if (!(pfn % 1024))
		cond_resched();

	return pfn + 1;
}
2016-01-15 16:56:49 -08:00
/*
 * Walk @pfn over the device pfns of @map, from pfn_first(map) up to but not
 * including pfn_end(map). Stepping goes through pfn_next(), which may call
 * cond_resched().
 */
#define for_each_device_pfn(pfn, map) \
	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
2016-01-15 16:56:49 -08:00
2019-06-26 14:27:14 +02:00
static void dev_pagemap_kill ( struct dev_pagemap * pgmap )
{
if ( pgmap - > ops & & pgmap - > ops - > kill )
pgmap - > ops - > kill ( pgmap ) ;
else
percpu_ref_kill ( pgmap - > ref ) ;
}
static void dev_pagemap_cleanup ( struct dev_pagemap * pgmap )
{
if ( pgmap - > ops & & pgmap - > ops - > cleanup ) {
pgmap - > ops - > cleanup ( pgmap ) ;
} else {
wait_for_completion ( & pgmap - > done ) ;
percpu_ref_exit ( pgmap - > ref ) ;
}
2019-08-08 14:43:49 -07:00
/*
* Undo the pgmap ref assignment for the internal case as the
* caller may re - enable the same pgmap .
*/
if ( pgmap - > ref = = & pgmap - > internal_ref )
pgmap - > ref = NULL ;
2019-06-26 14:27:14 +02:00
}
2019-08-18 11:05:57 +02:00
void memunmap_pages ( struct dev_pagemap * pgmap )
2015-08-17 16:00:35 +02:00
{
2017-12-29 08:54:04 +01:00
struct resource * res = & pgmap - > res ;
mm/memunmap: don't access uninitialized memmap in memunmap_pages()
Patch series "mm/memory_hotplug: Shrink zones before removing memory",
v6.
This series fixes the access of uninitialized memmaps when shrinking
zones/nodes and when removing memory. Also, it contains all fixes for
crashes that can be triggered when removing certain namespace using
memunmap_pages() - ZONE_DEVICE, reported by Aneesh.
We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be
more involved (we don't have SECTION_IS_ONLINE as an indicator), and
shrinking is only of limited use (set_zone_contiguous() cannot detect
the ZONE_DEVICE as contiguous).
We continue shrinking !ZONE_DEVICE zones, however, I reduced the amount
of code to a minimum. Shrinking is especially necessary to keep
zone->contiguous set where possible, especially, on memory unplug of
DIMMs at zone boundaries.
--------------------------------------------------------------------------
Zones are now properly shrunk when offlining memory blocks or when
onlining failed. This allows to properly shrink zones on memory unplug
even if the separate memory blocks of a DIMM were onlined to different
zones or re-onlined to a different zone after offlining.
Example:
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
:/# echo "online_movable" > /sys/devices/system/memory/memory41/state
:/# echo "online_movable" > /sys/devices/system/memory/memory43/state
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 98304
present 65536
managed 65536
:/# echo 0 > /sys/devices/system/memory/memory43/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 32768
present 32768
managed 32768
:/# echo 0 > /sys/devices/system/memory/memory41/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
This patch (of 10):
With an altmap, the memmap falling into the reserved altmap space are not
initialized and, therefore, contain a garbage NID and a garbage zone.
Make sure to read the NID/zone from a memmap that was initialized.
This fixes a kernel crash that is observed when destroying a namespace:
kernel BUG at include/linux/mm.h:1107!
cpu 0x1: Vector: 700 (Program Check) at [c000000274087890]
pc: c0000000004b9728: memunmap_pages+0x238/0x340
lr: c0000000004b9724: memunmap_pages+0x234/0x340
...
pid = 3669, comm = ndctl
kernel BUG at include/linux/mm.h:1107!
devm_action_release+0x30/0x50
release_nodes+0x268/0x2d0
device_release_driver_internal+0x174/0x240
unbind_store+0x13c/0x190
drv_attr_store+0x44/0x60
sysfs_kf_write+0x70/0xa0
kernfs_fop_write+0x1ac/0x290
__vfs_write+0x3c/0x70
vfs_write+0xe4/0x200
ksys_write+0x7c/0x140
system_call+0x5c/0x68
The "page_zone(pfn_to_page(pfn)" was introduced by 69324b8f4833 ("mm,
devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support"), however, I
think we will never have driver reserved memory with
MEMORY_DEVICE_PRIVATE (no altmap AFAIKS).
[david@redhat.com: minimze code changes, rephrase description]
Link: http://lkml.kernel.org/r/20191006085646.5768-2-david@redhat.com
Fixes: 2c2a5af6fed2 ("mm, memory_hotplug: add nid parameter to arch_remove_memory")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Damian Tometzki <damian.tometzki@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pankaj Gupta <pagupta@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org> [5.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-18 20:19:39 -07:00
struct page * first_page ;
2017-04-28 10:23:37 -07:00
unsigned long pfn ;
2018-12-28 00:36:22 -08:00
int nid ;
2017-04-28 10:23:37 -07:00
2019-06-26 14:27:14 +02:00
dev_pagemap_kill ( pgmap ) ;
2017-12-29 08:54:04 +01:00
for_each_device_pfn ( pfn , pgmap )
2017-04-28 10:23:37 -07:00
put_page ( pfn_to_page ( pfn ) ) ;
2019-06-26 14:27:14 +02:00
dev_pagemap_cleanup ( pgmap ) ;
2016-01-15 16:56:19 -08:00
mm/memunmap: don't access uninitialized memmap in memunmap_pages()
Patch series "mm/memory_hotplug: Shrink zones before removing memory",
v6.
This series fixes the access of uninitialized memmaps when shrinking
zones/nodes and when removing memory. Also, it contains all fixes for
crashes that can be triggered when removing certain namespace using
memunmap_pages() - ZONE_DEVICE, reported by Aneesh.
We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be
more involved (we don't have SECTION_IS_ONLINE as an indicator), and
shrinking is only of limited use (set_zone_contiguous() cannot detect
the ZONE_DEVICE as contiguous).
We continue shrinking !ZONE_DEVICE zones, however, I reduced the amount
of code to a minimum. Shrinking is especially necessary to keep
zone->contiguous set where possible, especially, on memory unplug of
DIMMs at zone boundaries.
--------------------------------------------------------------------------
Zones are now properly shrunk when offlining memory blocks or when
onlining failed. This allows to properly shrink zones on memory unplug
even if the separate memory blocks of a DIMM were onlined to different
zones or re-onlined to a different zone after offlining.
Example:
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
:/# echo "online_movable" > /sys/devices/system/memory/memory41/state
:/# echo "online_movable" > /sys/devices/system/memory/memory43/state
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 98304
present 65536
managed 65536
:/# echo 0 > /sys/devices/system/memory/memory43/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 32768
present 32768
managed 32768
:/# echo 0 > /sys/devices/system/memory/memory41/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
This patch (of 10):
With an altmap, the memmap falling into the reserved altmap space are not
initialized and, therefore, contain a garbage NID and a garbage zone.
Make sure to read the NID/zone from a memmap that was initialized.
This fixes a kernel crash that is observed when destroying a namespace:
kernel BUG at include/linux/mm.h:1107!
cpu 0x1: Vector: 700 (Program Check) at [c000000274087890]
pc: c0000000004b9728: memunmap_pages+0x238/0x340
lr: c0000000004b9724: memunmap_pages+0x234/0x340
...
pid = 3669, comm = ndctl
kernel BUG at include/linux/mm.h:1107!
devm_action_release+0x30/0x50
release_nodes+0x268/0x2d0
device_release_driver_internal+0x174/0x240
unbind_store+0x13c/0x190
drv_attr_store+0x44/0x60
sysfs_kf_write+0x70/0xa0
kernfs_fop_write+0x1ac/0x290
__vfs_write+0x3c/0x70
vfs_write+0xe4/0x200
ksys_write+0x7c/0x140
system_call+0x5c/0x68
The "page_zone(pfn_to_page(pfn)" was introduced by 69324b8f4833 ("mm,
devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support"), however, I
think we will never have driver reserved memory with
MEMORY_DEVICE_PRIVATE (no altmap AFAIKS).
[david@redhat.com: minimze code changes, rephrase description]
Link: http://lkml.kernel.org/r/20191006085646.5768-2-david@redhat.com
Fixes: 2c2a5af6fed2 ("mm, memory_hotplug: add nid parameter to arch_remove_memory")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Damian Tometzki <damian.tometzki@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pankaj Gupta <pagupta@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org> [5.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-18 20:19:39 -07:00
/* make sure to access a memmap that was actually initialized */
first_page = pfn_to_page ( pfn_first ( pgmap ) ) ;
2015-08-17 16:00:35 +02:00
/* pages are dead and unused, undo the arch mapping */
mm/memunmap: don't access uninitialized memmap in memunmap_pages()
Patch series "mm/memory_hotplug: Shrink zones before removing memory",
v6.
This series fixes the access of uninitialized memmaps when shrinking
zones/nodes and when removing memory. Also, it contains all fixes for
crashes that can be triggered when removing certain namespace using
memunmap_pages() - ZONE_DEVICE, reported by Aneesh.
We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be
more involved (we don't have SECTION_IS_ONLINE as an indicator), and
shrinking is only of limited use (set_zone_contiguous() cannot detect
the ZONE_DEVICE as contiguous).
We continue shrinking !ZONE_DEVICE zones, however, I reduced the amount
of code to a minimum. Shrinking is especially necessary to keep
zone->contiguous set where possible, especially, on memory unplug of
DIMMs at zone boundaries.
--------------------------------------------------------------------------
Zones are now properly shrunk when offlining memory blocks or when
onlining failed. This allows to properly shrink zones on memory unplug
even if the separate memory blocks of a DIMM were onlined to different
zones or re-onlined to a different zone after offlining.
Example:
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
:/# echo "online_movable" > /sys/devices/system/memory/memory41/state
:/# echo "online_movable" > /sys/devices/system/memory/memory43/state
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 98304
present 65536
managed 65536
:/# echo 0 > /sys/devices/system/memory/memory43/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 32768
present 32768
managed 32768
:/# echo 0 > /sys/devices/system/memory/memory41/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
This patch (of 10):
With an altmap, the memmap falling into the reserved altmap space are not
initialized and, therefore, contain a garbage NID and a garbage zone.
Make sure to read the NID/zone from a memmap that was initialized.
This fixes a kernel crash that is observed when destroying a namespace:
kernel BUG at include/linux/mm.h:1107!
cpu 0x1: Vector: 700 (Program Check) at [c000000274087890]
pc: c0000000004b9728: memunmap_pages+0x238/0x340
lr: c0000000004b9724: memunmap_pages+0x234/0x340
...
pid = 3669, comm = ndctl
kernel BUG at include/linux/mm.h:1107!
devm_action_release+0x30/0x50
release_nodes+0x268/0x2d0
device_release_driver_internal+0x174/0x240
unbind_store+0x13c/0x190
drv_attr_store+0x44/0x60
sysfs_kf_write+0x70/0xa0
kernfs_fop_write+0x1ac/0x290
__vfs_write+0x3c/0x70
vfs_write+0xe4/0x200
ksys_write+0x7c/0x140
system_call+0x5c/0x68
The "page_zone(pfn_to_page(pfn)" was introduced by 69324b8f4833 ("mm,
devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support"), however, I
think we will never have driver reserved memory with
MEMORY_DEVICE_PRIVATE (no altmap AFAIKS).
[david@redhat.com: minimze code changes, rephrase description]
Link: http://lkml.kernel.org/r/20191006085646.5768-2-david@redhat.com
Fixes: 2c2a5af6fed2 ("mm, memory_hotplug: add nid parameter to arch_remove_memory")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Damian Tometzki <damian.tometzki@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pankaj Gupta <pagupta@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org> [5.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-18 20:19:39 -07:00
nid = page_to_nid ( first_page ) ;
2018-12-28 00:36:22 -08:00
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2020-02-03 17:34:09 -08:00
remove_pfn_range_from_zone ( page_zone ( first_page ) , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( resource_size ( res ) ) ) ;
2018-12-28 00:35:01 -08:00
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
2020-01-04 12:59:33 -08:00
__remove_pages ( PHYS_PFN ( res - > start ) ,
mm/memunmap: don't access uninitialized memmap in memunmap_pages()
Patch series "mm/memory_hotplug: Shrink zones before removing memory",
v6.
This series fixes the access of uninitialized memmaps when shrinking
zones/nodes and when removing memory. Also, it contains all fixes for
crashes that can be triggered when removing certain namespace using
memunmap_pages() - ZONE_DEVICE, reported by Aneesh.
We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be
more involved (we don't have SECTION_IS_ONLINE as an indicator), and
shrinking is only of limited use (set_zone_contiguous() cannot detect
the ZONE_DEVICE as contiguous).
We continue shrinking !ZONE_DEVICE zones, however, I reduced the amount
of code to a minimum. Shrinking is especially necessary to keep
zone->contiguous set where possible, especially, on memory unplug of
DIMMs at zone boundaries.
--------------------------------------------------------------------------
Zones are now properly shrunk when offlining memory blocks or when
onlining failed. This allows to properly shrink zones on memory unplug
even if the separate memory blocks of a DIMM were onlined to different
zones or re-onlined to a different zone after offlining.
Example:
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
:/# echo "online_movable" > /sys/devices/system/memory/memory41/state
:/# echo "online_movable" > /sys/devices/system/memory/memory43/state
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 98304
present 65536
managed 65536
:/# echo 0 > /sys/devices/system/memory/memory43/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 32768
present 32768
managed 32768
:/# echo 0 > /sys/devices/system/memory/memory41/online
:/# cat /proc/zoneinfo
Node 1, zone Movable
spanned 0
present 0
managed 0
This patch (of 10):
With an altmap, the memmap falling into the reserved altmap space are not
initialized and, therefore, contain a garbage NID and a garbage zone.
Make sure to read the NID/zone from a memmap that was initialized.
This fixes a kernel crash that is observed when destroying a namespace:
kernel BUG at include/linux/mm.h:1107!
cpu 0x1: Vector: 700 (Program Check) at [c000000274087890]
pc: c0000000004b9728: memunmap_pages+0x238/0x340
lr: c0000000004b9724: memunmap_pages+0x234/0x340
...
pid = 3669, comm = ndctl
kernel BUG at include/linux/mm.h:1107!
devm_action_release+0x30/0x50
release_nodes+0x268/0x2d0
device_release_driver_internal+0x174/0x240
unbind_store+0x13c/0x190
drv_attr_store+0x44/0x60
sysfs_kf_write+0x70/0xa0
kernfs_fop_write+0x1ac/0x290
__vfs_write+0x3c/0x70
vfs_write+0xe4/0x200
ksys_write+0x7c/0x140
system_call+0x5c/0x68
The "page_zone(pfn_to_page(pfn)" was introduced by 69324b8f4833 ("mm,
devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support"), however, I
think we will never have driver reserved memory with
MEMORY_DEVICE_PRIVATE (no altmap AFAIKS).
[david@redhat.com: minimze code changes, rephrase description]
Link: http://lkml.kernel.org/r/20191006085646.5768-2-david@redhat.com
Fixes: 2c2a5af6fed2 ("mm, memory_hotplug: add nid parameter to arch_remove_memory")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Damian Tometzki <damian.tometzki@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jun Yao <yaojun8558363@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Pankaj Gupta <pagupta@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qian Cai <cai@lca.pw>
Cc: Rich Felker <dalias@libc.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Wei Yang <richardw.yang@linux.intel.com>
Cc: Will Deacon <will@kernel.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Yu Zhao <yuzhao@google.com>
Cc: <stable@vger.kernel.org> [5.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-10-18 20:19:39 -07:00
PHYS_PFN ( resource_size ( res ) ) , NULL ) ;
2018-12-28 00:35:01 -08:00
} else {
2019-07-18 15:58:33 -07:00
arch_remove_memory ( nid , res - > start , resource_size ( res ) ,
2019-06-26 14:27:13 +02:00
pgmap_altmap ( pgmap ) ) ;
2019-07-18 15:58:33 -07:00
kasan_remove_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-12-28 00:35:01 -08:00
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2017-02-24 14:55:45 -08:00
2019-07-18 15:58:33 -07:00
untrack_pfn ( NULL , PHYS_PFN ( res - > start ) , resource_size ( res ) ) ;
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
2019-08-18 11:05:55 +02:00
WARN_ONCE ( pgmap - > altmap . alloc , " failed to free all reserved pages \n " ) ;
2019-08-18 11:05:56 +02:00
devmap_managed_enable_put ( ) ;
2016-01-15 16:56:19 -08:00
}
2019-08-18 11:05:57 +02:00
EXPORT_SYMBOL_GPL ( memunmap_pages ) ;
/*
 * devm action callback registered by devm_memremap_pages(): @data is the
 * dev_pagemap that was mapped, and it is torn down via memunmap_pages().
 */
static void devm_memremap_pages_release(void *data)
{
	struct dev_pagemap *pgmap = data;

	memunmap_pages(pgmap);
}
2016-01-15 16:56:19 -08:00
2019-06-26 14:27:14 +02:00
/*
 * Release callback installed on pgmap->internal_ref by memremap_pages():
 * look up the dev_pagemap embedding @ref and signal its ->done completion.
 */
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
	complete(&container_of(ref, struct dev_pagemap, internal_ref)->done);
}
2019-08-18 11:05:57 +02:00
/*
* Not device managed version of dev_memremap_pages , undone by
* memunmap_pages ( ) . Please use dev_memremap_pages if you have a struct
* device available .
2016-01-15 16:56:22 -08:00
*/
2019-08-18 11:05:57 +02:00
void * memremap_pages ( struct dev_pagemap * pgmap , int nid )
2015-08-17 16:00:35 +02:00
{
2018-02-06 19:34:11 -08:00
struct resource * res = & pgmap - > res ;
2018-10-26 15:07:52 -07:00
struct dev_pagemap * conflict_pgmap ;
2020-04-10 14:33:21 -07:00
struct mhp_params params = {
2019-05-13 17:21:26 -07:00
/*
* We do not want any optional features only our own memmap
2019-07-18 15:58:33 -07:00
*/
2019-06-26 14:27:13 +02:00
. altmap = pgmap_altmap ( pgmap ) ,
mm/memory_hotplug: add pgprot_t to mhp_params
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are
created with PAGE_KERNEL which implies setting the PAT bits to be WB.
However, on x86, an mtrr register will typically override this and force
the cache type to be UC-. In the case firmware doesn't set this
register it is effectively WB and will typically result in a machine
check exception when it's accessed.
Other arches are not currently likely to function correctly seeing they
don't have any MTRR registers to fall back on.
To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().
Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a
simple change to pass the pgprot_t down to their respective functions
which set up the page tables. For x86_32, set the page tables
explicitly using _set_memory_prot() (seeing they are already mapped).
For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this
should be fine, for now, seeing these architectures don't support
ZONE_DEVICE.
A check in __add_pages() is also added to ensure the pgprot parameter
was set for all arches.
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Badger <ebadger@gigaio.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-10 14:33:36 -07:00
. pgprot = PAGE_KERNEL ,
2019-05-13 17:21:26 -07:00
} ;
2019-08-18 11:05:57 +02:00
int error , is_ram ;
2019-06-26 14:27:10 +02:00
bool need_devmap_managed = true ;
2016-03-09 14:08:13 -08:00
2019-06-26 14:27:07 +02:00
switch ( pgmap - > type ) {
case MEMORY_DEVICE_PRIVATE :
if ( ! IS_ENABLED ( CONFIG_DEVICE_PRIVATE ) ) {
WARN ( 1 , " Device private memory not supported \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-26 14:27:11 +02:00
if ( ! pgmap - > ops | | ! pgmap - > ops - > migrate_to_ram ) {
WARN ( 1 , " Missing migrate_to_ram method \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2020-03-16 20:32:13 +01:00
if ( ! pgmap - > owner ) {
WARN ( 1 , " Missing owner \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-26 14:27:07 +02:00
break ;
case MEMORY_DEVICE_FS_DAX :
if ( ! IS_ENABLED ( CONFIG_ZONE_DEVICE ) | |
IS_ENABLED ( CONFIG_FS_DAX_LIMITED ) ) {
WARN ( 1 , " File system DAX not supported \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
break ;
2020-09-01 10:33:25 +02:00
case MEMORY_DEVICE_GENERIC :
2020-04-10 14:33:39 -07:00
need_devmap_managed = false ;
break ;
2019-06-26 14:27:07 +02:00
case MEMORY_DEVICE_PCI_P2PDMA :
2020-04-10 14:33:39 -07:00
params . pgprot = pgprot_noncached ( params . pgprot ) ;
2019-06-26 14:27:10 +02:00
need_devmap_managed = false ;
2019-06-26 14:27:07 +02:00
break ;
default :
WARN ( 1 , " Invalid pgmap type %d \n " , pgmap - > type ) ;
break ;
}
2019-06-26 14:27:14 +02:00
if ( ! pgmap - > ref ) {
if ( pgmap - > ops & & ( pgmap - > ops - > kill | | pgmap - > ops - > cleanup ) )
return ERR_PTR ( - EINVAL ) ;
init_completion ( & pgmap - > done ) ;
error = percpu_ref_init ( & pgmap - > internal_ref ,
dev_pagemap_percpu_release , 0 , GFP_KERNEL ) ;
if ( error )
return ERR_PTR ( error ) ;
pgmap - > ref = & pgmap - > internal_ref ;
} else {
if ( ! pgmap - > ops | | ! pgmap - > ops - > kill | | ! pgmap - > ops - > cleanup ) {
WARN ( 1 , " Missing reference count teardown definition \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-13 15:56:33 -07:00
}
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
2019-06-26 14:27:10 +02:00
if ( need_devmap_managed ) {
2019-08-18 11:05:56 +02:00
error = devmap_managed_enable_get ( pgmap ) ;
2019-06-26 14:27:10 +02:00
if ( error )
return ERR_PTR ( error ) ;
}
2019-07-18 15:58:33 -07:00
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( res - > start ) , NULL ) ;
2018-07-26 16:37:15 -07:00
if ( conflict_pgmap ) {
2019-08-18 11:05:57 +02:00
WARN ( 1 , " Conflicting mapping in same section \n " ) ;
2018-07-26 16:37:15 -07:00
put_dev_pagemap ( conflict_pgmap ) ;
2019-06-13 15:56:33 -07:00
error = - ENOMEM ;
goto err_array ;
2018-07-26 16:37:15 -07:00
}
2019-07-18 15:58:33 -07:00
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( res - > end ) , NULL ) ;
2018-07-26 16:37:15 -07:00
if ( conflict_pgmap ) {
2019-08-18 11:05:57 +02:00
WARN ( 1 , " Conflicting mapping in same section \n " ) ;
2018-07-26 16:37:15 -07:00
put_dev_pagemap ( conflict_pgmap ) ;
2019-06-13 15:56:33 -07:00
error = - ENOMEM ;
goto err_array ;
2018-07-26 16:37:15 -07:00
}
2019-07-18 15:58:33 -07:00
is_ram = region_intersects ( res - > start , resource_size ( res ) ,
2016-03-14 15:15:51 -07:00
IORESOURCE_SYSTEM_RAM , IORES_DESC_NONE ) ;
2015-08-17 16:00:35 +02:00
2018-12-28 00:34:54 -08:00
if ( is_ram ! = REGION_DISJOINT ) {
WARN_ONCE ( 1 , " %s attempted on %s region %pr \n " , __func__ ,
is_ram = = REGION_MIXED ? " mixed " : " ram " , res ) ;
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
error = - ENXIO ;
goto err_array ;
2015-08-17 16:00:35 +02:00
}
2018-08-15 14:22:16 -04:00
error = xa_err ( xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( res - > end ) , pgmap , GFP_KERNEL ) ) ;
2016-01-15 16:56:19 -08:00
if ( error )
2018-08-15 14:22:16 -04:00
goto err_array ;
2016-01-15 16:56:19 -08:00
2015-08-17 16:00:35 +02:00
if ( nid < 0 )
2015-10-05 20:35:55 -04:00
nid = numa_mem_id ( ) ;
2015-08-17 16:00:35 +02:00
mm/memory_hotplug: add pgprot_t to mhp_params
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are
created with PAGE_KERNEL which implies setting the PAT bits to be WB.
However, on x86, an mtrr register will typically override this and force
the cache type to be UC-. In the case firmware doesn't set this
register it is effectively WB and will typically result in a machine
check exception when it's accessed.
Other arches are not currently likely to function correctly seeing they
don't have any MTRR registers to fall back on.
To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().
Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a
simple change to pass the pgprot_t down to their respective functions
which set up the page tables. For x86_32, set the page tables
explicitly using _set_memory_prot() (seeing they are already mapped).
For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this
should be fine, for now, seeing these architectures don't support
ZONE_DEVICE.
A check in __add_pages() is also added to ensure the pgprot parameter
was set for all arches.
Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Eric Badger <ebadger@gigaio.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-10 14:33:36 -07:00
error = track_pfn_remap ( NULL , & params . pgprot , PHYS_PFN ( res - > start ) ,
0 , resource_size ( res ) ) ;
2016-09-07 08:51:21 -07:00
if ( error )
goto err_pfn_remap ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2018-12-28 00:35:01 -08:00
/*
* For device private memory we call add_pages ( ) as we only need to
* allocate and initialize struct page for the device memory . More -
* over the device memory is un - accessible thus we do not want to
* create a linear mapping for the memory like arch_add_memory ( )
* would do .
*
* For all other device memory types , which are accessible by
* the CPU , we do want the linear mapping and thus use
* arch_add_memory ( ) .
*/
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
2019-07-18 15:58:33 -07:00
error = add_pages ( nid , PHYS_PFN ( res - > start ) ,
2020-04-10 14:33:21 -07:00
PHYS_PFN ( resource_size ( res ) ) , & params ) ;
2018-12-28 00:35:01 -08:00
} else {
2019-07-18 15:58:33 -07:00
error = kasan_add_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-12-28 00:35:01 -08:00
if ( error ) {
mem_hotplug_done ( ) ;
goto err_kasan ;
}
2019-07-18 15:58:33 -07:00
error = arch_add_memory ( nid , res - > start , resource_size ( res ) ,
2020-04-10 14:33:21 -07:00
& params ) ;
2018-12-28 00:35:01 -08:00
}
if ( ! error ) {
struct zone * zone ;
zone = & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ;
2019-07-18 15:58:33 -07:00
move_pfn_range_to_zone ( zone , PHYS_PFN ( res - > start ) ,
2020-04-10 14:33:21 -07:00
PHYS_PFN ( resource_size ( res ) ) , params . altmap ) ;
2018-08-17 15:47:04 -07:00
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2016-01-15 16:56:19 -08:00
if ( error )
goto err_add_memory ;
2015-08-17 16:00:35 +02:00
2018-10-26 15:07:52 -07:00
/*
* Initialization of the pages has been deferred until now in order
* to allow us to do the work while not holding the hotplug lock .
*/
memmap_init_zone_device ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
2019-07-18 15:58:33 -07:00
PHYS_PFN ( res - > start ) ,
PHYS_PFN ( resource_size ( res ) ) , pgmap ) ;
2018-10-26 15:07:52 -07:00
percpu_ref_get_many ( pgmap - > ref , pfn_end ( pgmap ) - pfn_first ( pgmap ) ) ;
2015-08-17 16:00:35 +02:00
return __va ( res - > start ) ;
2016-01-15 16:56:19 -08:00
err_add_memory :
2019-07-18 15:58:33 -07:00
kasan_remove_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-08-17 15:47:04 -07:00
err_kasan :
2019-07-18 15:58:33 -07:00
untrack_pfn ( NULL , PHYS_PFN ( res - > start ) , resource_size ( res ) ) ;
2016-09-07 08:51:21 -07:00
err_pfn_remap :
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
err_array :
2019-06-26 14:27:14 +02:00
dev_pagemap_kill ( pgmap ) ;
dev_pagemap_cleanup ( pgmap ) ;
2019-08-18 11:05:56 +02:00
devmap_managed_enable_put ( ) ;
2016-01-15 16:56:19 -08:00
return ERR_PTR ( error ) ;
2015-08-17 16:00:35 +02:00
}
2019-08-18 11:05:57 +02:00
EXPORT_SYMBOL_GPL ( memremap_pages ) ;
/**
 * devm_memremap_pages - remap and provide memmap backing for the given resource
 * @dev: hosting device for @res
 * @pgmap: pointer to a struct dev_pagemap
 *
 * Device managed wrapper around memremap_pages(): the mapping is created on
 * the node reported by dev_to_node(@dev) and devm_memremap_pages_release()
 * is registered against @dev as the teardown action.
 *
 * Notes:
 * 1/ At a minimum the res and type members of @pgmap must be initialized
 *    by the caller before passing it to this function
 *
 * 2/ The altmap field may optionally be initialized, in which case
 *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
 *
 * 3/ The ref field may optionally be provided, in which case pgmap->ref
 *    must be 'live' on entry and will be killed and reaped at
 *    devm_memremap_pages_release() time, or if this routine fails.
 *
 * 4/ res is expected to be a host memory range that could feasibly be
 *    treated as a "System RAM" range, i.e. not a device mmio range, but
 *    this is not enforced.
 *
 * Return: the address returned by memremap_pages() on success, otherwise
 * an ERR_PTR() carrying the error from memremap_pages() or from registering
 * the release action.
 */
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
	void *addr = memremap_pages(pgmap, dev_to_node(dev));
	int rc;

	if (IS_ERR(addr))
		return addr;

	rc = devm_add_action_or_reset(dev, devm_memremap_pages_release, pgmap);
	return rc ? ERR_PTR(rc) : addr;
}
2018-12-28 00:34:50 -08:00
EXPORT_SYMBOL_GPL ( devm_memremap_pages ) ;
2016-01-15 16:56:22 -08:00
2019-06-13 15:56:21 -07:00
/*
 * Explicit counterpart of devm_memremap_pages(): release the
 * devm_memremap_pages_release() action registered for @pgmap on @dev.
 */
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
	devm_release_action(dev, devm_memremap_pages_release, pgmap);
}
EXPORT_SYMBOL_GPL ( devm_memunmap_pages ) ;
2016-01-15 16:56:22 -08:00
unsigned long vmem_altmap_offset ( struct vmem_altmap * altmap )
{
/* number of pfns from base where pfn_to_page() is valid */
2019-06-26 14:27:13 +02:00
if ( altmap )
return altmap - > reserve + altmap - > free ;
return 0 ;
2016-01-15 16:56:22 -08:00
}
void vmem_altmap_free ( struct vmem_altmap * altmap , unsigned long nr_pfns )
{
altmap - > alloc - = nr_pfns ;
}
2017-12-29 08:54:00 +01:00
/**
* get_dev_pagemap ( ) - take a new live reference on the dev_pagemap for @ pfn
* @ pfn : page frame number to lookup page_map
* @ pgmap : optional known pgmap that already has a reference
*
2017-12-29 08:54:01 +01:00
* If @ pgmap is non - NULL and covers @ pfn it will be returned as - is . If @ pgmap
* is non - NULL but does not cover @ pfn the reference to it will be released .
2017-12-29 08:54:00 +01:00
*/
struct dev_pagemap * get_dev_pagemap ( unsigned long pfn ,
struct dev_pagemap * pgmap )
{
resource_size_t phys = PFN_PHYS ( pfn ) ;
/*
2017-12-29 08:54:01 +01:00
* In the cached case we ' re already holding a live reference .
2017-12-29 08:54:00 +01:00
*/
2017-12-29 08:54:01 +01:00
if ( pgmap ) {
2017-12-29 08:54:04 +01:00
if ( phys > = pgmap - > res . start & & phys < = pgmap - > res . end )
2017-12-29 08:54:01 +01:00
return pgmap ;
put_dev_pagemap ( pgmap ) ;
2017-12-29 08:54:00 +01:00
}
/* fall back to slow path lookup */
rcu_read_lock ( ) ;
2018-08-15 14:22:16 -04:00
pgmap = xa_load ( & pgmap_array , PHYS_PFN ( phys ) ) ;
2017-12-29 08:54:00 +01:00
if ( pgmap & & ! percpu_ref_tryget_live ( pgmap - > ref ) )
pgmap = NULL ;
rcu_read_unlock ( ) ;
return pgmap ;
}
2018-05-16 11:46:08 -07:00
EXPORT_SYMBOL_GPL ( get_dev_pagemap ) ;
2017-09-08 16:11:46 -07:00
2018-05-16 11:46:08 -07:00
# ifdef CONFIG_DEV_PAGEMAP_OPS
2020-01-30 22:12:28 -08:00
void free_devmap_managed_page ( struct page * page )
2017-09-08 16:11:46 -07:00
{
2020-01-30 22:12:24 -08:00
/* notify page idle for dax */
if ( ! is_device_private_page ( page ) ) {
wake_up_var ( & page - > _refcount ) ;
return ;
}
2019-08-13 15:37:07 -07:00
2020-01-30 22:12:24 -08:00
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive ( page ) ;
__ClearPageWaiters ( page ) ;
mem_cgroup_uncharge ( page ) ;
/*
* When a device_private page is freed , the page - > mapping field
* may still contain a ( stale ) mapping value . For example , the
* lower bits of page - > mapping may still identify the page as an
* anonymous page . Ultimately , this entire field is just stale
* and wrong , and it will cause errors if not cleared . One
* example is :
*
* migrate_vma_pages ( )
* migrate_vma_insert_page ( )
* page_add_new_anon_rmap ( )
* __page_set_anon_rmap ( )
* . . . checks page - > mapping , via PageAnon ( page ) call ,
* and incorrectly concludes that the page is an
* anonymous page . Therefore , it incorrectly ,
* silently fails to set up the new anon rmap .
*
* For other types of ZONE_DEVICE pages , migration is either
* handled differently or not done at all , so there is no need
* to clear page - > mapping .
*/
page - > mapping = NULL ;
page - > pgmap - > ops - > page_free ( page ) ;
2017-09-08 16:11:46 -07:00
}
2018-05-16 11:46:08 -07:00
# endif /* CONFIG_DEV_PAGEMAP_OPS */