2018-03-29 19:07:13 -07:00
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
2015-08-10 23:07:07 -04:00
# include <linux/device.h>
2015-08-10 23:07:06 -04:00
# include <linux/io.h>
2018-08-17 15:47:04 -07:00
# include <linux/kasan.h>
2015-08-17 16:00:35 +02:00
# include <linux/memory_hotplug.h>
2018-08-15 14:22:16 -04:00
# include <linux/mm.h>
# include <linux/pfn_t.h>
2017-09-08 16:11:43 -07:00
# include <linux/swap.h>
# include <linux/swapops.h>
2018-08-15 14:22:16 -04:00
# include <linux/types.h>
2018-05-16 11:46:08 -07:00
# include <linux/wait_bit.h>
2018-08-15 14:22:16 -04:00
# include <linux/xarray.h>
2015-08-10 23:07:06 -04:00
2018-08-15 14:22:16 -04:00
/*
 * Lookup table from page frame number to the dev_pagemap covering it.
 * Ranges are written with xa_store_range(), keyed by PHYS_PFN() of the
 * resource bounds, and read back with xa_load() in get_dev_pagemap().
 */
static DEFINE_XARRAY ( pgmap_array ) ;
2016-01-15 16:56:19 -08:00
/*
 * SECTION_SIZE is 2^PA_SECTION_SHIFT bytes; SECTION_MASK clears the low
 * PA_SECTION_SHIFT bits of an address.  Both expansions are fully
 * parenthesized so they stay a single operand wherever they are used.
 */
#define SECTION_MASK (~((1UL << PA_SECTION_SHIFT) - 1))
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
2019-06-26 14:27:10 +02:00
# ifdef CONFIG_DEV_PAGEMAP_OPS
/*
 * Static key that devmap_managed_enable_get() switches on for its first
 * user and devmap_managed_enable_put() switches off after the last one.
 */
DEFINE_STATIC_KEY_FALSE ( devmap_managed_key ) ;
EXPORT_SYMBOL ( devmap_managed_key ) ;
/* Number of users currently holding devmap_managed_key enabled. */
static atomic_t devmap_managed_enable ;
/*
 * devm action registered by devmap_managed_enable_get(): drop one user of
 * devmap_managed_key and switch the key off when the count reaches zero.
 * @data is not used.
 */
static void devmap_managed_enable_put(void *data)
{
	if (!atomic_dec_and_test(&devmap_managed_enable))
		return;

	static_branch_disable(&devmap_managed_key);
}
static int devmap_managed_enable_get ( struct device * dev , struct dev_pagemap * pgmap )
{
2019-06-26 14:27:14 +02:00
if ( ! pgmap - > ops | | ! pgmap - > ops - > page_free ) {
2019-06-26 14:27:10 +02:00
WARN ( 1 , " Missing page_free method \n " ) ;
return - EINVAL ;
}
if ( atomic_inc_return ( & devmap_managed_enable ) = = 1 )
static_branch_enable ( & devmap_managed_key ) ;
return devm_add_action_or_reset ( dev , devmap_managed_enable_put , NULL ) ;
}
# else
/*
 * Variant built when CONFIG_DEV_PAGEMAP_OPS is not set: neither argument is
 * looked at and the request always fails with -EINVAL.
 */
static int devmap_managed_enable_get(struct device *dev,
		struct dev_pagemap *pgmap)
{
	return -EINVAL;
}
# endif /* CONFIG_DEV_PAGEMAP_OPS */
2018-08-15 14:22:16 -04:00
static void pgmap_array_delete ( struct resource * res )
2017-09-06 16:24:13 -07:00
{
2018-08-15 14:22:16 -04:00
xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) , PHYS_PFN ( res - > end ) ,
NULL , GFP_KERNEL ) ;
2017-09-06 16:24:13 -07:00
synchronize_rcu ( ) ;
2016-01-15 16:56:19 -08:00
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_first ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2019-07-18 15:58:33 -07:00
return PHYS_PFN ( pgmap - > res . start ) +
2019-06-26 14:27:13 +02:00
vmem_altmap_offset ( pgmap_altmap ( pgmap ) ) ;
2016-01-15 16:56:49 -08:00
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_end ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2017-12-29 08:54:04 +01:00
const struct resource * res = & pgmap - > res ;
2016-01-15 16:56:49 -08:00
return ( res - > start + resource_size ( res ) ) > > PAGE_SHIFT ;
}
2018-02-06 19:34:11 -08:00
/*
 * Step to the pfn after @pfn.  cond_resched() is called whenever @pfn is a
 * multiple of 1024 so that long walks give the scheduler a chance to run.
 */
static unsigned long pfn_next(unsigned long pfn)
{
	if (!(pfn % 1024))
		cond_resched();

	return pfn + 1;
}
2016-01-15 16:56:49 -08:00
/*
 * Iterate @pfn over [pfn_first(@map), pfn_end(@map)), advancing with
 * pfn_next() so the walk periodically calls cond_resched().
 */
#define for_each_device_pfn(pfn, map) \
	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
2016-01-15 16:56:49 -08:00
2019-06-26 14:27:14 +02:00
static void dev_pagemap_kill ( struct dev_pagemap * pgmap )
{
if ( pgmap - > ops & & pgmap - > ops - > kill )
pgmap - > ops - > kill ( pgmap ) ;
else
percpu_ref_kill ( pgmap - > ref ) ;
}
static void dev_pagemap_cleanup ( struct dev_pagemap * pgmap )
{
if ( pgmap - > ops & & pgmap - > ops - > cleanup ) {
pgmap - > ops - > cleanup ( pgmap ) ;
} else {
wait_for_completion ( & pgmap - > done ) ;
percpu_ref_exit ( pgmap - > ref ) ;
}
2019-08-08 14:43:49 -07:00
/*
* Undo the pgmap ref assignment for the internal case as the
* caller may re - enable the same pgmap .
*/
if ( pgmap - > ref = = & pgmap - > internal_ref )
pgmap - > ref = NULL ;
2019-06-26 14:27:14 +02:00
}
2017-12-29 08:54:05 +01:00
static void devm_memremap_pages_release ( void * data )
2015-08-17 16:00:35 +02:00
{
2017-12-29 08:54:04 +01:00
struct dev_pagemap * pgmap = data ;
2017-12-29 08:54:05 +01:00
struct device * dev = pgmap - > dev ;
2017-12-29 08:54:04 +01:00
struct resource * res = & pgmap - > res ;
2017-04-28 10:23:37 -07:00
unsigned long pfn ;
2018-12-28 00:36:22 -08:00
int nid ;
2017-04-28 10:23:37 -07:00
2019-06-26 14:27:14 +02:00
dev_pagemap_kill ( pgmap ) ;
2017-12-29 08:54:04 +01:00
for_each_device_pfn ( pfn , pgmap )
2017-04-28 10:23:37 -07:00
put_page ( pfn_to_page ( pfn ) ) ;
2019-06-26 14:27:14 +02:00
dev_pagemap_cleanup ( pgmap ) ;
2016-01-15 16:56:19 -08:00
2015-08-17 16:00:35 +02:00
/* pages are dead and unused, undo the arch mapping */
2019-07-18 15:58:33 -07:00
nid = page_to_nid ( pfn_to_page ( PHYS_PFN ( res - > start ) ) ) ;
2018-12-28 00:36:22 -08:00
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2018-12-28 00:35:01 -08:00
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
2019-07-18 15:58:33 -07:00
pfn = PHYS_PFN ( res - > start ) ;
2018-12-28 00:35:01 -08:00
__remove_pages ( page_zone ( pfn_to_page ( pfn ) ) , pfn ,
2019-07-18 15:58:33 -07:00
PHYS_PFN ( resource_size ( res ) ) , NULL ) ;
2018-12-28 00:35:01 -08:00
} else {
2019-07-18 15:58:33 -07:00
arch_remove_memory ( nid , res - > start , resource_size ( res ) ,
2019-06-26 14:27:13 +02:00
pgmap_altmap ( pgmap ) ) ;
2019-07-18 15:58:33 -07:00
kasan_remove_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-12-28 00:35:01 -08:00
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2017-02-24 14:55:45 -08:00
2019-07-18 15:58:33 -07:00
untrack_pfn ( NULL , PHYS_PFN ( res - > start ) , resource_size ( res ) ) ;
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
2017-12-29 08:54:04 +01:00
dev_WARN_ONCE ( dev , pgmap - > altmap . alloc ,
" %s: failed to free all reserved pages \n " , __func__ ) ;
2016-01-15 16:56:19 -08:00
}
2019-06-26 14:27:14 +02:00
static void dev_pagemap_percpu_release ( struct percpu_ref * ref )
{
struct dev_pagemap * pgmap =
container_of ( ref , struct dev_pagemap , internal_ref ) ;
complete ( & pgmap - > done ) ;
}
2016-01-15 16:56:22 -08:00
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @ dev : hosting device for @ res
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
* @ pgmap : pointer to a struct dev_pagemap
2016-01-15 16:56:22 -08:00
*
2016-01-15 16:56:49 -08:00
* Notes :
2019-06-26 14:27:14 +02:00
* 1 / At a minimum the res and type members of @ pgmap must be initialized
* by the caller before passing it to this function
2017-12-29 08:54:05 +01:00
*
2019-06-26 14:27:13 +02:00
* 2 / The altmap field may optionally be initialized , in which case
* PGMAP_ALTMAP_VALID must be set in pgmap - > flags .
2017-12-29 08:54:05 +01:00
*
2019-06-26 14:27:14 +02:00
* 3 / The ref field may optionally be provided , in which pgmap - > ref must be
* ' live ' on entry and will be killed and reaped at
* devm_memremap_pages_release ( ) time , or if this routine fails .
2016-01-15 16:56:49 -08:00
*
2017-12-29 08:54:05 +01:00
* 4 / res is expected to be a host memory range that could feasibly be
2016-01-15 16:56:49 -08:00
* treated as a " System RAM " range , i . e . not a device mmio range , but
* this is not enforced .
2016-01-15 16:56:22 -08:00
*/
2017-12-29 08:54:05 +01:00
void * devm_memremap_pages ( struct device * dev , struct dev_pagemap * pgmap )
2015-08-17 16:00:35 +02:00
{
2018-02-06 19:34:11 -08:00
struct resource * res = & pgmap - > res ;
2018-10-26 15:07:52 -07:00
struct dev_pagemap * conflict_pgmap ;
2019-05-13 17:21:26 -07:00
struct mhp_restrictions restrictions = {
/*
* We do not want any optional features only our own memmap
2019-07-18 15:58:33 -07:00
*/
2019-06-26 14:27:13 +02:00
. altmap = pgmap_altmap ( pgmap ) ,
2019-05-13 17:21:26 -07:00
} ;
2016-09-07 08:51:21 -07:00
pgprot_t pgprot = PAGE_KERNEL ;
2018-02-06 19:34:11 -08:00
int error , nid , is_ram ;
2019-06-26 14:27:10 +02:00
bool need_devmap_managed = true ;
2016-03-09 14:08:13 -08:00
2019-06-26 14:27:07 +02:00
switch ( pgmap - > type ) {
case MEMORY_DEVICE_PRIVATE :
if ( ! IS_ENABLED ( CONFIG_DEVICE_PRIVATE ) ) {
WARN ( 1 , " Device private memory not supported \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-26 14:27:11 +02:00
if ( ! pgmap - > ops | | ! pgmap - > ops - > migrate_to_ram ) {
WARN ( 1 , " Missing migrate_to_ram method \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-26 14:27:07 +02:00
break ;
case MEMORY_DEVICE_FS_DAX :
if ( ! IS_ENABLED ( CONFIG_ZONE_DEVICE ) | |
IS_ENABLED ( CONFIG_FS_DAX_LIMITED ) ) {
WARN ( 1 , " File system DAX not supported \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
break ;
case MEMORY_DEVICE_DEVDAX :
case MEMORY_DEVICE_PCI_P2PDMA :
2019-06-26 14:27:10 +02:00
need_devmap_managed = false ;
2019-06-26 14:27:07 +02:00
break ;
default :
WARN ( 1 , " Invalid pgmap type %d \n " , pgmap - > type ) ;
break ;
}
2019-06-26 14:27:14 +02:00
if ( ! pgmap - > ref ) {
if ( pgmap - > ops & & ( pgmap - > ops - > kill | | pgmap - > ops - > cleanup ) )
return ERR_PTR ( - EINVAL ) ;
init_completion ( & pgmap - > done ) ;
error = percpu_ref_init ( & pgmap - > internal_ref ,
dev_pagemap_percpu_release , 0 , GFP_KERNEL ) ;
if ( error )
return ERR_PTR ( error ) ;
pgmap - > ref = & pgmap - > internal_ref ;
} else {
if ( ! pgmap - > ops | | ! pgmap - > ops - > kill | | ! pgmap - > ops - > cleanup ) {
WARN ( 1 , " Missing reference count teardown definition \n " ) ;
return ERR_PTR ( - EINVAL ) ;
}
2019-06-13 15:56:33 -07:00
}
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
2019-06-26 14:27:10 +02:00
if ( need_devmap_managed ) {
error = devmap_managed_enable_get ( dev , pgmap ) ;
if ( error )
return ERR_PTR ( error ) ;
}
2019-07-18 15:58:33 -07:00
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( res - > start ) , NULL ) ;
2018-07-26 16:37:15 -07:00
if ( conflict_pgmap ) {
dev_WARN ( dev , " Conflicting mapping in same section \n " ) ;
put_dev_pagemap ( conflict_pgmap ) ;
2019-06-13 15:56:33 -07:00
error = - ENOMEM ;
goto err_array ;
2018-07-26 16:37:15 -07:00
}
2019-07-18 15:58:33 -07:00
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( res - > end ) , NULL ) ;
2018-07-26 16:37:15 -07:00
if ( conflict_pgmap ) {
dev_WARN ( dev , " Conflicting mapping in same section \n " ) ;
put_dev_pagemap ( conflict_pgmap ) ;
2019-06-13 15:56:33 -07:00
error = - ENOMEM ;
goto err_array ;
2018-07-26 16:37:15 -07:00
}
2019-07-18 15:58:33 -07:00
is_ram = region_intersects ( res - > start , resource_size ( res ) ,
2016-03-14 15:15:51 -07:00
IORESOURCE_SYSTEM_RAM , IORES_DESC_NONE ) ;
2015-08-17 16:00:35 +02:00
2018-12-28 00:34:54 -08:00
if ( is_ram ! = REGION_DISJOINT ) {
WARN_ONCE ( 1 , " %s attempted on %s region %pr \n " , __func__ ,
is_ram = = REGION_MIXED ? " mixed " : " ram " , res ) ;
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
error = - ENXIO ;
goto err_array ;
2015-08-17 16:00:35 +02:00
}
2016-01-15 16:56:22 -08:00
pgmap - > dev = dev ;
2018-08-15 14:22:16 -04:00
error = xa_err ( xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( res - > end ) , pgmap , GFP_KERNEL ) ) ;
2016-01-15 16:56:19 -08:00
if ( error )
2018-08-15 14:22:16 -04:00
goto err_array ;
2016-01-15 16:56:19 -08:00
2015-08-17 16:00:35 +02:00
nid = dev_to_node ( dev ) ;
if ( nid < 0 )
2015-10-05 20:35:55 -04:00
nid = numa_mem_id ( ) ;
2015-08-17 16:00:35 +02:00
2019-07-18 15:58:33 -07:00
error = track_pfn_remap ( NULL , & pgprot , PHYS_PFN ( res - > start ) , 0 ,
resource_size ( res ) ) ;
2016-09-07 08:51:21 -07:00
if ( error )
goto err_pfn_remap ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2018-12-28 00:35:01 -08:00
/*
* For device private memory we call add_pages ( ) as we only need to
* allocate and initialize struct page for the device memory . More -
* over the device memory is un - accessible thus we do not want to
* create a linear mapping for the memory like arch_add_memory ( )
* would do .
*
* For all other device memory types , which are accessible by
* the CPU , we do want the linear mapping and thus use
* arch_add_memory ( ) .
*/
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
2019-07-18 15:58:33 -07:00
error = add_pages ( nid , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( resource_size ( res ) ) , & restrictions ) ;
2018-12-28 00:35:01 -08:00
} else {
2019-07-18 15:58:33 -07:00
error = kasan_add_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-12-28 00:35:01 -08:00
if ( error ) {
mem_hotplug_done ( ) ;
goto err_kasan ;
}
2019-07-18 15:58:33 -07:00
error = arch_add_memory ( nid , res - > start , resource_size ( res ) ,
2019-05-13 17:21:26 -07:00
& restrictions ) ;
2018-12-28 00:35:01 -08:00
}
if ( ! error ) {
struct zone * zone ;
zone = & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ;
2019-07-18 15:58:33 -07:00
move_pfn_range_to_zone ( zone , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( resource_size ( res ) ) , restrictions . altmap ) ;
2018-08-17 15:47:04 -07:00
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2016-01-15 16:56:19 -08:00
if ( error )
goto err_add_memory ;
2015-08-17 16:00:35 +02:00
2018-10-26 15:07:52 -07:00
/*
* Initialization of the pages has been deferred until now in order
* to allow us to do the work while not holding the hotplug lock .
*/
memmap_init_zone_device ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
2019-07-18 15:58:33 -07:00
PHYS_PFN ( res - > start ) ,
PHYS_PFN ( resource_size ( res ) ) , pgmap ) ;
2018-10-26 15:07:52 -07:00
percpu_ref_get_many ( pgmap - > ref , pfn_end ( pgmap ) - pfn_first ( pgmap ) ) ;
2017-12-29 08:54:05 +01:00
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
error = devm_add_action_or_reset ( dev , devm_memremap_pages_release ,
pgmap ) ;
if ( error )
return ERR_PTR ( error ) ;
2017-12-29 08:54:05 +01:00
2015-08-17 16:00:35 +02:00
return __va ( res - > start ) ;
2016-01-15 16:56:19 -08:00
err_add_memory :
2019-07-18 15:58:33 -07:00
kasan_remove_zero_shadow ( __va ( res - > start ) , resource_size ( res ) ) ;
2018-08-17 15:47:04 -07:00
err_kasan :
2019-07-18 15:58:33 -07:00
untrack_pfn ( NULL , PHYS_PFN ( res - > start ) , resource_size ( res ) ) ;
2016-09-07 08:51:21 -07:00
err_pfn_remap :
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
err_array :
2019-06-26 14:27:14 +02:00
dev_pagemap_kill ( pgmap ) ;
dev_pagemap_cleanup ( pgmap ) ;
2016-01-15 16:56:19 -08:00
return ERR_PTR ( error ) ;
2015-08-17 16:00:35 +02:00
}
2018-12-28 00:34:50 -08:00
EXPORT_SYMBOL_GPL ( devm_memremap_pages ) ;
2016-01-15 16:56:22 -08:00
2019-06-13 15:56:21 -07:00
/**
 * devm_memunmap_pages - release a devm_memremap_pages() mapping early
 * @dev: hosting device the mapping was registered against
 * @pgmap: the pagemap that was passed to devm_memremap_pages()
 *
 * Hands the devm_memremap_pages_release() action registered for @pgmap to
 * devm_release_action().
 */
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
	devm_release_action(dev, devm_memremap_pages_release, pgmap);
}
EXPORT_SYMBOL_GPL ( devm_memunmap_pages ) ;
2016-01-15 16:56:22 -08:00
unsigned long vmem_altmap_offset ( struct vmem_altmap * altmap )
{
/* number of pfns from base where pfn_to_page() is valid */
2019-06-26 14:27:13 +02:00
if ( altmap )
return altmap - > reserve + altmap - > free ;
return 0 ;
2016-01-15 16:56:22 -08:00
}
void vmem_altmap_free ( struct vmem_altmap * altmap , unsigned long nr_pfns )
{
altmap - > alloc - = nr_pfns ;
}
2017-12-29 08:54:00 +01:00
/**
* get_dev_pagemap ( ) - take a new live reference on the dev_pagemap for @ pfn
* @ pfn : page frame number to lookup page_map
* @ pgmap : optional known pgmap that already has a reference
*
2017-12-29 08:54:01 +01:00
* If @ pgmap is non - NULL and covers @ pfn it will be returned as - is . If @ pgmap
* is non - NULL but does not cover @ pfn the reference to it will be released .
2017-12-29 08:54:00 +01:00
*/
struct dev_pagemap * get_dev_pagemap ( unsigned long pfn ,
struct dev_pagemap * pgmap )
{
resource_size_t phys = PFN_PHYS ( pfn ) ;
/*
2017-12-29 08:54:01 +01:00
* In the cached case we ' re already holding a live reference .
2017-12-29 08:54:00 +01:00
*/
2017-12-29 08:54:01 +01:00
if ( pgmap ) {
2017-12-29 08:54:04 +01:00
if ( phys > = pgmap - > res . start & & phys < = pgmap - > res . end )
2017-12-29 08:54:01 +01:00
return pgmap ;
put_dev_pagemap ( pgmap ) ;
2017-12-29 08:54:00 +01:00
}
/* fall back to slow path lookup */
rcu_read_lock ( ) ;
2018-08-15 14:22:16 -04:00
pgmap = xa_load ( & pgmap_array , PHYS_PFN ( phys ) ) ;
2017-12-29 08:54:00 +01:00
if ( pgmap & & ! percpu_ref_tryget_live ( pgmap - > ref ) )
pgmap = NULL ;
rcu_read_unlock ( ) ;
return pgmap ;
}
2018-05-16 11:46:08 -07:00
EXPORT_SYMBOL_GPL ( get_dev_pagemap ) ;
2017-09-08 16:11:46 -07:00
2018-05-16 11:46:08 -07:00
# ifdef CONFIG_DEV_PAGEMAP_OPS
void __put_devmap_managed_page ( struct page * page )
2017-09-08 16:11:46 -07:00
{
int count = page_ref_dec_return ( page ) ;
/*
* If refcount is 1 then page is freed and refcount is stable as nobody
* holds a reference on the page .
*/
if ( count = = 1 ) {
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive ( page ) ;
__ClearPageWaiters ( page ) ;
2017-09-08 16:11:54 -07:00
mem_cgroup_uncharge ( page ) ;
2017-09-08 16:11:46 -07:00
2019-08-13 15:37:07 -07:00
/*
* When a device_private page is freed , the page - > mapping field
* may still contain a ( stale ) mapping value . For example , the
* lower bits of page - > mapping may still identify the page as
* an anonymous page . Ultimately , this entire field is just
* stale and wrong , and it will cause errors if not cleared .
* One example is :
*
* migrate_vma_pages ( )
* migrate_vma_insert_page ( )
* page_add_new_anon_rmap ( )
* __page_set_anon_rmap ( )
* . . . checks page - > mapping , via PageAnon ( page ) call ,
* and incorrectly concludes that the page is an
* anonymous page . Therefore , it incorrectly ,
* silently fails to set up the new anon rmap .
*
* For other types of ZONE_DEVICE pages , migration is either
* handled differently or not done at all , so there is no need
* to clear page - > mapping .
*/
if ( is_device_private_page ( page ) )
page - > mapping = NULL ;
2019-06-26 14:27:12 +02:00
page - > pgmap - > ops - > page_free ( page ) ;
2017-09-08 16:11:46 -07:00
} else if ( ! count )
__put_page ( page ) ;
}
2018-07-26 16:37:22 -07:00
EXPORT_SYMBOL ( __put_devmap_managed_page ) ;
2018-05-16 11:46:08 -07:00
# endif /* CONFIG_DEV_PAGEMAP_OPS */