2018-03-29 19:07:13 -07:00
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
2015-08-10 23:07:07 -04:00
# include <linux/device.h>
2015-08-10 23:07:06 -04:00
# include <linux/io.h>
2018-08-17 15:47:04 -07:00
# include <linux/kasan.h>
2015-08-17 16:00:35 +02:00
# include <linux/memory_hotplug.h>
2018-08-15 14:22:16 -04:00
# include <linux/mm.h>
# include <linux/pfn_t.h>
2017-09-08 16:11:43 -07:00
# include <linux/swap.h>
# include <linux/swapops.h>
2018-08-15 14:22:16 -04:00
# include <linux/types.h>
2018-05-16 11:46:08 -07:00
# include <linux/wait_bit.h>
2018-08-15 14:22:16 -04:00
# include <linux/xarray.h>
2018-12-28 00:39:46 -08:00
# include <linux/hmm.h>
2015-08-10 23:07:06 -04:00
2018-08-15 14:22:16 -04:00
/*
 * Lookup table from page frame number to the dev_pagemap that owns it.
 * devm_memremap_pages() stores a pgmap over the pfn range of its resource,
 * pgmap_array_delete() clears that range again, and get_dev_pagemap()
 * reads it under rcu_read_lock().
 */
static DEFINE_XARRAY ( pgmap_array ) ;
2016-01-15 16:56:19 -08:00
/*
 * SECTION_SIZE is the size in bytes of one physical-address section;
 * SECTION_MASK clears the offset-within-section bits of an address.
 */
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#define SECTION_MASK (~(SECTION_SIZE - 1))
2017-09-08 16:11:43 -07:00
# if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
2018-08-23 17:01:36 -07:00
/*
 * Handle a CPU fault on a device-private swap entry by handing it to the
 * page_fault() callback of the hmm_devmem that embeds the page's pgmap.
 *
 * @vma:   vma in which the fault happened
 * @addr:  faulting address
 * @entry: the device-private swap entry found at @addr
 * @flags: fault flags, passed through to the callback
 * @pmdp:  pmd pointer, passed through to the callback
 *
 * Returns whatever the page_fault() callback returns.
 */
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
				      unsigned long addr,
				      swp_entry_t entry,
				      unsigned int flags,
				      pmd_t *pmdp)
{
	struct page *fault_page = device_private_entry_to_page(entry);
	struct hmm_devmem *owner =
		container_of(fault_page->pgmap, struct hmm_devmem, pagemap);

	/*
	 * The page_fault() callback must migrate page back to system memory
	 * so that CPU can access it. This might fail for various reasons
	 * (device issue, device was unsafely unplugged, ...). When such
	 * error conditions happen, the callback must return VM_FAULT_SIGBUS.
	 *
	 * Note that because memory cgroup charges are accounted to the device
	 * memory, this should never fail because of memory restrictions (but
	 * allocation of regular system page might still fail because we are
	 * out of memory).
	 *
	 * There is a more in-depth description of what that callback can and
	 * cannot do, in include/linux/memremap.h
	 */
	return owner->page_fault(vma, addr, fault_page, flags, pmdp);
}
# endif /* CONFIG_DEVICE_PRIVATE */
2018-08-15 14:22:16 -04:00
static void pgmap_array_delete ( struct resource * res )
2017-09-06 16:24:13 -07:00
{
2018-08-15 14:22:16 -04:00
xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) , PHYS_PFN ( res - > end ) ,
NULL , GFP_KERNEL ) ;
2017-09-06 16:24:13 -07:00
synchronize_rcu ( ) ;
2016-01-15 16:56:19 -08:00
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_first ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2017-12-29 08:54:04 +01:00
const struct resource * res = & pgmap - > res ;
struct vmem_altmap * altmap = & pgmap - > altmap ;
2016-01-15 16:56:49 -08:00
unsigned long pfn ;
pfn = res - > start > > PAGE_SHIFT ;
2017-12-29 08:54:04 +01:00
if ( pgmap - > altmap_valid )
2016-01-15 16:56:49 -08:00
pfn + = vmem_altmap_offset ( altmap ) ;
return pfn ;
}
2017-12-29 08:54:04 +01:00
static unsigned long pfn_end ( struct dev_pagemap * pgmap )
2016-01-15 16:56:49 -08:00
{
2017-12-29 08:54:04 +01:00
const struct resource * res = & pgmap - > res ;
2016-01-15 16:56:49 -08:00
return ( res - > start + resource_size ( res ) ) > > PAGE_SHIFT ;
}
2018-02-06 19:34:11 -08:00
/*
 * Step for for_each_device_pfn(): returns @pfn + 1, calling cond_resched()
 * first whenever @pfn is a multiple of 1024.
 */
static unsigned long pfn_next(unsigned long pfn)
{
	if (!(pfn % 1024))
		cond_resched();

	return pfn + 1;
}
2016-01-15 16:56:49 -08:00
/*
 * Walk @pfn over the page frames of @map, from pfn_first() up to but not
 * including pfn_end(), advancing with pfn_next().
 */
#define for_each_device_pfn(pfn, map) \
	for ((pfn) = pfn_first(map); (pfn) < pfn_end(map); (pfn) = pfn_next(pfn))
2016-01-15 16:56:49 -08:00
2017-12-29 08:54:05 +01:00
static void devm_memremap_pages_release ( void * data )
2015-08-17 16:00:35 +02:00
{
2017-12-29 08:54:04 +01:00
struct dev_pagemap * pgmap = data ;
2017-12-29 08:54:05 +01:00
struct device * dev = pgmap - > dev ;
2017-12-29 08:54:04 +01:00
struct resource * res = & pgmap - > res ;
2016-01-15 16:56:19 -08:00
resource_size_t align_start , align_size ;
2017-04-28 10:23:37 -07:00
unsigned long pfn ;
2018-12-28 00:36:22 -08:00
int nid ;
2017-04-28 10:23:37 -07:00
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
pgmap - > kill ( pgmap - > ref ) ;
2017-12-29 08:54:04 +01:00
for_each_device_pfn ( pfn , pgmap )
2017-04-28 10:23:37 -07:00
put_page ( pfn_to_page ( pfn ) ) ;
2016-01-15 16:56:19 -08:00
2015-08-17 16:00:35 +02:00
/* pages are dead and unused, undo the arch mapping */
2016-01-15 16:56:19 -08:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
2018-01-19 16:27:54 -08:00
align_size = ALIGN ( res - > start + resource_size ( res ) , SECTION_SIZE )
- align_start ;
2017-02-24 14:55:45 -08:00
2018-12-28 00:36:22 -08:00
nid = page_to_nid ( pfn_to_page ( align_start > > PAGE_SHIFT ) ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2018-12-28 00:35:01 -08:00
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
pfn = align_start > > PAGE_SHIFT ;
__remove_pages ( page_zone ( pfn_to_page ( pfn ) ) , pfn ,
align_size > > PAGE_SHIFT , NULL ) ;
} else {
2018-12-28 00:36:22 -08:00
arch_remove_memory ( nid , align_start , align_size ,
2018-12-28 00:35:01 -08:00
pgmap - > altmap_valid ? & pgmap - > altmap : NULL ) ;
kasan_remove_zero_shadow ( __va ( align_start ) , align_size ) ;
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2017-02-24 14:55:45 -08:00
2016-09-07 08:51:21 -07:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
2017-12-29 08:54:04 +01:00
dev_WARN_ONCE ( dev , pgmap - > altmap . alloc ,
" %s: failed to free all reserved pages \n " , __func__ ) ;
2016-01-15 16:56:19 -08:00
}
2016-01-15 16:56:22 -08:00
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @ dev : hosting device for @ res
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
* @ pgmap : pointer to a struct dev_pagemap
2016-01-15 16:56:22 -08:00
*
2016-01-15 16:56:49 -08:00
* Notes :
2017-12-29 08:54:05 +01:00
* 1 / At a minimum the res , ref and type members of @ pgmap must be initialized
* by the caller before passing it to this function
*
* 2 / The altmap field may optionally be initialized , in which case altmap_valid
* must be set to true
*
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
* 3 / pgmap - > ref must be ' live ' on entry and will be killed at
* devm_memremap_pages_release ( ) time , or if this routine fails .
2016-01-15 16:56:49 -08:00
*
2017-12-29 08:54:05 +01:00
* 4 / res is expected to be a host memory range that could feasibly be
2016-01-15 16:56:49 -08:00
* treated as a " System RAM " range , i . e . not a device mmio range , but
* this is not enforced .
2016-01-15 16:56:22 -08:00
*/
2017-12-29 08:54:05 +01:00
void * devm_memremap_pages ( struct device * dev , struct dev_pagemap * pgmap )
2015-08-17 16:00:35 +02:00
{
2017-09-06 16:24:13 -07:00
resource_size_t align_start , align_size , align_end ;
2017-12-29 08:54:05 +01:00
struct vmem_altmap * altmap = pgmap - > altmap_valid ?
& pgmap - > altmap : NULL ;
2018-02-06 19:34:11 -08:00
struct resource * res = & pgmap - > res ;
2018-10-26 15:07:52 -07:00
struct dev_pagemap * conflict_pgmap ;
2019-05-13 17:21:26 -07:00
struct mhp_restrictions restrictions = {
/*
* We do not want any optional features only our own memmap
*/
. altmap = altmap ,
} ;
2016-09-07 08:51:21 -07:00
pgprot_t pgprot = PAGE_KERNEL ;
2018-02-06 19:34:11 -08:00
int error , nid , is_ram ;
2016-03-09 14:08:13 -08:00
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
if ( ! pgmap - > ref | | ! pgmap - > kill )
return ERR_PTR ( - EINVAL ) ;
2016-03-09 14:08:13 -08:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
align_size = ALIGN ( res - > start + resource_size ( res ) , SECTION_SIZE )
- align_start ;
2018-07-26 16:37:15 -07:00
align_end = align_start + align_size - 1 ;
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( align_start ) , NULL ) ;
if ( conflict_pgmap ) {
dev_WARN ( dev , " Conflicting mapping in same section \n " ) ;
put_dev_pagemap ( conflict_pgmap ) ;
return ERR_PTR ( - ENOMEM ) ;
}
conflict_pgmap = get_dev_pagemap ( PHYS_PFN ( align_end ) , NULL ) ;
if ( conflict_pgmap ) {
dev_WARN ( dev , " Conflicting mapping in same section \n " ) ;
put_dev_pagemap ( conflict_pgmap ) ;
return ERR_PTR ( - ENOMEM ) ;
}
2016-03-14 15:15:51 -07:00
is_ram = region_intersects ( align_start , align_size ,
IORESOURCE_SYSTEM_RAM , IORES_DESC_NONE ) ;
2015-08-17 16:00:35 +02:00
2018-12-28 00:34:54 -08:00
if ( is_ram ! = REGION_DISJOINT ) {
WARN_ONCE ( 1 , " %s attempted on %s region %pr \n " , __func__ ,
is_ram = = REGION_MIXED ? " mixed " : " ram " , res ) ;
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
error = - ENXIO ;
goto err_array ;
2015-08-17 16:00:35 +02:00
}
2016-01-15 16:56:22 -08:00
pgmap - > dev = dev ;
2018-08-15 14:22:16 -04:00
error = xa_err ( xa_store_range ( & pgmap_array , PHYS_PFN ( res - > start ) ,
PHYS_PFN ( res - > end ) , pgmap , GFP_KERNEL ) ) ;
2016-01-15 16:56:19 -08:00
if ( error )
2018-08-15 14:22:16 -04:00
goto err_array ;
2016-01-15 16:56:19 -08:00
2015-08-17 16:00:35 +02:00
nid = dev_to_node ( dev ) ;
if ( nid < 0 )
2015-10-05 20:35:55 -04:00
nid = numa_mem_id ( ) ;
2015-08-17 16:00:35 +02:00
2016-09-07 08:51:21 -07:00
error = track_pfn_remap ( NULL , & pgprot , PHYS_PFN ( align_start ) , 0 ,
align_size ) ;
if ( error )
goto err_pfn_remap ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_begin ( ) ;
2018-12-28 00:35:01 -08:00
/*
* For device private memory we call add_pages ( ) as we only need to
* allocate and initialize struct page for the device memory . More -
* over the device memory is un - accessible thus we do not want to
* create a linear mapping for the memory like arch_add_memory ( )
* would do .
*
* For all other device memory types , which are accessible by
* the CPU , we do want the linear mapping and thus use
* arch_add_memory ( ) .
*/
if ( pgmap - > type = = MEMORY_DEVICE_PRIVATE ) {
error = add_pages ( nid , align_start > > PAGE_SHIFT ,
2019-05-13 17:21:26 -07:00
align_size > > PAGE_SHIFT , & restrictions ) ;
2018-12-28 00:35:01 -08:00
} else {
error = kasan_add_zero_shadow ( __va ( align_start ) , align_size ) ;
if ( error ) {
mem_hotplug_done ( ) ;
goto err_kasan ;
}
2019-05-13 17:21:26 -07:00
error = arch_add_memory ( nid , align_start , align_size ,
& restrictions ) ;
2018-12-28 00:35:01 -08:00
}
if ( ! error ) {
struct zone * zone ;
zone = & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ;
move_pfn_range_to_zone ( zone , align_start > > PAGE_SHIFT ,
align_size > > PAGE_SHIFT , altmap ) ;
2018-08-17 15:47:04 -07:00
}
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-10 16:57:36 -08:00
mem_hotplug_done ( ) ;
2016-01-15 16:56:19 -08:00
if ( error )
goto err_add_memory ;
2015-08-17 16:00:35 +02:00
2018-10-26 15:07:52 -07:00
/*
* Initialization of the pages has been deferred until now in order
* to allow us to do the work while not holding the hotplug lock .
*/
memmap_init_zone_device ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
align_start > > PAGE_SHIFT ,
align_size > > PAGE_SHIFT , pgmap ) ;
percpu_ref_get_many ( pgmap - > ref , pfn_end ( pgmap ) - pfn_first ( pgmap ) ) ;
2017-12-29 08:54:05 +01:00
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
error = devm_add_action_or_reset ( dev , devm_memremap_pages_release ,
pgmap ) ;
if ( error )
return ERR_PTR ( error ) ;
2017-12-29 08:54:05 +01:00
2015-08-17 16:00:35 +02:00
return __va ( res - > start ) ;
2016-01-15 16:56:19 -08:00
err_add_memory :
2018-08-17 15:47:04 -07:00
kasan_remove_zero_shadow ( __va ( align_start ) , align_size ) ;
err_kasan :
2016-09-07 08:51:21 -07:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
err_pfn_remap :
2018-08-15 14:22:16 -04:00
pgmap_array_delete ( res ) ;
err_array :
mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The api
currently relies on the fact that the percpu_ref it is using is killed by
the time the devm_memremap_pages_release() is run. Rather than continue
this awkward situation, offload the responsibility of killing the
percpu_ref to devm_memremap_pages_release() directly. This allows
devm_memremap_pages() to do the right thing relative to init failures and
shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny as
small memory allocations almost always succeed. However, the impact of
the failure is large given any future reconfiguration, or disable/enable,
of an nvdimm namespace will fail forever as subsequent calls to
devm_memremap_pages() will fail to setup the pgmap_radix since there will
be stale entries for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, it helps code
readability, tracking the lifetime of a given instance, to be able to grep
the kill routine directly at the devm_memremap_pages() call site.
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse@redhat.com>
Reported-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-12-28 00:34:57 -08:00
pgmap - > kill ( pgmap - > ref ) ;
2016-01-15 16:56:19 -08:00
return ERR_PTR ( error ) ;
2015-08-17 16:00:35 +02:00
}
2018-12-28 00:34:50 -08:00
EXPORT_SYMBOL_GPL ( devm_memremap_pages ) ;
2016-01-15 16:56:22 -08:00
unsigned long vmem_altmap_offset ( struct vmem_altmap * altmap )
{
/* number of pfns from base where pfn_to_page() is valid */
return altmap - > reserve + altmap - > free ;
}
void vmem_altmap_free ( struct vmem_altmap * altmap , unsigned long nr_pfns )
{
altmap - > alloc - = nr_pfns ;
}
2017-12-29 08:54:00 +01:00
/**
* get_dev_pagemap ( ) - take a new live reference on the dev_pagemap for @ pfn
* @ pfn : page frame number to lookup page_map
* @ pgmap : optional known pgmap that already has a reference
*
2017-12-29 08:54:01 +01:00
* If @ pgmap is non - NULL and covers @ pfn it will be returned as - is . If @ pgmap
* is non - NULL but does not cover @ pfn the reference to it will be released .
2017-12-29 08:54:00 +01:00
*/
struct dev_pagemap * get_dev_pagemap ( unsigned long pfn ,
struct dev_pagemap * pgmap )
{
resource_size_t phys = PFN_PHYS ( pfn ) ;
/*
2017-12-29 08:54:01 +01:00
* In the cached case we ' re already holding a live reference .
2017-12-29 08:54:00 +01:00
*/
2017-12-29 08:54:01 +01:00
if ( pgmap ) {
2017-12-29 08:54:04 +01:00
if ( phys > = pgmap - > res . start & & phys < = pgmap - > res . end )
2017-12-29 08:54:01 +01:00
return pgmap ;
put_dev_pagemap ( pgmap ) ;
2017-12-29 08:54:00 +01:00
}
/* fall back to slow path lookup */
rcu_read_lock ( ) ;
2018-08-15 14:22:16 -04:00
pgmap = xa_load ( & pgmap_array , PHYS_PFN ( phys ) ) ;
2017-12-29 08:54:00 +01:00
if ( pgmap & & ! percpu_ref_tryget_live ( pgmap - > ref ) )
pgmap = NULL ;
rcu_read_unlock ( ) ;
return pgmap ;
}
2018-05-16 11:46:08 -07:00
EXPORT_SYMBOL_GPL ( get_dev_pagemap ) ;
2017-09-08 16:11:46 -07:00
2018-05-16 11:46:08 -07:00
# ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE ( devmap_managed_key ) ;
2018-07-26 16:37:22 -07:00
EXPORT_SYMBOL ( devmap_managed_key ) ;
2018-05-16 11:46:08 -07:00
static atomic_t devmap_enable ;
/*
 * The devmap_managed_key static key gates the ->page_free() callbacks that
 * run when dev_pagemap pages go idle.  devmap_enable counts its users.
 *
 * dev_pagemap_get_ops() registers one more user; the key is switched on
 * when the count becomes 1.
 */
void dev_pagemap_get_ops(void)
{
	if (atomic_inc_return(&devmap_enable) != 1)
		return;

	static_branch_enable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL ( dev_pagemap_get_ops ) ;
/*
 * dev_pagemap_put_ops() drops one user from devmap_enable; the
 * devmap_managed_key static key is switched off once the count hits zero.
 */
void dev_pagemap_put_ops(void)
{
	if (!atomic_dec_and_test(&devmap_enable))
		return;

	static_branch_disable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL ( dev_pagemap_put_ops ) ;
void __put_devmap_managed_page ( struct page * page )
2017-09-08 16:11:46 -07:00
{
int count = page_ref_dec_return ( page ) ;
/*
* If refcount is 1 then page is freed and refcount is stable as nobody
* holds a reference on the page .
*/
if ( count = = 1 ) {
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive ( page ) ;
__ClearPageWaiters ( page ) ;
2017-09-08 16:11:54 -07:00
mem_cgroup_uncharge ( page ) ;
2017-09-08 16:11:46 -07:00
page - > pgmap - > page_free ( page , page - > pgmap - > data ) ;
} else if ( ! count )
__put_page ( page ) ;
}
2018-07-26 16:37:22 -07:00
EXPORT_SYMBOL ( __put_devmap_managed_page ) ;
2018-05-16 11:46:08 -07:00
# endif /* CONFIG_DEV_PAGEMAP_OPS */