2015-08-11 06:07:06 +03:00
/*
* Copyright ( c ) 2015 Intel Corporation . All rights reserved .
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful , but
* WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*/
2016-01-16 03:56:19 +03:00
# include <linux/radix-tree.h>
# include <linux/memremap.h>
2015-08-11 06:07:07 +03:00
# include <linux/device.h>
2015-08-11 06:07:06 +03:00
# include <linux/types.h>
2016-01-16 03:56:14 +03:00
# include <linux/pfn_t.h>
2015-08-11 06:07:06 +03:00
# include <linux/io.h>
# include <linux/mm.h>
2015-08-17 17:00:35 +03:00
# include <linux/memory_hotplug.h>
2015-08-11 06:07:06 +03:00
#ifndef ioremap_cache
/*
 * Weak stand-in compiled only when no ioremap_cache macro is defined:
 * the request is forwarded unchanged to ioremap().  Transitional, kept
 * while existing ioremap_cache() users are converted to memremap().
 */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
	void __iomem *mapping = ioremap(offset, size);

	return mapping;
}
#endif
memremap: add arch specific hook for MEMREMAP_WB mappings
Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.
So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
2016-02-22 17:02:07 +03:00
#ifndef arch_memremap_wb
/*
 * Default MEMREMAP_WB backend, used only when the architecture has not
 * defined arch_memremap_wb itself: obtain the mapping from
 * ioremap_cache() and strip the __iomem annotation from the result.
 */
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
	void __iomem *mapping = ioremap_cache(offset, size);

	return (__force void *)mapping;
}
#endif
2015-10-26 23:55:56 +03:00
/*
 * try_ram_remap() - try to satisfy a remap request from the linear map
 * @offset: physical start address of the request
 * @size: length of the request (not consulted here)
 *
 * Returns __va(@offset) when the first pfn of the range is valid and its
 * page is not a highmem page.  Otherwise returns NULL so that the caller
 * falls back to arch_memremap_wb().
 */
static void *try_ram_remap(resource_size_t offset, size_t size)
{
	unsigned long pfn = PHYS_PFN(offset);

	if (!pfn_valid(pfn))
		return NULL;

	if (PageHighMem(pfn_to_page(pfn)))
		return NULL;

	/* Simple case: the existing linear address can be handed back. */
	return __va(offset);
}
2015-08-11 06:07:06 +03:00
/**
 * memremap() - remap an iomem_resource as cacheable memory
 * @offset: iomem resource start address
 * @size: size of remap
 * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
 *
 * memremap() is "ioremap" for resources that are known to have no i/o
 * side effects and for which the __iomem annotation does not apply.
 * When several flags are passed, the mapping types are tried in the
 * order listed below and the first one that succeeds is returned.
 *
 * MEMREMAP_WB - matches the default mapping for System RAM on the
 * architecture, usually a read-allocate write-back cache.  Moreover,
 * when MEMREMAP_WB is requested and the region is RAM, memremap() skips
 * creating a new mapping and returns a pointer into the direct map.
 *
 * MEMREMAP_WT - establish a mapping whereby writes either bypass the
 * cache or are written through to memory and never exist in a
 * cache-dirty state with respect to program visibility.  Attempts to
 * map System RAM with this mapping type will fail.
 *
 * MEMREMAP_WC - establish a writecombine mapping, whereby writes may be
 * coalesced together (e.g. in the CPU's write buffers), but is
 * otherwise uncached.  Attempts to map System RAM with this mapping
 * type will fail.
 *
 * Return: the mapped address, or NULL when @flags is zero, when the
 * range mixes System RAM with other resources, or when no requested
 * mapping type could be established.
 */
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
	int is_ram = region_intersects(offset, size,
				       IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
	const int in_ram = (is_ram == REGION_INTERSECTS);
	void *addr = NULL;

	if (!flags)
		return NULL;

	if (is_ram == REGION_MIXED) {
		WARN_ONCE(1, " memremap attempted on mixed range %pa size: %#lx \n ",
			  &offset, (unsigned long)size);
		return NULL;
	}

	/* Walk the requested mapping types until one yields an address. */
	if (flags & MEMREMAP_WB) {
		/*
		 * MEMREMAP_WB is special: it can be satisfied from the
		 * direct map.  Some archs rely on memremap() detecting
		 * by itself that the range is potentially in System RAM.
		 */
		if (in_ram)
			addr = try_ram_remap(offset, size);
		if (!addr)
			addr = arch_memremap_wb(offset, size);
	}

	if (addr)
		return addr;

	/*
	 * Still unmapped, so any remaining request flag would create a
	 * new virtual address mapping.  Refuse to let such a mapping
	 * alias System RAM.
	 */
	if (in_ram && flags != MEMREMAP_WB) {
		WARN_ONCE(1, " memremap attempted on ram %pa size: %#lx \n ",
			  &offset, (unsigned long)size);
		return NULL;
	}

	if (flags & MEMREMAP_WT)
		addr = ioremap_wt(offset, size);

	if (!addr && (flags & MEMREMAP_WC))
		addr = ioremap_wc(offset, size);

	return addr;
}
EXPORT_SYMBOL(memremap);
/**
 * memunmap() - undo a mapping returned by memremap()
 * @addr: address previously returned by memremap()
 *
 * Only addresses for which is_vmalloc_addr() is true are handed to
 * iounmap(); any other address is left untouched.
 */
void memunmap(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return;

	iounmap((void __iomem *)addr);
}
EXPORT_SYMBOL(memunmap);
2015-08-11 06:07:07 +03:00
/*
 * devres release callback for devm_memremap(): @res is the devres
 * payload, a single pointer slot holding the mapped address, which is
 * passed to memunmap().  @dev is not used.
 */
static void devm_memremap_release(struct device *dev, void *res)
{
	void **slot = res;

	memunmap(*slot);
}
/*
 * devres match callback: non-zero when the address stored in the devres
 * payload @res equals @match_data.  @dev is not used.
 */
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
	void **slot = res;

	return *slot == match_data;
}
/**
 * devm_memremap() - device-managed memremap()
 * @dev: device the mapping is registered against
 * @offset: start address handed to memremap()
 * @size: size handed to memremap()
 * @flags: flags handed to memremap()
 *
 * Allocates a devres entry whose release callback is
 * devm_memremap_release(), performs the memremap() and, on success,
 * records the address in that entry and attaches it to @dev.
 *
 * Return: the mapped address, ERR_PTR(-ENOMEM) when the devres entry
 * cannot be allocated, or ERR_PTR(-ENXIO) when memremap() returns NULL.
 */
void *devm_memremap(struct device *dev, resource_size_t offset,
		size_t size, unsigned long flags)
{
	void **slot;
	void *mapping;

	slot = devres_alloc_node(devm_memremap_release, sizeof(*slot),
				 GFP_KERNEL, dev_to_node(dev));
	if (!slot)
		return ERR_PTR(-ENOMEM);

	mapping = memremap(offset, size, flags);
	if (!mapping) {
		/* Nothing was mapped: discard the unused devres entry. */
		devres_free(slot);
		return ERR_PTR(-ENXIO);
	}

	*slot = mapping;
	devres_add(dev, slot);

	return mapping;
}
EXPORT_SYMBOL(devm_memremap);
void devm_memunmap ( struct device * dev , void * addr )
{
2015-09-15 09:37:48 +03:00
WARN_ON ( devres_release ( dev , devm_memremap_release ,
devm_memremap_match , addr ) ) ;
2015-08-11 06:07:07 +03:00
}
EXPORT_SYMBOL ( devm_memunmap ) ;
2015-08-17 17:00:35 +03:00
# ifdef CONFIG_ZONE_DEVICE
2016-01-16 03:56:19 +03:00
/* Held around every insertion into and deletion from pgmap_radix. */
static DEFINE_MUTEX(pgmap_lock);
/* Indexed by physical address >> PA_SECTION_SHIFT; entries are struct page_map. */
static RADIX_TREE(pgmap_radix, GFP_KERNEL);
/*
 * Section geometry derived from PA_SECTION_SHIFT.  Both expansions are
 * fully parenthesized so they stay a single operand wherever they are
 * used.
 */
#define SECTION_MASK (~((1UL << PA_SECTION_SHIFT) - 1))
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
2015-08-17 17:00:35 +03:00
struct page_map {
struct resource res ;
2016-01-16 03:56:19 +03:00
struct percpu_ref * ref ;
struct dev_pagemap pgmap ;
2016-01-16 03:56:22 +03:00
struct vmem_altmap altmap ;
2015-08-17 17:00:35 +03:00
} ;
2016-01-16 03:56:55 +03:00
void get_zone_device_page ( struct page * page )
{
percpu_ref_get ( page - > pgmap - > ref ) ;
}
EXPORT_SYMBOL ( get_zone_device_page ) ;
void put_zone_device_page ( struct page * page )
{
put_dev_pagemap ( page - > pgmap ) ;
}
EXPORT_SYMBOL ( put_zone_device_page ) ;
2016-01-16 03:56:19 +03:00
static void pgmap_radix_release ( struct resource * res )
{
2016-01-30 08:48:34 +03:00
resource_size_t key , align_start , align_size , align_end ;
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
align_size = ALIGN ( resource_size ( res ) , SECTION_SIZE ) ;
align_end = align_start + align_size - 1 ;
2016-01-16 03:56:19 +03:00
mutex_lock ( & pgmap_lock ) ;
for ( key = res - > start ; key < = res - > end ; key + = SECTION_SIZE )
radix_tree_delete ( & pgmap_radix , key > > PA_SECTION_SHIFT ) ;
mutex_unlock ( & pgmap_lock ) ;
}
2016-01-16 03:56:49 +03:00
static unsigned long pfn_first ( struct page_map * page_map )
{
struct dev_pagemap * pgmap = & page_map - > pgmap ;
const struct resource * res = & page_map - > res ;
struct vmem_altmap * altmap = pgmap - > altmap ;
unsigned long pfn ;
pfn = res - > start > > PAGE_SHIFT ;
if ( altmap )
pfn + = vmem_altmap_offset ( altmap ) ;
return pfn ;
}
static unsigned long pfn_end ( struct page_map * page_map )
{
const struct resource * res = & page_map - > res ;
return ( res - > start + resource_size ( res ) ) > > PAGE_SHIFT ;
}
/* Iterate @pfn over [pfn_first(@map), pfn_end(@map)). */
#define for_each_device_pfn(pfn, map) \
	for ((pfn) = pfn_first(map); (pfn) < pfn_end(map); (pfn)++)
2016-01-16 03:56:19 +03:00
static void devm_memremap_pages_release ( struct device * dev , void * data )
2015-08-17 17:00:35 +03:00
{
2016-01-16 03:56:19 +03:00
struct page_map * page_map = data ;
struct resource * res = & page_map - > res ;
resource_size_t align_start , align_size ;
2016-01-16 03:56:22 +03:00
struct dev_pagemap * pgmap = & page_map - > pgmap ;
2016-01-16 03:56:19 +03:00
2016-01-16 03:56:49 +03:00
if ( percpu_ref_tryget_live ( pgmap - > ref ) ) {
dev_WARN ( dev , " %s: page mapping is still live! \n " , __func__ ) ;
percpu_ref_put ( pgmap - > ref ) ;
}
2015-08-17 17:00:35 +03:00
/* pages are dead and unused, undo the arch mapping */
2016-01-16 03:56:19 +03:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
align_size = ALIGN ( resource_size ( res ) , SECTION_SIZE ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_begin ( ) ;
2016-01-16 03:56:19 +03:00
arch_remove_memory ( align_start , align_size ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_done ( ) ;
2016-09-07 18:51:21 +03:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
2016-01-30 08:48:34 +03:00
pgmap_radix_release ( res ) ;
2016-01-16 03:56:22 +03:00
dev_WARN_ONCE ( dev , pgmap - > altmap & & pgmap - > altmap - > alloc ,
" %s: failed to free all reserved pages \n " , __func__ ) ;
2016-01-16 03:56:19 +03:00
}
/*
 * find_dev_pagemap() - look up the dev_pagemap covering @phys
 * @phys: physical address to look up
 *
 * Assumes rcu_read_lock() is held at entry; warns once otherwise.
 * Returns the dev_pagemap embedded in the page_map registered for the
 * section containing @phys, or NULL when that section has no entry.
 */
struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
{
	struct page_map *page_map;

	WARN_ON_ONCE(!rcu_read_lock_held());

	page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT);
	if (!page_map)
		return NULL;

	return &page_map->pgmap;
}
2016-01-16 03:56:22 +03:00
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @ dev : hosting device for @ res
* @ res : " host memory " address range
2016-01-16 03:56:49 +03:00
* @ ref : a live per - cpu reference count
2016-01-16 03:56:22 +03:00
* @ altmap : optional descriptor for allocating the memmap from @ res
*
2016-01-16 03:56:49 +03:00
* Notes :
* 1 / @ ref must be ' live ' on entry and ' dead ' before devm_memunmap_pages ( ) time
* ( or devm release event ) .
*
* 2 / @ res is expected to be a host memory range that could feasibly be
* treated as a " System RAM " range , i . e . not a device mmio range , but
* this is not enforced .
2016-01-16 03:56:22 +03:00
*/
void * devm_memremap_pages ( struct device * dev , struct resource * res ,
2016-01-16 03:56:49 +03:00
struct percpu_ref * ref , struct vmem_altmap * altmap )
2015-08-17 17:00:35 +03:00
{
2016-01-30 08:48:34 +03:00
resource_size_t key , align_start , align_size , align_end ;
2016-09-07 18:51:21 +03:00
pgprot_t pgprot = PAGE_KERNEL ;
2016-01-16 03:56:22 +03:00
struct dev_pagemap * pgmap ;
2015-08-17 17:00:35 +03:00
struct page_map * page_map ;
2016-03-10 01:08:13 +03:00
int error , nid , is_ram ;
2016-01-16 03:56:49 +03:00
unsigned long pfn ;
2016-03-10 01:08:13 +03:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
align_size = ALIGN ( res - > start + resource_size ( res ) , SECTION_SIZE )
- align_start ;
2016-03-15 01:15:51 +03:00
is_ram = region_intersects ( align_start , align_size ,
IORESOURCE_SYSTEM_RAM , IORES_DESC_NONE ) ;
2015-08-17 17:00:35 +03:00
if ( is_ram = = REGION_MIXED ) {
WARN_ONCE ( 1 , " %s attempted on mixed region %pr \n " ,
__func__ , res ) ;
return ERR_PTR ( - ENXIO ) ;
}
if ( is_ram = = REGION_INTERSECTS )
return __va ( res - > start ) ;
2016-01-16 03:56:49 +03:00
if ( ! ref )
return ERR_PTR ( - EINVAL ) ;
2015-10-06 03:35:56 +03:00
page_map = devres_alloc_node ( devm_memremap_pages_release ,
sizeof ( * page_map ) , GFP_KERNEL , dev_to_node ( dev ) ) ;
2015-08-17 17:00:35 +03:00
if ( ! page_map )
return ERR_PTR ( - ENOMEM ) ;
2016-01-16 03:56:22 +03:00
pgmap = & page_map - > pgmap ;
2015-08-17 17:00:35 +03:00
memcpy ( & page_map - > res , res , sizeof ( * res ) ) ;
2016-01-16 03:56:22 +03:00
pgmap - > dev = dev ;
if ( altmap ) {
memcpy ( & page_map - > altmap , altmap , sizeof ( * altmap ) ) ;
pgmap - > altmap = & page_map - > altmap ;
}
2016-01-16 03:56:49 +03:00
pgmap - > ref = ref ;
2016-01-16 03:56:22 +03:00
pgmap - > res = & page_map - > res ;
2016-01-16 03:56:19 +03:00
mutex_lock ( & pgmap_lock ) ;
error = 0 ;
2016-01-30 08:48:34 +03:00
align_end = align_start + align_size - 1 ;
for ( key = align_start ; key < = align_end ; key + = SECTION_SIZE ) {
2016-01-16 03:56:19 +03:00
struct dev_pagemap * dup ;
rcu_read_lock ( ) ;
dup = find_dev_pagemap ( key ) ;
rcu_read_unlock ( ) ;
if ( dup ) {
dev_err ( dev , " %s: %pr collides with mapping for %s \n " ,
__func__ , res , dev_name ( dup - > dev ) ) ;
error = - EBUSY ;
break ;
}
error = radix_tree_insert ( & pgmap_radix , key > > PA_SECTION_SHIFT ,
page_map ) ;
if ( error ) {
dev_err ( dev , " %s: failed: %d \n " , __func__ , error ) ;
break ;
}
}
mutex_unlock ( & pgmap_lock ) ;
if ( error )
goto err_radix ;
2015-08-17 17:00:35 +03:00
nid = dev_to_node ( dev ) ;
if ( nid < 0 )
2015-10-06 03:35:55 +03:00
nid = numa_mem_id ( ) ;
2015-08-17 17:00:35 +03:00
2016-09-07 18:51:21 +03:00
error = track_pfn_remap ( NULL , & pgprot , PHYS_PFN ( align_start ) , 0 ,
align_size ) ;
if ( error )
goto err_pfn_remap ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_begin ( ) ;
2016-01-16 03:56:19 +03:00
error = arch_add_memory ( nid , align_start , align_size , true ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_done ( ) ;
2016-01-16 03:56:19 +03:00
if ( error )
goto err_add_memory ;
2015-08-17 17:00:35 +03:00
2016-01-16 03:56:49 +03:00
for_each_device_pfn ( pfn , page_map ) {
struct page * page = pfn_to_page ( pfn ) ;
2016-03-10 01:08:10 +03:00
/*
* ZONE_DEVICE pages union - > lru with a - > pgmap back
* pointer . It is a bug if a ZONE_DEVICE page is ever
* freed or placed on a driver - private list . Seed the
* storage with LIST_POISON * values .
*/
list_del ( & page - > lru ) ;
2016-01-16 03:56:49 +03:00
page - > pgmap = pgmap ;
}
2015-08-17 17:00:35 +03:00
devres_add ( dev , page_map ) ;
return __va ( res - > start ) ;
2016-01-16 03:56:19 +03:00
err_add_memory :
2016-09-07 18:51:21 +03:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
err_pfn_remap :
2016-01-16 03:56:19 +03:00
err_radix :
pgmap_radix_release ( res ) ;
devres_free ( page_map ) ;
return ERR_PTR ( error ) ;
2015-08-17 17:00:35 +03:00
}
EXPORT_SYMBOL ( devm_memremap_pages ) ;
2016-01-16 03:56:22 +03:00
unsigned long vmem_altmap_offset ( struct vmem_altmap * altmap )
{
/* number of pfns from base where pfn_to_page() is valid */
return altmap - > reserve + altmap - > free ;
}
void vmem_altmap_free ( struct vmem_altmap * altmap , unsigned long nr_pfns )
{
altmap - > alloc - = nr_pfns ;
}
struct vmem_altmap * to_vmem_altmap ( unsigned long memmap_start )
{
/*
* ' memmap_start ' is the virtual address for the first " struct
* page " in this range of the vmemmap array. In the case of
2016-03-16 00:55:33 +03:00
* CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple
2016-01-16 03:56:22 +03:00
* pointer arithmetic , so we can perform this to_vmem_altmap ( )
* conversion without concern for the initialization state of
* the struct page fields .
*/
struct page * page = ( struct page * ) memmap_start ;
struct dev_pagemap * pgmap ;
/*
2016-03-16 00:55:33 +03:00
* Unconditionally retrieve a dev_pagemap associated with the
2016-01-16 03:56:22 +03:00
* given physical address , this is only for use in the
* arch_ { add | remove } _memory ( ) for setting up and tearing down
* the memmap .
*/
rcu_read_lock ( ) ;
pgmap = find_dev_pagemap ( __pfn_to_phys ( page_to_pfn ( page ) ) ) ;
rcu_read_unlock ( ) ;
return pgmap ? pgmap - > altmap : NULL ;
}
2015-08-17 17:00:35 +03:00
# endif /* CONFIG_ZONE_DEVICE */