2015-08-11 06:07:06 +03:00
/*
* Copyright ( c ) 2015 Intel Corporation . All rights reserved .
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful , but
* WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*/
2016-01-16 03:56:19 +03:00
# include <linux/radix-tree.h>
2015-08-11 06:07:07 +03:00
# include <linux/device.h>
2015-08-11 06:07:06 +03:00
# include <linux/types.h>
2016-01-16 03:56:14 +03:00
# include <linux/pfn_t.h>
2015-08-11 06:07:06 +03:00
# include <linux/io.h>
# include <linux/mm.h>
2015-08-17 17:00:35 +03:00
# include <linux/memory_hotplug.h>
2017-09-09 02:11:43 +03:00
# include <linux/swap.h>
# include <linux/swapops.h>
2015-08-11 06:07:06 +03:00
# ifndef ioremap_cache
/*
 * Weak default for ioremap_cache(), kept only until the remaining
 * ioremap_cache() users have been converted to memremap(). It simply
 * forwards the request to ioremap().
 */
__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
{
	void __iomem *mapping = ioremap(offset, size);

	return mapping;
}
# endif
memremap: add arch specific hook for MEMREMAP_WB mappings
Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.
So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.
Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
2016-02-22 17:02:07 +03:00
# ifndef arch_memremap_wb
/*
 * Default MEMREMAP_WB backend, used when the architecture does not supply
 * its own arch_memremap_wb(): map the range with ioremap_cache() and drop
 * the __iomem annotation from the returned pointer.
 */
static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
{
	void __iomem *mapping = ioremap_cache(offset, size);

	return (__force void *)mapping;
}
# endif
2017-07-18 00:10:16 +03:00
# ifndef arch_memremap_can_ram_remap
/*
 * Default policy hook, used when the architecture does not define
 * arch_memremap_can_ram_remap(): the arguments are ignored and every
 * request is allowed.
 */
static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
					unsigned long flags)
{
	const bool allowed = true;

	return allowed;
}
# endif
/*
 * Try to satisfy a remap request straight from the kernel linear mapping.
 *
 * Returns __va(@offset) when the first pfn of the range is valid, is not
 * high memory and the architecture hook permits it. Returns NULL otherwise,
 * which tells the caller to fall back to arch_memremap_wb().
 */
static void *try_ram_remap(resource_size_t offset, size_t size,
			   unsigned long flags)
{
	unsigned long first_pfn = PHYS_PFN(offset);

	if (!pfn_valid(first_pfn))
		return NULL;

	if (PageHighMem(pfn_to_page(first_pfn)))
		return NULL;

	if (!arch_memremap_can_ram_remap(offset, size, flags))
		return NULL;

	/* The simple case: hand back the existing linear address */
	return __va(offset);
}
2015-08-11 06:07:06 +03:00
/**
 * memremap() - remap an iomem_resource as cacheable memory
 * @offset: iomem resource start address
 * @size: size of remap
 * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC,
 *	   MEMREMAP_ENC, MEMREMAP_DEC
 *
 * memremap() plays the role of "ioremap" for resources that are known to
 * have no i/o side effects and to which the __iomem annotation does not
 * apply. When several flags are passed, the mapping types are attempted
 * in the order listed below until one of them succeeds.
 *
 * MEMREMAP_WB - matches the default mapping for System RAM on the
 * architecture, usually a read-allocate write-back cache. Moreover, when
 * MEMREMAP_WB is requested and the region is RAM, memremap() skips
 * establishing a new mapping and returns a pointer into the direct map.
 *
 * MEMREMAP_WT - establish a mapping whereby writes either bypass the
 * cache or are written through to memory and never exist in a
 * cache-dirty state with respect to program visibility. Attempts to
 * map System RAM with this mapping type will fail.
 *
 * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
 * be coalesced together (e.g. in the CPU's write buffers), but is
 * otherwise uncached. Attempts to map System RAM with this mapping type
 * will fail.
 *
 * Return: the mapped address, or NULL when @flags is zero, when the range
 * only partially overlaps System RAM, or when none of the requested
 * mapping types could be established.
 */
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
	const int ram_state = region_intersects(offset, size,
						IORESOURCE_SYSTEM_RAM,
						IORES_DESC_NONE);
	void *mapping = NULL;

	if (flags == 0)
		return NULL;

	if (ram_state == REGION_MIXED) {
		WARN_ONCE(1, " memremap attempted on mixed range %pa size: %#lx \n ",
			  &offset, (unsigned long)size);
		return NULL;
	}

	/* Try all mapping types requested until one returns non-NULL */
	if (flags & MEMREMAP_WB) {
		/*
		 * MEMREMAP_WB is special in that it can be satisfied from
		 * the direct map. Some archs depend on the capability of
		 * memremap() to autodetect cases where the requested range
		 * is potentially in System RAM.
		 */
		if (ram_state == REGION_INTERSECTS)
			mapping = try_ram_remap(offset, size, flags);
		if (mapping == NULL)
			mapping = arch_memremap_wb(offset, size);
	}

	if (mapping != NULL)
		return mapping;

	/*
	 * No mapping yet. If other request flags are present we would be
	 * establishing a new virtual address mapping; enforce that such a
	 * mapping does not alias System RAM.
	 */
	if (ram_state == REGION_INTERSECTS && flags != MEMREMAP_WB) {
		WARN_ONCE(1, " memremap attempted on ram %pa size: %#lx \n ",
			  &offset, (unsigned long)size);
		return NULL;
	}

	if (flags & MEMREMAP_WT)
		mapping = ioremap_wt(offset, size);

	if (mapping == NULL && (flags & MEMREMAP_WC))
		mapping = ioremap_wc(offset, size);

	return mapping;
}
EXPORT_SYMBOL(memremap);
/*
 * memunmap() - release a mapping obtained from memremap()
 *
 * Only addresses in the vmalloc range are handed to iounmap(); any other
 * address (for instance a direct-map pointer) is left untouched.
 */
void memunmap(void *addr)
{
	if (!is_vmalloc_addr(addr))
		return;

	iounmap((void __iomem *)addr);
}
EXPORT_SYMBOL(memunmap);
2015-08-11 06:07:07 +03:00
/*
 * devres release callback for devm_memremap(): @res holds the address
 * that memremap() returned, which is passed on to memunmap().
 */
static void devm_memremap_release(struct device *dev, void *res)
{
	void **slot = res;

	memunmap(*slot);
}
/*
 * devres match callback: non-zero when the address stored in @res is the
 * one being looked for (@match_data).
 */
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
{
	void **slot = res;

	return *slot == match_data;
}
/*
 * devm_memremap() - device-managed memremap()
 *
 * Maps the range with memremap() and records the result as a devres entry
 * of @dev, released through devm_memremap_release().
 *
 * Returns the mapped address on success, ERR_PTR(-ENOMEM) when the devres
 * entry cannot be allocated, or ERR_PTR(-ENXIO) when memremap() fails.
 */
void *devm_memremap(struct device *dev, resource_size_t offset,
		    size_t size, unsigned long flags)
{
	void **slot;
	void *mapping;

	slot = devres_alloc_node(devm_memremap_release, sizeof(*slot),
				 GFP_KERNEL, dev_to_node(dev));
	if (slot == NULL)
		return ERR_PTR(-ENOMEM);

	mapping = memremap(offset, size, flags);
	if (mapping == NULL) {
		/* Nothing was mapped: drop the unused devres entry */
		devres_free(slot);
		return ERR_PTR(-ENXIO);
	}

	*slot = mapping;
	devres_add(dev, slot);

	return mapping;
}
EXPORT_SYMBOL(devm_memremap);
void devm_memunmap ( struct device * dev , void * addr )
{
2015-09-15 09:37:48 +03:00
WARN_ON ( devres_release ( dev , devm_memremap_release ,
devm_memremap_match , addr ) ) ;
2015-08-11 06:07:07 +03:00
}
EXPORT_SYMBOL ( devm_memunmap ) ;
2015-08-17 17:00:35 +03:00
# ifdef CONFIG_ZONE_DEVICE
2016-01-16 03:56:19 +03:00
/* Held around insertions into and deletions from pgmap_radix */
static DEFINE_MUTEX(pgmap_lock);
/* Maps a physical pfn to the struct dev_pagemap registered for it */
static RADIX_TREE(pgmap_radix, GFP_KERNEL);

/* Byte size of one section, and the mask that rounds an address down to it */
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#define SECTION_MASK (~(SECTION_SIZE - 1))
2017-09-07 02:24:13 +03:00
static unsigned long order_at ( struct resource * res , unsigned long pgoff )
2016-01-16 03:56:19 +03:00
{
2017-09-07 02:24:13 +03:00
unsigned long phys_pgoff = PHYS_PFN ( res - > start ) + pgoff ;
unsigned long nr_pages , mask ;
2016-01-30 08:48:34 +03:00
2017-09-07 02:24:13 +03:00
nr_pages = PHYS_PFN ( resource_size ( res ) ) ;
if ( nr_pages = = pgoff )
return ULONG_MAX ;
/*
* What is the largest aligned power - of - 2 range available from
* this resource pgoff to the end of the resource range ,
* considering the alignment of the current pgoff ?
*/
mask = phys_pgoff | rounddown_pow_of_two ( nr_pages - pgoff ) ;
if ( ! mask )
return ULONG_MAX ;
return find_first_bit ( & mask , BITS_PER_LONG ) ;
}
/*
 * Walk @res from page offset 0: each step advances @pgoff by
 * 1UL << @order pages, where @order comes from order_at(). The walk ends
 * when order_at() returns ULONG_MAX.
 */
#define foreach_order_pgoff(res, order, pgoff) \
	for ((pgoff) = 0, (order) = order_at((res), (pgoff)); \
	     (order) < ULONG_MAX; \
	     (pgoff) += 1UL << (order), (order) = order_at((res), (pgoff)))
2017-09-09 02:11:43 +03:00
# if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/*
 * Handle a CPU fault on a device-private swap entry by delegating to the
 * page_fault() callback of the pgmap that owns the page.
 */
int device_private_entry_fault(struct vm_area_struct *vma,
			       unsigned long addr,
			       swp_entry_t entry,
			       unsigned int flags,
			       pmd_t *pmdp)
{
	struct page *page = device_private_entry_to_page(entry);
	struct dev_pagemap *owner = page->pgmap;

	/*
	 * The page_fault() callback has to migrate the page back to system
	 * memory so that the CPU can access it. That can fail for a number
	 * of reasons (device issue, device was unsafely unplugged, ...);
	 * in such error conditions the callback must return
	 * VM_FAULT_SIGBUS.
	 *
	 * Because memory cgroup charges are accounted to the device
	 * memory, this should never fail due to memory restrictions
	 * (although allocating a regular system page might still fail
	 * when we are out of memory).
	 *
	 * A more in-depth description of what the callback can and cannot
	 * do lives in include/linux/memremap.h
	 */
	return owner->page_fault(vma, addr, page, flags, pmdp);
}
EXPORT_SYMBOL(device_private_entry_fault);
# endif /* CONFIG_DEVICE_PRIVATE */
2018-01-20 03:26:33 +03:00
static void pgmap_radix_release ( struct resource * res , unsigned long end_pgoff )
2017-09-07 02:24:13 +03:00
{
unsigned long pgoff , order ;
2016-01-16 03:56:19 +03:00
mutex_lock ( & pgmap_lock ) ;
2018-01-20 03:26:33 +03:00
foreach_order_pgoff ( res , order , pgoff ) {
if ( pgoff > = end_pgoff )
break ;
2017-09-07 02:24:13 +03:00
radix_tree_delete ( & pgmap_radix , PHYS_PFN ( res - > start ) + pgoff ) ;
2018-01-20 03:26:33 +03:00
}
2016-01-16 03:56:19 +03:00
mutex_unlock ( & pgmap_lock ) ;
2017-09-07 02:24:13 +03:00
synchronize_rcu ( ) ;
2016-01-16 03:56:19 +03:00
}
2017-12-29 10:54:04 +03:00
static unsigned long pfn_first ( struct dev_pagemap * pgmap )
2016-01-16 03:56:49 +03:00
{
2017-12-29 10:54:04 +03:00
const struct resource * res = & pgmap - > res ;
struct vmem_altmap * altmap = & pgmap - > altmap ;
2016-01-16 03:56:49 +03:00
unsigned long pfn ;
pfn = res - > start > > PAGE_SHIFT ;
2017-12-29 10:54:04 +03:00
if ( pgmap - > altmap_valid )
2016-01-16 03:56:49 +03:00
pfn + = vmem_altmap_offset ( altmap ) ;
return pfn ;
}
2017-12-29 10:54:04 +03:00
static unsigned long pfn_end ( struct dev_pagemap * pgmap )
2016-01-16 03:56:49 +03:00
{
2017-12-29 10:54:04 +03:00
const struct resource * res = & pgmap - > res ;
2016-01-16 03:56:49 +03:00
return ( res - > start + resource_size ( res ) ) > > PAGE_SHIFT ;
}
2018-02-07 06:34:11 +03:00
static unsigned long pfn_next ( unsigned long pfn )
{
if ( pfn % 1024 = = 0 )
cond_resched ( ) ;
return pfn + 1 ;
}
2016-01-16 03:56:49 +03:00
/* Iterate @pfn over @map's pfns, from pfn_first() up to (excluding) pfn_end() */
#define for_each_device_pfn(pfn, map) \
	for ((pfn) = pfn_first(map); (pfn) < pfn_end(map); \
	     (pfn) = pfn_next(pfn))
2016-01-16 03:56:49 +03:00
2017-12-29 10:54:05 +03:00
static void devm_memremap_pages_release ( void * data )
2015-08-17 17:00:35 +03:00
{
2017-12-29 10:54:04 +03:00
struct dev_pagemap * pgmap = data ;
2017-12-29 10:54:05 +03:00
struct device * dev = pgmap - > dev ;
2017-12-29 10:54:04 +03:00
struct resource * res = & pgmap - > res ;
2016-01-16 03:56:19 +03:00
resource_size_t align_start , align_size ;
2017-04-28 20:23:37 +03:00
unsigned long pfn ;
2017-12-29 10:54:04 +03:00
for_each_device_pfn ( pfn , pgmap )
2017-04-28 20:23:37 +03:00
put_page ( pfn_to_page ( pfn ) ) ;
2016-01-16 03:56:19 +03:00
2016-01-16 03:56:49 +03:00
if ( percpu_ref_tryget_live ( pgmap - > ref ) ) {
dev_WARN ( dev , " %s: page mapping is still live! \n " , __func__ ) ;
percpu_ref_put ( pgmap - > ref ) ;
}
2015-08-17 17:00:35 +03:00
/* pages are dead and unused, undo the arch mapping */
2016-01-16 03:56:19 +03:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
2018-01-20 03:27:54 +03:00
align_size = ALIGN ( res - > start + resource_size ( res ) , SECTION_SIZE )
- align_start ;
2017-02-25 01:55:45 +03:00
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_begin ( ) ;
2017-12-29 10:54:04 +03:00
arch_remove_memory ( align_start , align_size , pgmap - > altmap_valid ?
& pgmap - > altmap : NULL ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_done ( ) ;
2017-02-25 01:55:45 +03:00
2016-09-07 18:51:21 +03:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
2018-01-20 03:26:33 +03:00
pgmap_radix_release ( res , - 1 ) ;
2017-12-29 10:54:04 +03:00
dev_WARN_ONCE ( dev , pgmap - > altmap . alloc ,
" %s: failed to free all reserved pages \n " , __func__ ) ;
2016-01-16 03:56:19 +03:00
}
2016-01-16 03:56:22 +03:00
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @ dev : hosting device for @ res
2017-12-29 10:54:05 +03:00
* @ pgmap : pointer to a struct dev_pgmap
2016-01-16 03:56:22 +03:00
*
2016-01-16 03:56:49 +03:00
* Notes :
2017-12-29 10:54:05 +03:00
* 1 / At a minimum the res , ref and type members of @ pgmap must be initialized
* by the caller before passing it to this function
*
* 2 / The altmap field may optionally be initialized , in which case altmap_valid
* must be set to true
*
* 3 / pgmap . ref must be ' live ' on entry and ' dead ' before devm_memunmap_pages ( )
* time ( or devm release event ) . The expected order of events is that ref has
2017-04-28 20:23:37 +03:00
* been through percpu_ref_kill ( ) before devm_memremap_pages_release ( ) . The
* wait for the completion of all references being dropped and
* percpu_ref_exit ( ) must occur after devm_memremap_pages_release ( ) .
2016-01-16 03:56:49 +03:00
*
2017-12-29 10:54:05 +03:00
* 4 / res is expected to be a host memory range that could feasibly be
2016-01-16 03:56:49 +03:00
* treated as a " System RAM " range , i . e . not a device mmio range , but
* this is not enforced .
2016-01-16 03:56:22 +03:00
*/
2017-12-29 10:54:05 +03:00
void * devm_memremap_pages ( struct device * dev , struct dev_pagemap * pgmap )
2015-08-17 17:00:35 +03:00
{
2017-09-07 02:24:13 +03:00
resource_size_t align_start , align_size , align_end ;
2017-12-29 10:54:05 +03:00
struct vmem_altmap * altmap = pgmap - > altmap_valid ?
& pgmap - > altmap : NULL ;
2018-02-07 06:34:11 +03:00
struct resource * res = & pgmap - > res ;
2017-09-07 02:24:13 +03:00
unsigned long pfn , pgoff , order ;
2016-09-07 18:51:21 +03:00
pgprot_t pgprot = PAGE_KERNEL ;
2018-02-07 06:34:11 +03:00
int error , nid , is_ram ;
2016-03-10 01:08:13 +03:00
align_start = res - > start & ~ ( SECTION_SIZE - 1 ) ;
align_size = ALIGN ( res - > start + resource_size ( res ) , SECTION_SIZE )
- align_start ;
2016-03-15 01:15:51 +03:00
is_ram = region_intersects ( align_start , align_size ,
IORESOURCE_SYSTEM_RAM , IORES_DESC_NONE ) ;
2015-08-17 17:00:35 +03:00
if ( is_ram = = REGION_MIXED ) {
WARN_ONCE ( 1 , " %s attempted on mixed region %pr \n " ,
__func__ , res ) ;
return ERR_PTR ( - ENXIO ) ;
}
if ( is_ram = = REGION_INTERSECTS )
return __va ( res - > start ) ;
2017-12-29 10:54:05 +03:00
if ( ! pgmap - > ref )
2016-01-16 03:56:49 +03:00
return ERR_PTR ( - EINVAL ) ;
2016-01-16 03:56:22 +03:00
pgmap - > dev = dev ;
2016-01-16 03:56:19 +03:00
mutex_lock ( & pgmap_lock ) ;
error = 0 ;
2016-01-30 08:48:34 +03:00
align_end = align_start + align_size - 1 ;
2017-09-07 02:24:13 +03:00
foreach_order_pgoff ( res , order , pgoff ) {
error = __radix_tree_insert ( & pgmap_radix ,
2017-12-29 10:54:04 +03:00
PHYS_PFN ( res - > start ) + pgoff , order , pgmap ) ;
2016-01-16 03:56:19 +03:00
if ( error ) {
dev_err ( dev , " %s: failed: %d \n " , __func__ , error ) ;
break ;
}
}
mutex_unlock ( & pgmap_lock ) ;
if ( error )
goto err_radix ;
2015-08-17 17:00:35 +03:00
nid = dev_to_node ( dev ) ;
if ( nid < 0 )
2015-10-06 03:35:55 +03:00
nid = numa_mem_id ( ) ;
2015-08-17 17:00:35 +03:00
2016-09-07 18:51:21 +03:00
error = track_pfn_remap ( NULL , & pgprot , PHYS_PFN ( align_start ) , 0 ,
align_size ) ;
if ( error )
goto err_pfn_remap ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_begin ( ) ;
2017-12-29 10:53:53 +03:00
error = arch_add_memory ( nid , align_start , align_size , altmap , false ) ;
2017-07-07 01:38:11 +03:00
if ( ! error )
move_pfn_range_to_zone ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
align_start > > PAGE_SHIFT ,
2017-12-29 10:53:57 +03:00
align_size > > PAGE_SHIFT , altmap ) ;
mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
Both arch_add_memory() and arch_remove_memory() expect a single threaded
context.
For example, arch/x86/mm/init_64.c::kernel_physical_mapping_init() does
not hold any locks over this check and branch:
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
continue;
}
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
page_size_mask);
The result is that two threads calling devm_memremap_pages()
simultaneously can end up colliding on pgd initialization. This leads
to crash signatures like the following where the loser of the race
initializes the wrong pgd entry:
BUG: unable to handle kernel paging request at ffff888ebfff0000
IP: memcpy_erms+0x6/0x10
PGD 2f8e8fc067 PUD 0 /* <---- Invalid PUD */
Oops: 0000 [#1] SMP DEBUG_PAGEALLOC
CPU: 54 PID: 3818 Comm: systemd-udevd Not tainted 4.6.7+ #13
task: ffff882fac290040 ti: ffff882f887a4000 task.ti: ffff882f887a4000
RIP: memcpy_erms+0x6/0x10
[..]
Call Trace:
? pmem_do_bvec+0x205/0x370 [nd_pmem]
? blk_queue_enter+0x3a/0x280
pmem_rw_page+0x38/0x80 [nd_pmem]
bdev_read_page+0x84/0xb0
Hold the standard memory hotplug mutex over calls to
arch_{add,remove}_memory().
Fixes: 41e94a851304 ("add devm_memremap_pages")
Link: http://lkml.kernel.org/r/148357647831.9498.12606007370121652979.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-01-11 03:57:36 +03:00
mem_hotplug_done ( ) ;
2016-01-16 03:56:19 +03:00
if ( error )
goto err_add_memory ;
2015-08-17 17:00:35 +03:00
2017-12-29 10:54:04 +03:00
for_each_device_pfn ( pfn , pgmap ) {
2016-01-16 03:56:49 +03:00
struct page * page = pfn_to_page ( pfn ) ;
2016-03-10 01:08:10 +03:00
/*
* ZONE_DEVICE pages union - > lru with a - > pgmap back
* pointer . It is a bug if a ZONE_DEVICE page is ever
* freed or placed on a driver - private list . Seed the
* storage with LIST_POISON * values .
*/
list_del ( & page - > lru ) ;
2016-01-16 03:56:49 +03:00
page - > pgmap = pgmap ;
2017-12-29 10:54:05 +03:00
percpu_ref_get ( pgmap - > ref ) ;
2016-01-16 03:56:49 +03:00
}
2017-12-29 10:54:05 +03:00
devm_add_action ( dev , devm_memremap_pages_release , pgmap ) ;
2015-08-17 17:00:35 +03:00
return __va ( res - > start ) ;
2016-01-16 03:56:19 +03:00
err_add_memory :
2016-09-07 18:51:21 +03:00
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
err_pfn_remap :
2016-01-16 03:56:19 +03:00
err_radix :
2018-01-20 03:26:33 +03:00
pgmap_radix_release ( res , pgoff ) ;
2016-01-16 03:56:19 +03:00
return ERR_PTR ( error ) ;
2015-08-17 17:00:35 +03:00
}
EXPORT_SYMBOL ( devm_memremap_pages ) ;
2016-01-16 03:56:22 +03:00
unsigned long vmem_altmap_offset ( struct vmem_altmap * altmap )
{
/* number of pfns from base where pfn_to_page() is valid */
return altmap - > reserve + altmap - > free ;
}
void vmem_altmap_free ( struct vmem_altmap * altmap , unsigned long nr_pfns )
{
altmap - > alloc - = nr_pfns ;
}
2017-12-29 10:54:00 +03:00
/**
* get_dev_pagemap ( ) - take a new live reference on the dev_pagemap for @ pfn
* @ pfn : page frame number to lookup page_map
* @ pgmap : optional known pgmap that already has a reference
*
2017-12-29 10:54:01 +03:00
* If @ pgmap is non - NULL and covers @ pfn it will be returned as - is . If @ pgmap
* is non - NULL but does not cover @ pfn the reference to it will be released .
2017-12-29 10:54:00 +03:00
*/
struct dev_pagemap * get_dev_pagemap ( unsigned long pfn ,
struct dev_pagemap * pgmap )
{
resource_size_t phys = PFN_PHYS ( pfn ) ;
/*
2017-12-29 10:54:01 +03:00
* In the cached case we ' re already holding a live reference .
2017-12-29 10:54:00 +03:00
*/
2017-12-29 10:54:01 +03:00
if ( pgmap ) {
2017-12-29 10:54:04 +03:00
if ( phys > = pgmap - > res . start & & phys < = pgmap - > res . end )
2017-12-29 10:54:01 +03:00
return pgmap ;
put_dev_pagemap ( pgmap ) ;
2017-12-29 10:54:00 +03:00
}
/* fall back to slow path lookup */
rcu_read_lock ( ) ;
2017-12-29 10:54:06 +03:00
pgmap = radix_tree_lookup ( & pgmap_radix , PHYS_PFN ( phys ) ) ;
2017-12-29 10:54:00 +03:00
if ( pgmap & & ! percpu_ref_tryget_live ( pgmap - > ref ) )
pgmap = NULL ;
rcu_read_unlock ( ) ;
return pgmap ;
}
# endif /* CONFIG_ZONE_DEVICE */
2017-09-09 02:11:46 +03:00
2017-09-09 02:12:24 +03:00
# if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
void put_zone_device_private_or_public_page ( struct page * page )
2017-09-09 02:11:46 +03:00
{
int count = page_ref_dec_return ( page ) ;
/*
* If refcount is 1 then page is freed and refcount is stable as nobody
* holds a reference on the page .
*/
if ( count = = 1 ) {
/* Clear Active bit in case of parallel mark_page_accessed */
__ClearPageActive ( page ) ;
__ClearPageWaiters ( page ) ;
page - > mapping = NULL ;
2017-09-09 02:11:54 +03:00
mem_cgroup_uncharge ( page ) ;
2017-09-09 02:11:46 +03:00
page - > pgmap - > page_free ( page , page - > pgmap - > data ) ;
} else if ( ! count )
__put_page ( page ) ;
}
2017-09-09 02:12:24 +03:00
EXPORT_SYMBOL ( put_zone_device_private_or_public_page ) ;
# endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */