2017-09-09 02:11:23 +03:00
/*
* Copyright 2013 Red Hat Inc .
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
2018-10-31 01:04:06 +03:00
* Authors : Jérôme Glisse < jglisse @ redhat . com >
2017-09-09 02:11:23 +03:00
*/
/*
* Refer to include / linux / hmm . h for information about heterogeneous memory
* management or HMM for short .
*/
# include <linux/mm.h>
# include <linux/hmm.h>
2017-09-09 02:12:02 +03:00
# include <linux/init.h>
2017-09-09 02:11:31 +03:00
# include <linux/rmap.h>
# include <linux/swap.h>
2017-09-09 02:11:23 +03:00
# include <linux/slab.h>
# include <linux/sched.h>
2017-09-09 02:11:58 +03:00
# include <linux/mmzone.h>
# include <linux/pagemap.h>
2017-09-09 02:11:31 +03:00
# include <linux/swapops.h>
# include <linux/hugetlb.h>
2017-09-09 02:11:58 +03:00
# include <linux/memremap.h>
2017-09-09 02:11:46 +03:00
# include <linux/jump_label.h>
2017-09-09 02:11:27 +03:00
# include <linux/mmu_notifier.h>
2017-09-09 02:11:58 +03:00
# include <linux/memory_hotplug.h>
# define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
2017-09-09 02:11:23 +03:00
2017-09-09 02:12:32 +03:00
# if IS_ENABLED(CONFIG_HMM_MIRROR)
2017-09-09 02:11:27 +03:00
static const struct mmu_notifier_ops hmm_mmu_notifier_ops ;
2017-09-09 02:11:23 +03:00
/*
* struct hmm - HMM per mm struct
*
* @ mm : mm struct this HMM struct is bound to
2017-09-09 02:11:31 +03:00
* @ lock : lock protecting ranges list
2017-09-09 02:11:27 +03:00
* @ sequence : we track updates to the CPU page table with a sequence number
2017-09-09 02:11:31 +03:00
* @ ranges : list of range being snapshotted
2017-09-09 02:11:27 +03:00
* @ mirrors : list of mirrors for this mm
* @ mmu_notifier : mmu notifier to track updates to CPU page table
* @ mirrors_sem : read / write semaphore protecting the mirrors list
2017-09-09 02:11:23 +03:00
*/
struct hmm {
struct mm_struct * mm ;
2017-09-09 02:11:31 +03:00
spinlock_t lock ;
2017-09-09 02:11:27 +03:00
atomic_t sequence ;
2017-09-09 02:11:31 +03:00
struct list_head ranges ;
2017-09-09 02:11:27 +03:00
struct list_head mirrors ;
struct mmu_notifier mmu_notifier ;
struct rw_semaphore mirrors_sem ;
2017-09-09 02:11:23 +03:00
} ;
/*
* hmm_register - register HMM against an mm ( HMM internal )
*
* @ mm : mm struct to attach to
*
* This is not intended to be used directly by device drivers . It allocates an
* HMM struct if mm does not have one , and initializes it .
*/
static struct hmm * hmm_register ( struct mm_struct * mm )
{
2017-09-09 02:11:27 +03:00
struct hmm * hmm = READ_ONCE ( mm - > hmm ) ;
bool cleanup = false ;
2017-09-09 02:11:23 +03:00
/*
* The hmm struct can only be freed once the mm_struct goes away ,
* hence we should always have pre - allocated an new hmm struct
* above .
*/
2017-09-09 02:11:27 +03:00
if ( hmm )
return hmm ;
hmm = kmalloc ( sizeof ( * hmm ) , GFP_KERNEL ) ;
if ( ! hmm )
return NULL ;
INIT_LIST_HEAD ( & hmm - > mirrors ) ;
init_rwsem ( & hmm - > mirrors_sem ) ;
atomic_set ( & hmm - > sequence , 0 ) ;
hmm - > mmu_notifier . ops = NULL ;
2017-09-09 02:11:31 +03:00
INIT_LIST_HEAD ( & hmm - > ranges ) ;
spin_lock_init ( & hmm - > lock ) ;
2017-09-09 02:11:27 +03:00
hmm - > mm = mm ;
spin_lock ( & mm - > page_table_lock ) ;
if ( ! mm - > hmm )
mm - > hmm = hmm ;
else
cleanup = true ;
spin_unlock ( & mm - > page_table_lock ) ;
2018-10-31 01:04:14 +03:00
if ( cleanup )
goto error ;
/*
* We should only get here if hold the mmap_sem in write mode ie on
* registration of first mirror through hmm_mirror_register ( )
*/
hmm - > mmu_notifier . ops = & hmm_mmu_notifier_ops ;
if ( __mmu_notifier_register ( & hmm - > mmu_notifier , mm ) )
goto error_mm ;
2017-09-09 02:11:27 +03:00
2017-09-09 02:11:23 +03:00
return mm - > hmm ;
2018-10-31 01:04:14 +03:00
error_mm :
spin_lock ( & mm - > page_table_lock ) ;
if ( mm - > hmm = = hmm )
mm - > hmm = NULL ;
spin_unlock ( & mm - > page_table_lock ) ;
error :
kfree ( hmm ) ;
return NULL ;
2017-09-09 02:11:23 +03:00
}
void hmm_mm_destroy ( struct mm_struct * mm )
{
kfree ( mm - > hmm ) ;
}
2017-09-09 02:11:27 +03:00
2018-10-31 01:04:24 +03:00
static int hmm_invalidate_range ( struct hmm * hmm ,
const struct hmm_update * update )
2017-09-09 02:11:27 +03:00
{
struct hmm_mirror * mirror ;
2017-09-09 02:11:31 +03:00
struct hmm_range * range ;
spin_lock ( & hmm - > lock ) ;
list_for_each_entry ( range , & hmm - > ranges , list ) {
unsigned long addr , idx , npages ;
2018-10-31 01:04:24 +03:00
if ( update - > end < range - > start | | update - > start > = range - > end )
2017-09-09 02:11:31 +03:00
continue ;
range - > valid = false ;
2018-10-31 01:04:24 +03:00
addr = max ( update - > start , range - > start ) ;
2017-09-09 02:11:31 +03:00
idx = ( addr - range - > start ) > > PAGE_SHIFT ;
2018-10-31 01:04:24 +03:00
npages = ( min ( range - > end , update - > end ) - addr ) > > PAGE_SHIFT ;
2017-09-09 02:11:31 +03:00
memset ( & range - > pfns [ idx ] , 0 , sizeof ( * range - > pfns ) * npages ) ;
}
spin_unlock ( & hmm - > lock ) ;
2017-09-09 02:11:27 +03:00
down_read ( & hmm - > mirrors_sem ) ;
2018-10-31 01:04:24 +03:00
list_for_each_entry ( mirror , & hmm - > mirrors , list ) {
int ret ;
ret = mirror - > ops - > sync_cpu_device_pagetables ( mirror , update ) ;
if ( ! update - > blockable & & ret = = - EAGAIN ) {
up_read ( & hmm - > mirrors_sem ) ;
return - EAGAIN ;
}
}
2017-09-09 02:11:27 +03:00
up_read ( & hmm - > mirrors_sem ) ;
2018-10-31 01:04:24 +03:00
return 0 ;
2017-09-09 02:11:27 +03:00
}
2018-04-11 02:28:19 +03:00
static void hmm_release ( struct mmu_notifier * mn , struct mm_struct * mm )
{
struct hmm_mirror * mirror ;
struct hmm * hmm = mm - > hmm ;
down_write ( & hmm - > mirrors_sem ) ;
mirror = list_first_entry_or_null ( & hmm - > mirrors , struct hmm_mirror ,
list ) ;
while ( mirror ) {
list_del_init ( & mirror - > list ) ;
if ( mirror - > ops - > release ) {
/*
* Drop mirrors_sem so callback can wait on any pending
* work that might itself trigger mmu_notifier callback
* and thus would deadlock with us .
*/
up_write ( & hmm - > mirrors_sem ) ;
mirror - > ops - > release ( mirror ) ;
down_write ( & hmm - > mirrors_sem ) ;
}
mirror = list_first_entry_or_null ( & hmm - > mirrors ,
struct hmm_mirror , list ) ;
}
up_write ( & hmm - > mirrors_sem ) ;
}
2018-08-22 07:52:33 +03:00
/*
 * hmm_invalidate_range_start() - mmu notifier invalidate_range_start callback
 *
 * Only bumps the hmm sequence counter; @start, @end and @blockable are not
 * used here. Always returns 0.
 */
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end,
				      bool blockable)
{
	struct hmm *state = mm->hmm;

	VM_BUG_ON(!state);

	atomic_inc(&state->sequence);

	return 0;
}
static void hmm_invalidate_range_end ( struct mmu_notifier * mn ,
struct mm_struct * mm ,
unsigned long start ,
unsigned long end )
{
2018-10-31 01:04:24 +03:00
struct hmm_update update ;
2017-09-09 02:11:27 +03:00
struct hmm * hmm = mm - > hmm ;
VM_BUG_ON ( ! hmm ) ;
2018-10-31 01:04:24 +03:00
update . start = start ;
update . end = end ;
update . event = HMM_UPDATE_INVALIDATE ;
update . blockable = true ;
hmm_invalidate_range ( hmm , & update ) ;
2017-09-09 02:11:27 +03:00
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
2018-04-11 02:28:19 +03:00
. release = hmm_release ,
2017-09-09 02:11:27 +03:00
. invalidate_range_start = hmm_invalidate_range_start ,
. invalidate_range_end = hmm_invalidate_range_end ,
} ;
/*
* hmm_mirror_register ( ) - register a mirror against an mm
*
* @ mirror : new mirror struct to register
* @ mm : mm to register against
*
* To start mirroring a process address space , the device driver must register
* an HMM mirror struct .
*
* THE mm - > mmap_sem MUST BE HELD IN WRITE MODE !
*/
int hmm_mirror_register ( struct hmm_mirror * mirror , struct mm_struct * mm )
{
/* Sanity check */
if ( ! mm | | ! mirror | | ! mirror - > ops )
return - EINVAL ;
2018-04-11 02:28:23 +03:00
again :
2017-09-09 02:11:27 +03:00
mirror - > hmm = hmm_register ( mm ) ;
if ( ! mirror - > hmm )
return - ENOMEM ;
down_write ( & mirror - > hmm - > mirrors_sem ) ;
2018-04-11 02:28:23 +03:00
if ( mirror - > hmm - > mm = = NULL ) {
/*
* A racing hmm_mirror_unregister ( ) is about to destroy the hmm
* struct . Try again to allocate a new one .
*/
up_write ( & mirror - > hmm - > mirrors_sem ) ;
mirror - > hmm = NULL ;
goto again ;
} else {
list_add ( & mirror - > list , & mirror - > hmm - > mirrors ) ;
up_write ( & mirror - > hmm - > mirrors_sem ) ;
}
2017-09-09 02:11:27 +03:00
return 0 ;
}
EXPORT_SYMBOL ( hmm_mirror_register ) ;
/*
* hmm_mirror_unregister ( ) - unregister a mirror
*
* @ mirror : new mirror struct to register
*
* Stop mirroring a process address space , and cleanup .
*/
void hmm_mirror_unregister ( struct hmm_mirror * mirror )
{
2018-04-11 02:28:23 +03:00
bool should_unregister = false ;
struct mm_struct * mm ;
struct hmm * hmm ;
if ( mirror - > hmm = = NULL )
return ;
2017-09-09 02:11:27 +03:00
2018-04-11 02:28:23 +03:00
hmm = mirror - > hmm ;
2017-09-09 02:11:27 +03:00
down_write ( & hmm - > mirrors_sem ) ;
2018-04-11 02:28:19 +03:00
list_del_init ( & mirror - > list ) ;
2018-04-11 02:28:23 +03:00
should_unregister = list_empty ( & hmm - > mirrors ) ;
mirror - > hmm = NULL ;
mm = hmm - > mm ;
hmm - > mm = NULL ;
2017-09-09 02:11:27 +03:00
up_write ( & hmm - > mirrors_sem ) ;
2018-04-11 02:28:23 +03:00
if ( ! should_unregister | | mm = = NULL )
return ;
2018-10-31 01:04:14 +03:00
mmu_notifier_unregister_no_release ( & hmm - > mmu_notifier , mm ) ;
2018-04-11 02:28:23 +03:00
spin_lock ( & mm - > page_table_lock ) ;
if ( mm - > hmm = = hmm )
mm - > hmm = NULL ;
spin_unlock ( & mm - > page_table_lock ) ;
kfree ( hmm ) ;
2017-09-09 02:11:27 +03:00
}
EXPORT_SYMBOL ( hmm_mirror_unregister ) ;
2017-09-09 02:11:31 +03:00
2017-09-09 02:11:35 +03:00
/*
 * struct hmm_vma_walk - private state handed to the page walk callbacks
 * through mm_walk.private
 */
struct hmm_vma_walk {
/* range whose pfns array the walk callbacks read and fill */
struct hmm_range * range ;
/* address recorded by the walk callbacks as they progress or stop */
unsigned long last ;
/* when false, hmm_pte_need_fault() never requests a fault */
bool fault ;
/* consulted by hmm_vma_do_fault() when choosing the fault flags */
bool block ;
} ;
2018-04-11 02:29:02 +03:00
/*
 * hmm_vma_do_fault() - fault in one page on behalf of a range walk
 * @walk: mm_walk structure (private data is the hmm_vma_walk)
 * @addr: virtual address to fault
 * @write_fault: request a write fault
 * @pfn: HMM pfn entry for @addr, set to the error value on fault error
 * Returns: -EBUSY if handle_mm_fault() asked for a retry, -EFAULT on a fault
 *          error, -EAGAIN otherwise (the caller keeps going on -EAGAIN)
 */
static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
			    bool write_fault, uint64_t *pfn)
{
	/*
	 * FAULT_FLAG_ALLOW_RETRY must not be part of the initial value:
	 * it is added below only for non-blocking walks, otherwise the
	 * block setting would have no effect.
	 */
	unsigned int flags = FAULT_FLAG_REMOTE;
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	vm_fault_t ret;

	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
	ret = handle_mm_fault(vma, addr, flags);
	if (ret & VM_FAULT_RETRY)
		return -EBUSY;
	if (ret & VM_FAULT_ERROR) {
		*pfn = range->values[HMM_PFN_ERROR];
		return -EFAULT;
	}

	return -EAGAIN;
}
2017-09-09 02:11:31 +03:00
static int hmm_pfns_bad ( unsigned long addr ,
unsigned long end ,
struct mm_walk * walk )
{
2018-04-11 02:28:27 +03:00
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
2018-04-11 02:28:38 +03:00
uint64_t * pfns = range - > pfns ;
2017-09-09 02:11:31 +03:00
unsigned long i ;
i = ( addr - range - > start ) > > PAGE_SHIFT ;
for ( ; addr < end ; addr + = PAGE_SIZE , i + + )
2018-04-11 02:29:06 +03:00
pfns [ i ] = range - > values [ HMM_PFN_ERROR ] ;
2017-09-09 02:11:31 +03:00
return 0 ;
}
2018-04-11 02:28:46 +03:00
/*
* hmm_vma_walk_hole ( ) - handle a range lacking valid pmd or pte ( s )
* @ start : range virtual start address ( inclusive )
* @ end : range virtual end address ( exclusive )
2018-04-11 02:29:02 +03:00
* @ fault : should we fault or not ?
* @ write_fault : write fault ?
2018-04-11 02:28:46 +03:00
* @ walk : mm_walk structure
* Returns : 0 on success , - EAGAIN after page fault , or page fault error
*
* This function will be called whenever pmd_none ( ) or pte_none ( ) returns true ,
* or whenever there is no page directory covering the virtual address range .
*/
2018-04-11 02:29:02 +03:00
static int hmm_vma_walk_hole_ ( unsigned long addr , unsigned long end ,
bool fault , bool write_fault ,
struct mm_walk * walk )
2017-09-09 02:11:31 +03:00
{
2017-09-09 02:11:35 +03:00
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
2018-04-11 02:28:38 +03:00
uint64_t * pfns = range - > pfns ;
2017-09-09 02:11:31 +03:00
unsigned long i ;
2017-09-09 02:11:35 +03:00
hmm_vma_walk - > last = addr ;
2017-09-09 02:11:31 +03:00
i = ( addr - range - > start ) > > PAGE_SHIFT ;
2017-09-09 02:11:35 +03:00
for ( ; addr < end ; addr + = PAGE_SIZE , i + + ) {
2018-04-11 02:29:06 +03:00
pfns [ i ] = range - > values [ HMM_PFN_NONE ] ;
2018-04-11 02:29:02 +03:00
if ( fault | | write_fault ) {
2017-09-09 02:11:35 +03:00
int ret ;
2017-09-09 02:11:31 +03:00
2018-04-11 02:29:02 +03:00
ret = hmm_vma_do_fault ( walk , addr , write_fault ,
& pfns [ i ] ) ;
2017-09-09 02:11:35 +03:00
if ( ret ! = - EAGAIN )
return ret ;
}
}
2018-04-11 02:29:02 +03:00
return ( fault | | write_fault ) ? - EAGAIN : 0 ;
}
/*
 * hmm_pte_need_fault() - decide whether one page must be faulted
 * @hmm_vma_walk: walk state (no fault is ever requested when ->fault is false)
 * @pfns: HMM pfn entry carrying the caller's request flags
 * @cpu_flags: HMM flags describing the current CPU page table entry
 * @fault: set to true when the page needs a fault
 * @write_fault: set to true when the page needs a write fault
 *
 * Both outputs are always written; they start out false.
 */
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				      uint64_t pfns, uint64_t cpu_flags,
				      bool *fault, bool *write_fault)
{
	struct hmm_range *range = hmm_vma_walk->range;
	bool wants_write;

	*fault = false;
	*write_fault = false;

	if (!hmm_vma_walk->fault)
		return;

	/* The caller did not ask for this page at all. */
	if (!(pfns & range->flags[HMM_PFN_VALID]))
		return;

	wants_write = (pfns & range->flags[HMM_PFN_WRITE]) != 0;

	/* Device memory is only faulted when explicitly requested. */
	if (cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
		if (!(pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]))
			return;
		*write_fault = wants_write;
		*fault = true;
		return;
	}

	/* Write access wanted but the CPU entry is not writable. */
	if (wants_write && !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
		*write_fault = true;
		*fault = true;
		return;
	}

	/* Otherwise fault only when the CPU page table entry is not valid. */
	*fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
}
/*
 * hmm_range_need_fault() - decide whether any page of a run must be faulted
 * @hmm_vma_walk: walk state (no fault is ever requested when ->fault is false)
 * @pfns: first HMM pfn entry of the run (carries the caller's request flags)
 * @npages: number of entries in the run
 * @cpu_flags: HMM flags describing the CPU page table entry covering the run
 * @fault: set to true when a page of the run needs a fault
 * @write_fault: set to true when a page of the run needs a write fault
 *
 * The scan stops at the first page that needs any kind of fault, so the
 * outputs describe that page only.
 */
static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
				 const uint64_t *pfns, unsigned long npages,
				 uint64_t cpu_flags, bool *fault,
				 bool *write_fault)
{
	unsigned long i;

	/*
	 * Callers pass uninitialized locals: give both outputs a defined
	 * value up front so they are valid even for an empty run.
	 */
	*fault = *write_fault = false;
	if (!hmm_vma_walk->fault)
		return;

	for (i = 0; i < npages; ++i) {
		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
				   fault, write_fault);
		if ((*fault) || (*write_fault))
			return;
	}
}
static int hmm_vma_walk_hole ( unsigned long addr , unsigned long end ,
struct mm_walk * walk )
{
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
bool fault , write_fault ;
unsigned long i , npages ;
uint64_t * pfns ;
i = ( addr - range - > start ) > > PAGE_SHIFT ;
npages = ( end - addr ) > > PAGE_SHIFT ;
pfns = & range - > pfns [ i ] ;
hmm_range_need_fault ( hmm_vma_walk , pfns , npages ,
0 , & fault , & write_fault ) ;
return hmm_vma_walk_hole_ ( addr , end , fault , write_fault , walk ) ;
}
2018-04-11 02:29:06 +03:00
/*
 * pmd_to_hmm_pfn_flags() - translate a pmd into HMM pfn flags
 *
 * Returns 0 for a protnone pmd, otherwise the valid flag plus the write flag
 * when the pmd is writable.
 */
static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	uint64_t hmm_flags;

	if (pmd_protnone(pmd))
		return 0;

	hmm_flags = range->flags[HMM_PFN_VALID];
	if (pmd_write(pmd))
		hmm_flags |= range->flags[HMM_PFN_WRITE];

	return hmm_flags;
}
2018-04-11 02:28:59 +03:00
/*
 * hmm_vma_handle_pmd() - fill HMM pfn entries from a huge or devmap pmd
 * @walk: mm_walk structure
 * @addr: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @pfns: HMM pfn entry matching @addr
 * @pmd: pmd value covering the range
 * Returns: 0 once the entries are filled, otherwise whatever
 *          hmm_vma_walk_hole_() returns when the range has to be faulted
 */
static int hmm_vma_handle_pmd(struct mm_walk *walk,
			      unsigned long addr,
			      unsigned long end,
			      uint64_t *pfns,
			      pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long npages = (end - addr) >> PAGE_SHIFT;
	uint64_t cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	bool fault, write_fault;
	unsigned long pfn;
	uint64_t *slot;

	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	/* Consecutive pfns, starting at the page @addr falls on in the pmd. */
	pfn = pmd_pfn(pmd) + pte_index(addr);
	slot = pfns;
	while (addr < end) {
		*slot = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
		addr += PAGE_SIZE;
		slot++;
		pfn++;
	}
	hmm_vma_walk->last = end;

	return 0;
}
2018-04-11 02:29:06 +03:00
/*
 * pte_to_hmm_pfn_flags() - translate a pte into HMM pfn flags
 *
 * Returns 0 for an empty or non-present pte, otherwise the valid flag plus
 * the write flag when the pte is writable.
 */
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	uint64_t hmm_flags;

	if (pte_none(pte) || !pte_present(pte))
		return 0;

	hmm_flags = range->flags[HMM_PFN_VALID];
	if (pte_write(pte))
		hmm_flags |= range->flags[HMM_PFN_WRITE];

	return hmm_flags;
}
2018-04-11 02:28:59 +03:00
/*
 * hmm_vma_handle_pte() - fill one HMM pfn entry from one CPU pte
 * @walk: mm_walk structure
 * @addr: virtual address mapped by @ptep
 * @end: range virtual end address (exclusive), used when faulting
 * @pmdp: pmd entry pointing to the pte directory
 * @ptep: mapped pte to inspect
 * @pfn: HMM pfn entry to update; its incoming value carries the caller's
 *       request flags
 * Returns: 0 on success with @ptep still mapped; on any non-zero return
 *          @ptep has been unmapped by this function
 */
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;

	*pfn = range->values[HMM_PFN_NONE];
	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);

	if (pte_none(pte)) {
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry, ignore migration, use
		 * device and report anything else as error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			/* Re-evaluate now that the device flags are known. */
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(vma->vm_mm,
						     pmdp, addr);
				return -EAGAIN;
			}
			return 0;
		}

		/*
		 * Report error for everything else. The caller assumes the
		 * pte has been unmapped whenever we return non-zero, so
		 * unmap it here as well.
		 */
		*pfn = range->values[HMM_PFN_ERROR];
		pte_unmap(ptep);
		return -EFAULT;
	}

	if (fault || write_fault)
		goto fault;

	*pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

fault:
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
2017-09-09 02:11:31 +03:00
/*
 * hmm_vma_walk_pmd() - page walk callback for one pmd entry
 * @pmdp: pmd entry being walked
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @walk: mm_walk structure
 * Returns: 0 on success, otherwise the error reported by the helper that
 *          handled the entry (-EAGAIN after waiting on a migration entry)
 */
static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	/* HMM pfn entry matching @start. */
	uint64_t *slot = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long addr;
	pte_t *ptep;
	pmd_t pmd;

	for (;;) {
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			return hmm_vma_walk_hole(start, end, walk);

		if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
			return hmm_pfns_bad(start, end, walk);

		if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
			unsigned long npages = (end - start) >> PAGE_SHIFT;
			bool fault, write_fault;

			hmm_range_need_fault(hmm_vma_walk, slot, npages,
					     0, &fault, &write_fault);
			if (!fault && !write_fault)
				return 0;

			hmm_vma_walk->last = start;
			pmd_migration_entry_wait(vma->vm_mm, pmdp);
			return -EAGAIN;
		}

		if (!pmd_present(pmd))
			return hmm_pfns_bad(start, end, walk);

		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			break;

		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read pmd value and check again it is a transparent
		 * huge or device mapping one and compute corresponding pfn
		 * values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (pmd_devmap(pmd) || pmd_trans_huge(pmd))
			return hmm_vma_handle_pmd(walk, start, end, slot, pmd);

		/* The pmd changed under us, start over from a fresh read. */
	}

	/*
	 * We have handled all the valid cases above, i.e. either none,
	 * migration, huge or transparent huge. At this point either it is a
	 * valid pmd entry pointing to a pte directory or it is a bad pmd that
	 * will not recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_bad(start, end, walk);

	addr = start;
	ptep = pte_offset_map(pmdp, addr);
	while (addr < end) {
		int r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, slot);

		if (r) {
			/* hmm_vma_handle_pte() did unmap pte directory */
			hmm_vma_walk->last = addr;
			return r;
		}
		addr += PAGE_SIZE;
		ptep++;
		slot++;
	}
	/* ptep was advanced past the last pte that was handled. */
	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}
2018-04-11 02:29:06 +03:00
/*
 * hmm_pfns_clear() - set the pfn entries of [addr, end) to the "none" value
 * @range: range providing the "none" value
 * @pfns: HMM pfn entry matching @addr
 * @addr: virtual start address (inclusive)
 * @end: virtual end address (exclusive)
 */
static void hmm_pfns_clear(struct hmm_range *range,
			   uint64_t *pfns,
			   unsigned long addr,
			   unsigned long end)
{
	while (addr < end) {
		*pfns = range->values[HMM_PFN_NONE];
		pfns++;
		addr += PAGE_SIZE;
	}
}
2018-04-11 02:28:42 +03:00
static void hmm_pfns_special ( struct hmm_range * range )
{
unsigned long addr = range - > start , i = 0 ;
for ( ; addr < range - > end ; addr + = PAGE_SIZE , i + + )
2018-04-11 02:29:06 +03:00
range - > pfns [ i ] = range - > values [ HMM_PFN_SPECIAL ] ;
2018-04-11 02:28:42 +03:00
}
2017-09-09 02:11:31 +03:00
/*
* hmm_vma_get_pfns ( ) - snapshot CPU page table for a range of virtual addresses
2018-04-11 02:28:30 +03:00
* @ range : range being snapshotted
2018-04-11 02:28:34 +03:00
* Returns : - EINVAL if invalid argument , - ENOMEM out of memory , - EPERM invalid
* vma permission , 0 success
2017-09-09 02:11:31 +03:00
*
* This snapshots the CPU page table for a range of virtual addresses . Snapshot
* validity is tracked by range struct . See hmm_vma_range_done ( ) for further
* information .
*
* The range struct is initialized here . It tracks the CPU page table , but only
* if the function returns success ( 0 ) , in which case the caller must then call
* hmm_vma_range_done ( ) to stop CPU page table update tracking on this range .
*
* NOT CALLING hmm_vma_range_done ( ) IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
* MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
*/
2018-04-11 02:28:30 +03:00
int hmm_vma_get_pfns ( struct hmm_range * range )
2017-09-09 02:11:31 +03:00
{
2018-04-11 02:28:30 +03:00
struct vm_area_struct * vma = range - > vma ;
2017-09-09 02:11:35 +03:00
struct hmm_vma_walk hmm_vma_walk ;
2017-09-09 02:11:31 +03:00
struct mm_walk mm_walk ;
struct hmm * hmm ;
/* Sanity check, this really should not happen ! */
2018-04-11 02:28:30 +03:00
if ( range - > start < vma - > vm_start | | range - > start > = vma - > vm_end )
2017-09-09 02:11:31 +03:00
return - EINVAL ;
2018-04-11 02:28:30 +03:00
if ( range - > end < vma - > vm_start | | range - > end > vma - > vm_end )
2017-09-09 02:11:31 +03:00
return - EINVAL ;
hmm = hmm_register ( vma - > vm_mm ) ;
if ( ! hmm )
return - ENOMEM ;
/* Caller must have registered a mirror, via hmm_mirror_register() ! */
if ( ! hmm - > mmu_notifier . ops )
return - EINVAL ;
2018-04-11 02:28:42 +03:00
/* FIXME support hugetlb fs */
2018-08-18 01:43:40 +03:00
if ( is_vm_hugetlb_page ( vma ) | | ( vma - > vm_flags & VM_SPECIAL ) | |
vma_is_dax ( vma ) ) {
2018-04-11 02:28:42 +03:00
hmm_pfns_special ( range ) ;
return - EINVAL ;
}
2018-04-11 02:28:34 +03:00
if ( ! ( vma - > vm_flags & VM_READ ) ) {
/*
* If vma do not allow read access , then assume that it does
* not allow write access , either . Architecture that allow
* write without read access are not supported by HMM , because
* operations such has atomic access would not work .
*/
2018-04-11 02:29:06 +03:00
hmm_pfns_clear ( range , range - > pfns , range - > start , range - > end ) ;
2018-04-11 02:28:34 +03:00
return - EPERM ;
}
2017-09-09 02:11:31 +03:00
/* Initialize range to track CPU page table update */
spin_lock ( & hmm - > lock ) ;
range - > valid = true ;
list_add_rcu ( & range - > list , & hmm - > ranges ) ;
spin_unlock ( & hmm - > lock ) ;
2017-09-09 02:11:35 +03:00
hmm_vma_walk . fault = false ;
hmm_vma_walk . range = range ;
mm_walk . private = & hmm_vma_walk ;
2017-09-09 02:11:31 +03:00
mm_walk . vma = vma ;
mm_walk . mm = vma - > vm_mm ;
mm_walk . pte_entry = NULL ;
mm_walk . test_walk = NULL ;
mm_walk . hugetlb_entry = NULL ;
mm_walk . pmd_entry = hmm_vma_walk_pmd ;
mm_walk . pte_hole = hmm_vma_walk_hole ;
2018-04-11 02:28:30 +03:00
walk_page_range ( range - > start , range - > end , & mm_walk ) ;
2017-09-09 02:11:31 +03:00
return 0 ;
}
EXPORT_SYMBOL ( hmm_vma_get_pfns ) ;
/*
* hmm_vma_range_done ( ) - stop tracking change to CPU page table over a range
* @ range : range being tracked
* Returns : false if range data has been invalidated , true otherwise
*
* Range struct is used to track updates to the CPU page table after a call to
* either hmm_vma_get_pfns ( ) or hmm_vma_fault ( ) . Once the device driver is done
* using the data , or wants to lock updates to the data it got from those
* functions , it must call the hmm_vma_range_done ( ) function , which will then
* stop tracking CPU page table updates .
*
* Note that device driver must still implement general CPU page table update
* tracking either by using hmm_mirror ( see hmm_mirror_register ( ) ) or by using
* the mmu_notifier API directly .
*
* CPU page table update tracking done through hmm_range is only temporary and
* to be used while trying to duplicate CPU page table contents for a range of
* virtual addresses .
*
* There are two ways to use this :
* again :
2018-04-11 02:28:30 +03:00
* hmm_vma_get_pfns ( range ) ; or hmm_vma_fault ( . . . ) ;
2017-09-09 02:11:31 +03:00
* trans = device_build_page_table_update_transaction ( pfns ) ;
* device_page_table_lock ( ) ;
2018-04-11 02:28:30 +03:00
* if ( ! hmm_vma_range_done ( range ) ) {
2017-09-09 02:11:31 +03:00
* device_page_table_unlock ( ) ;
* goto again ;
* }
* device_commit_transaction ( trans ) ;
* device_page_table_unlock ( ) ;
*
* Or :
2018-04-11 02:28:30 +03:00
* hmm_vma_get_pfns ( range ) ; or hmm_vma_fault ( . . . ) ;
2017-09-09 02:11:31 +03:00
* device_page_table_lock ( ) ;
2018-04-11 02:28:30 +03:00
* hmm_vma_range_done ( range ) ;
* device_update_page_table ( range - > pfns ) ;
2017-09-09 02:11:31 +03:00
* device_page_table_unlock ( ) ;
*/
2018-04-11 02:28:30 +03:00
bool hmm_vma_range_done ( struct hmm_range * range )
2017-09-09 02:11:31 +03:00
{
unsigned long npages = ( range - > end - range - > start ) > > PAGE_SHIFT ;
struct hmm * hmm ;
if ( range - > end < = range - > start ) {
BUG ( ) ;
return false ;
}
2018-04-11 02:28:30 +03:00
hmm = hmm_register ( range - > vma - > vm_mm ) ;
2017-09-09 02:11:31 +03:00
if ( ! hmm ) {
memset ( range - > pfns , 0 , sizeof ( * range - > pfns ) * npages ) ;
return false ;
}
spin_lock ( & hmm - > lock ) ;
list_del_rcu ( & range - > list ) ;
spin_unlock ( & hmm - > lock ) ;
return range - > valid ;
}
EXPORT_SYMBOL ( hmm_vma_range_done ) ;
2017-09-09 02:11:35 +03:00
/*
* hmm_vma_fault ( ) - try to fault some address in a virtual address range
2018-04-11 02:28:30 +03:00
* @ range : range being faulted
2017-09-09 02:11:35 +03:00
* @ block : allow blocking on fault ( if true it sleeps and do not drop mmap_sem )
* Returns : 0 success , error otherwise ( - EAGAIN means mmap_sem have been drop )
*
* This is similar to a regular CPU page fault except that it will not trigger
* any memory migration if the memory being faulted is not accessible by CPUs .
*
2018-04-11 02:28:38 +03:00
* On error , for one virtual address in the range , the function will mark the
* corresponding HMM pfn entry with an error flag .
2017-09-09 02:11:35 +03:00
*
* Expected use pattern :
* retry :
* down_read ( & mm - > mmap_sem ) ;
* // Find vma and address device wants to fault, initialize hmm_pfn_t
* // array accordingly
2018-04-11 02:28:30 +03:00
* ret = hmm_vma_fault ( range , block ) ;
2017-09-09 02:11:35 +03:00
* switch ( ret ) {
* case - EAGAIN :
2018-04-11 02:28:30 +03:00
* hmm_vma_range_done ( range ) ;
2017-09-09 02:11:35 +03:00
* // You might want to rate limit or yield to play nicely, you may
* // also commit any valid pfn in the array assuming that you are
* // getting true from hmm_vma_range_done()
* goto retry ;
* case 0 :
* break ;
2018-04-11 02:28:34 +03:00
* case - ENOMEM :
* case - EINVAL :
* case - EPERM :
2017-09-09 02:11:35 +03:00
* default :
* // Handle error !
* up_read ( & mm - > mmap_sem )
* return ;
* }
* // Take device driver lock that serialize device page table update
* driver_lock_device_page_table_update ( ) ;
2018-04-11 02:28:30 +03:00
* hmm_vma_range_done ( range ) ;
2017-09-09 02:11:35 +03:00
* // Commit pfns we got from hmm_vma_fault()
* driver_unlock_device_page_table_update ( ) ;
* up_read ( & mm - > mmap_sem )
*
* YOU MUST CALL hmm_vma_range_done ( ) AFTER THIS FUNCTION RETURN SUCCESS ( 0 )
* BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
*
* YOU HAVE BEEN WARNED !
*/
2018-04-11 02:29:02 +03:00
int hmm_vma_fault(struct hmm_range *range, bool block)
{
	struct vm_area_struct *vma = range->vma;
	unsigned long start = range->start;
	/*
	 * Both walk descriptors live on the stack. Designated initializers
	 * guarantee that every field or callback not named here is zero
	 * rather than stack garbage the page table walker could test or call.
	 */
	struct hmm_vma_walk hmm_vma_walk = {
		.range = range,
		.last = range->start,
		.fault = true,
		.block = block,
	};
	struct mm_walk mm_walk = {
		.pmd_entry = hmm_vma_walk_pmd,
		.pte_hole = hmm_vma_walk_hole,
		.mm = vma->vm_mm,
		.vma = vma,
		.private = &hmm_vma_walk,
	};
	struct hmm *hmm;
	int ret;

	/* Sanity check, this really should not happen ! */
	if (range->start < vma->vm_start || range->start >= vma->vm_end)
		return -EINVAL;
	if (range->end < vma->vm_start || range->end > vma->vm_end)
		return -EINVAL;

	hmm = hmm_register(vma->vm_mm);
	if (!hmm) {
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -ENOMEM;
	}
	/* Caller must have registered a mirror using hmm_mirror_register() */
	if (!hmm->mmu_notifier.ops)
		return -EINVAL;

	/* FIXME support hugetlb fs */
	if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
			vma_is_dax(vma)) {
		hmm_pfns_special(range);
		return -EINVAL;
	}

	if (!(vma->vm_flags & VM_READ)) {
		/*
		 * If the vma does not allow read access, then assume that it
		 * does not allow write access either. Architectures that allow
		 * write without read access are not supported by HMM, because
		 * operations such as atomic access would not work.
		 */
		hmm_pfns_clear(range, range->pfns, range->start, range->end);
		return -EPERM;
	}

	/* Initialize range to track CPU page table update */
	spin_lock(&hmm->lock);
	range->valid = true;
	list_add_rcu(&range->list, &hmm->ranges);
	spin_unlock(&hmm->lock);

	/*
	 * Restart the walk from the last address it recorded for as long as
	 * it reports -EAGAIN; as a result this function itself never returns
	 * -EAGAIN.
	 */
	do {
		ret = walk_page_range(start, range->end, &mm_walk);
		start = hmm_vma_walk.last;
	} while (ret == -EAGAIN);

	if (ret) {
		unsigned long i;

		/*
		 * Clear every pfn entry from the point where the walk stopped
		 * and stop tracking the range: on error the caller must not
		 * call hmm_vma_range_done() again.
		 */
		i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
		hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
			       range->end);
		hmm_vma_range_done(range);
	}
	return ret;
}
EXPORT_SYMBOL(hmm_vma_fault);
2017-09-09 02:11:27 +03:00
# endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
2017-09-09 02:11:58 +03:00
2017-09-09 02:12:24 +03:00
# if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
2017-09-09 02:11:58 +03:00
/*
 * hmm_vma_alloc_locked_page() - allocate a page for a vma address and lock it
 *
 * @vma: vma the allocation is made for
 * @addr: virtual address the allocation is made for
 * Returns: the newly allocated page, already locked, or NULL if the
 *          allocation failed
 */
struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct page *new_page = alloc_page_vma(GFP_HIGHUSER, vma, addr);

	if (new_page)
		lock_page(new_page);
	return new_page;
}
EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
static void hmm_devmem_ref_release ( struct percpu_ref * ref )
{
struct hmm_devmem * devmem ;
devmem = container_of ( ref , struct hmm_devmem , ref ) ;
complete ( & devmem - > completion ) ;
}
static void hmm_devmem_ref_exit ( void * data )
{
struct percpu_ref * ref = data ;
struct hmm_devmem * devmem ;
devmem = container_of ( ref , struct hmm_devmem , ref ) ;
percpu_ref_exit ( ref ) ;
devm_remove_action ( devmem - > device , & hmm_devmem_ref_exit , data ) ;
}
static void hmm_devmem_ref_kill ( void * data )
{
struct percpu_ref * ref = data ;
struct hmm_devmem * devmem ;
devmem = container_of ( ref , struct hmm_devmem , ref ) ;
percpu_ref_kill ( ref ) ;
wait_for_completion ( & devmem - > completion ) ;
devm_remove_action ( devmem - > device , & hmm_devmem_ref_kill , data ) ;
}
/*
 * pagemap page_fault hook: look up the hmm_devmem stored in the page's
 * pgmap and forward the fault, arguments unchanged, to the driver's
 * ops->fault() callback. Returns whatever that callback returns.
 */
static int hmm_devmem_fault(struct vm_area_struct *vma,
			    unsigned long addr,
			    const struct page *page,
			    unsigned int flags,
			    pmd_t *pmdp)
{
	struct hmm_devmem *owner = page->pgmap->data;
	int result;

	result = owner->ops->fault(owner, vma, addr, page, flags, pmdp);
	return result;
}
static void hmm_devmem_free ( struct page * page , void * data )
{
struct hmm_devmem * devmem = data ;
2018-07-14 07:50:01 +03:00
page - > mapping = NULL ;
2017-09-09 02:11:58 +03:00
devmem - > ops - > free ( devmem , page ) ;
}
/* Held around every lookup, insert and delete on hmm_devmem_radix in this file. */
static DEFINE_MUTEX ( hmm_devmem_lock ) ;
/*
 * Maps a physical section index (physical address >> PA_SECTION_SHIFT) to the
 * hmm_devmem that registered that section; used to detect colliding mappings.
 */
static RADIX_TREE ( hmm_devmem_radix , GFP_KERNEL ) ;
static void hmm_devmem_radix_release ( struct resource * resource )
{
2018-08-18 01:50:07 +03:00
resource_size_t key ;
2017-09-09 02:11:58 +03:00
mutex_lock ( & hmm_devmem_lock ) ;
for ( key = resource - > start ;
key < = resource - > end ;
key + = PA_SECTION_SIZE )
radix_tree_delete ( & hmm_devmem_radix , key > > PA_SECTION_SHIFT ) ;
mutex_unlock ( & hmm_devmem_lock ) ;
}
static void hmm_devmem_release ( struct device * dev , void * data )
{
struct hmm_devmem * devmem = data ;
struct resource * resource = devmem - > resource ;
unsigned long start_pfn , npages ;
struct zone * zone ;
struct page * page ;
if ( percpu_ref_tryget_live ( & devmem - > ref ) ) {
dev_WARN ( dev , " %s: page mapping is still live! \n " , __func__ ) ;
percpu_ref_put ( & devmem - > ref ) ;
}
/* pages are dead and unused, undo the arch mapping */
start_pfn = ( resource - > start & ~ ( PA_SECTION_SIZE - 1 ) ) > > PAGE_SHIFT ;
npages = ALIGN ( resource_size ( resource ) , PA_SECTION_SIZE ) > > PAGE_SHIFT ;
page = pfn_to_page ( start_pfn ) ;
zone = page_zone ( page ) ;
mem_hotplug_begin ( ) ;
2017-09-09 02:12:28 +03:00
if ( resource - > desc = = IORES_DESC_DEVICE_PRIVATE_MEMORY )
2017-12-29 10:53:55 +03:00
__remove_pages ( zone , start_pfn , npages , NULL ) ;
2017-09-09 02:12:28 +03:00
else
arch_remove_memory ( start_pfn < < PAGE_SHIFT ,
2017-12-29 10:53:55 +03:00
npages < < PAGE_SHIFT , NULL ) ;
2017-09-09 02:11:58 +03:00
mem_hotplug_done ( ) ;
hmm_devmem_radix_release ( resource ) ;
}
static int hmm_devmem_pages_create ( struct hmm_devmem * devmem )
{
resource_size_t key , align_start , align_size , align_end ;
struct device * device = devmem - > device ;
int ret , nid , is_ram ;
align_start = devmem - > resource - > start & ~ ( PA_SECTION_SIZE - 1 ) ;
align_size = ALIGN ( devmem - > resource - > start +
resource_size ( devmem - > resource ) ,
PA_SECTION_SIZE ) - align_start ;
is_ram = region_intersects ( align_start , align_size ,
IORESOURCE_SYSTEM_RAM ,
IORES_DESC_NONE ) ;
if ( is_ram = = REGION_MIXED ) {
WARN_ONCE ( 1 , " %s attempted on mixed region %pr \n " ,
__func__ , devmem - > resource ) ;
return - ENXIO ;
}
if ( is_ram = = REGION_INTERSECTS )
return - ENXIO ;
2017-09-09 02:12:28 +03:00
if ( devmem - > resource - > desc = = IORES_DESC_DEVICE_PUBLIC_MEMORY )
devmem - > pagemap . type = MEMORY_DEVICE_PUBLIC ;
else
devmem - > pagemap . type = MEMORY_DEVICE_PRIVATE ;
2017-12-29 10:54:04 +03:00
devmem - > pagemap . res = * devmem - > resource ;
2017-09-09 02:11:58 +03:00
devmem - > pagemap . page_fault = hmm_devmem_fault ;
devmem - > pagemap . page_free = hmm_devmem_free ;
devmem - > pagemap . dev = devmem - > device ;
devmem - > pagemap . ref = & devmem - > ref ;
devmem - > pagemap . data = devmem ;
mutex_lock ( & hmm_devmem_lock ) ;
align_end = align_start + align_size - 1 ;
for ( key = align_start ; key < = align_end ; key + = PA_SECTION_SIZE ) {
struct hmm_devmem * dup ;
2018-04-11 02:29:09 +03:00
dup = radix_tree_lookup ( & hmm_devmem_radix ,
key > > PA_SECTION_SHIFT ) ;
2017-09-09 02:11:58 +03:00
if ( dup ) {
dev_err ( device , " %s: collides with mapping for %s \n " ,
__func__ , dev_name ( dup - > device ) ) ;
mutex_unlock ( & hmm_devmem_lock ) ;
ret = - EBUSY ;
goto error ;
}
ret = radix_tree_insert ( & hmm_devmem_radix ,
key > > PA_SECTION_SHIFT ,
devmem ) ;
if ( ret ) {
dev_err ( device , " %s: failed: %d \n " , __func__ , ret ) ;
mutex_unlock ( & hmm_devmem_lock ) ;
goto error_radix ;
}
}
mutex_unlock ( & hmm_devmem_lock ) ;
nid = dev_to_node ( device ) ;
if ( nid < 0 )
nid = numa_mem_id ( ) ;
mem_hotplug_begin ( ) ;
/*
* For device private memory we call add_pages ( ) as we only need to
* allocate and initialize struct page for the device memory . More -
* over the device memory is un - accessible thus we do not want to
* create a linear mapping for the memory like arch_add_memory ( )
* would do .
2017-09-09 02:12:28 +03:00
*
* For device public memory , which is accesible by the CPU , we do
* want the linear mapping and thus use arch_add_memory ( ) .
2017-09-09 02:11:58 +03:00
*/
2017-09-09 02:12:28 +03:00
if ( devmem - > pagemap . type = = MEMORY_DEVICE_PUBLIC )
2017-12-29 10:53:53 +03:00
ret = arch_add_memory ( nid , align_start , align_size , NULL ,
false ) ;
2017-09-09 02:12:28 +03:00
else
ret = add_pages ( nid , align_start > > PAGE_SHIFT ,
2017-12-29 10:53:53 +03:00
align_size > > PAGE_SHIFT , NULL , false ) ;
2017-09-09 02:11:58 +03:00
if ( ret ) {
mem_hotplug_done ( ) ;
goto error_add_memory ;
}
move_pfn_range_to_zone ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
align_start > > PAGE_SHIFT ,
2017-12-29 10:53:57 +03:00
align_size > > PAGE_SHIFT , NULL ) ;
2017-09-09 02:11:58 +03:00
mem_hotplug_done ( ) ;
2018-10-27 01:07:52 +03:00
/*
* Initialization of the pages has been deferred until now in order
* to allow us to do the work while not holding the hotplug lock .
*/
memmap_init_zone_device ( & NODE_DATA ( nid ) - > node_zones [ ZONE_DEVICE ] ,
align_start > > PAGE_SHIFT ,
align_size > > PAGE_SHIFT , & devmem - > pagemap ) ;
2017-09-09 02:11:58 +03:00
return 0 ;
error_add_memory :
untrack_pfn ( NULL , PHYS_PFN ( align_start ) , align_size ) ;
error_radix :
hmm_devmem_radix_release ( devmem - > resource ) ;
error :
return ret ;
}
static int hmm_devmem_match ( struct device * dev , void * data , void * match_data )
{
struct hmm_devmem * devmem = data ;
return devmem - > resource = = match_data ;
}
static void hmm_devmem_pages_remove ( struct hmm_devmem * devmem )
{
devres_release ( devmem - > device , & hmm_devmem_release ,
& hmm_devmem_match , devmem - > resource ) ;
}
/*
* hmm_devmem_add ( ) - hotplug ZONE_DEVICE memory for device memory
*
* @ ops : memory event device driver callback ( see struct hmm_devmem_ops )
* @ device : device struct to bind the resource too
* @ size : size in bytes of the device memory to add
* Returns : pointer to new hmm_devmem struct ERR_PTR otherwise
*
* This function first finds an empty range of physical address big enough to
* contain the new resource , and then hotplugs it as ZONE_DEVICE memory , which
* in turn allocates struct pages . It does not do anything beyond that ; all
* events affecting the memory will go through the various callbacks provided
* by hmm_devmem_ops struct .
*
* Device driver should call this function during device initialization and
* is then responsible of memory management . HMM only provides helpers .
*/
struct hmm_devmem * hmm_devmem_add ( const struct hmm_devmem_ops * ops ,
struct device * device ,
unsigned long size )
{
struct hmm_devmem * devmem ;
resource_size_t addr ;
int ret ;
2018-05-16 21:46:08 +03:00
dev_pagemap_get_ops ( ) ;
2017-09-09 02:11:58 +03:00
devmem = devres_alloc_node ( & hmm_devmem_release , sizeof ( * devmem ) ,
GFP_KERNEL , dev_to_node ( device ) ) ;
if ( ! devmem )
return ERR_PTR ( - ENOMEM ) ;
init_completion ( & devmem - > completion ) ;
devmem - > pfn_first = - 1UL ;
devmem - > pfn_last = - 1UL ;
devmem - > resource = NULL ;
devmem - > device = device ;
devmem - > ops = ops ;
ret = percpu_ref_init ( & devmem - > ref , & hmm_devmem_ref_release ,
0 , GFP_KERNEL ) ;
if ( ret )
goto error_percpu_ref ;
ret = devm_add_action ( device , hmm_devmem_ref_exit , & devmem - > ref ) ;
if ( ret )
goto error_devm_add_action ;
size = ALIGN ( size , PA_SECTION_SIZE ) ;
addr = min ( ( unsigned long ) iomem_resource . end ,
( 1UL < < MAX_PHYSMEM_BITS ) - 1 ) ;
addr = addr - size + 1UL ;
/*
* FIXME add a new helper to quickly walk resource tree and find free
* range
*
* FIXME what about ioport_resource resource ?
*/
for ( ; addr > size & & addr > = iomem_resource . start ; addr - = size ) {
ret = region_intersects ( addr , size , 0 , IORES_DESC_NONE ) ;
if ( ret ! = REGION_DISJOINT )
continue ;
devmem - > resource = devm_request_mem_region ( device , addr , size ,
dev_name ( device ) ) ;
if ( ! devmem - > resource ) {
ret = - ENOMEM ;
goto error_no_resource ;
}
break ;
}
if ( ! devmem - > resource ) {
ret = - ERANGE ;
goto error_no_resource ;
}
devmem - > resource - > desc = IORES_DESC_DEVICE_PRIVATE_MEMORY ;
devmem - > pfn_first = devmem - > resource - > start > > PAGE_SHIFT ;
devmem - > pfn_last = devmem - > pfn_first +
( resource_size ( devmem - > resource ) > > PAGE_SHIFT ) ;
ret = hmm_devmem_pages_create ( devmem ) ;
if ( ret )
goto error_pages ;
devres_add ( device , devmem ) ;
ret = devm_add_action ( device , hmm_devmem_ref_kill , & devmem - > ref ) ;
if ( ret ) {
hmm_devmem_remove ( devmem ) ;
return ERR_PTR ( ret ) ;
}
return devmem ;
error_pages :
devm_release_mem_region ( device , devmem - > resource - > start ,
resource_size ( devmem - > resource ) ) ;
error_no_resource :
error_devm_add_action :
hmm_devmem_ref_kill ( & devmem - > ref ) ;
hmm_devmem_ref_exit ( & devmem - > ref ) ;
error_percpu_ref :
devres_free ( devmem ) ;
return ERR_PTR ( ret ) ;
}
EXPORT_SYMBOL ( hmm_devmem_add ) ;
2017-09-09 02:12:28 +03:00
struct hmm_devmem * hmm_devmem_add_resource ( const struct hmm_devmem_ops * ops ,
struct device * device ,
struct resource * res )
{
struct hmm_devmem * devmem ;
int ret ;
if ( res - > desc ! = IORES_DESC_DEVICE_PUBLIC_MEMORY )
return ERR_PTR ( - EINVAL ) ;
2018-05-16 21:46:08 +03:00
dev_pagemap_get_ops ( ) ;
2017-09-09 02:12:28 +03:00
devmem = devres_alloc_node ( & hmm_devmem_release , sizeof ( * devmem ) ,
GFP_KERNEL , dev_to_node ( device ) ) ;
if ( ! devmem )
return ERR_PTR ( - ENOMEM ) ;
init_completion ( & devmem - > completion ) ;
devmem - > pfn_first = - 1UL ;
devmem - > pfn_last = - 1UL ;
devmem - > resource = res ;
devmem - > device = device ;
devmem - > ops = ops ;
ret = percpu_ref_init ( & devmem - > ref , & hmm_devmem_ref_release ,
0 , GFP_KERNEL ) ;
if ( ret )
goto error_percpu_ref ;
ret = devm_add_action ( device , hmm_devmem_ref_exit , & devmem - > ref ) ;
if ( ret )
goto error_devm_add_action ;
devmem - > pfn_first = devmem - > resource - > start > > PAGE_SHIFT ;
devmem - > pfn_last = devmem - > pfn_first +
( resource_size ( devmem - > resource ) > > PAGE_SHIFT ) ;
ret = hmm_devmem_pages_create ( devmem ) ;
if ( ret )
goto error_devm_add_action ;
devres_add ( device , devmem ) ;
ret = devm_add_action ( device , hmm_devmem_ref_kill , & devmem - > ref ) ;
if ( ret ) {
hmm_devmem_remove ( devmem ) ;
return ERR_PTR ( ret ) ;
}
return devmem ;
error_devm_add_action :
hmm_devmem_ref_kill ( & devmem - > ref ) ;
hmm_devmem_ref_exit ( & devmem - > ref ) ;
error_percpu_ref :
devres_free ( devmem ) ;
return ERR_PTR ( ret ) ;
}
EXPORT_SYMBOL ( hmm_devmem_add_resource ) ;
2017-09-09 02:11:58 +03:00
/*
* hmm_devmem_remove ( ) - remove device memory ( kill and free ZONE_DEVICE )
*
* @ devmem : hmm_devmem struct use to track and manage the ZONE_DEVICE memory
*
* This will hot - unplug memory that was hotplugged by hmm_devmem_add on behalf
* of the device driver . It will free struct page and remove the resource that
* reserved the physical address range for this device memory .
*/
void hmm_devmem_remove ( struct hmm_devmem * devmem )
{
resource_size_t start , size ;
struct device * device ;
2017-09-09 02:12:28 +03:00
bool cdm = false ;
2017-09-09 02:11:58 +03:00
if ( ! devmem )
return ;
device = devmem - > device ;
start = devmem - > resource - > start ;
size = resource_size ( devmem - > resource ) ;
2017-09-09 02:12:28 +03:00
cdm = devmem - > resource - > desc = = IORES_DESC_DEVICE_PUBLIC_MEMORY ;
2017-09-09 02:11:58 +03:00
hmm_devmem_ref_kill ( & devmem - > ref ) ;
hmm_devmem_ref_exit ( & devmem - > ref ) ;
hmm_devmem_pages_remove ( devmem ) ;
2017-09-09 02:12:28 +03:00
if ( ! cdm )
devm_release_mem_region ( device , start , size ) ;
2017-09-09 02:11:58 +03:00
}
EXPORT_SYMBOL ( hmm_devmem_remove ) ;
2017-09-09 02:12:02 +03:00
/*
* A device driver that wants to handle multiple devices memory through a
* single fake device can use hmm_device to do so . This is purely a helper
* and it is not needed to make use of any HMM functionality .
*/
/* Number of minors reserved by hmm_init() and size of hmm_device_mask. */
# define HMM_DEVICE_MAX 256
/* One bit per minor: set while a hmm_device created by hmm_device_new() uses it. */
static DECLARE_BITMAP ( hmm_device_mask , HMM_DEVICE_MAX ) ;
/* Taken around every search and update of hmm_device_mask. */
static DEFINE_SPINLOCK ( hmm_device_lock ) ;
/* Device class created by hmm_init() and assigned to every hmm_device. */
static struct class * hmm_device_class ;
/* Character device region allocated by hmm_init(); its major is used for every hmm_device. */
static dev_t hmm_device_devt ;
static void hmm_device_release ( struct device * device )
{
struct hmm_device * hmm_device ;
hmm_device = container_of ( device , struct hmm_device , device ) ;
spin_lock ( & hmm_device_lock ) ;
clear_bit ( hmm_device - > minor , hmm_device_mask ) ;
spin_unlock ( & hmm_device_lock ) ;
kfree ( hmm_device ) ;
}
struct hmm_device * hmm_device_new ( void * drvdata )
{
struct hmm_device * hmm_device ;
hmm_device = kzalloc ( sizeof ( * hmm_device ) , GFP_KERNEL ) ;
if ( ! hmm_device )
return ERR_PTR ( - ENOMEM ) ;
spin_lock ( & hmm_device_lock ) ;
hmm_device - > minor = find_first_zero_bit ( hmm_device_mask , HMM_DEVICE_MAX ) ;
if ( hmm_device - > minor > = HMM_DEVICE_MAX ) {
spin_unlock ( & hmm_device_lock ) ;
kfree ( hmm_device ) ;
return ERR_PTR ( - EBUSY ) ;
}
set_bit ( hmm_device - > minor , hmm_device_mask ) ;
spin_unlock ( & hmm_device_lock ) ;
dev_set_name ( & hmm_device - > device , " hmm_device%d " , hmm_device - > minor ) ;
hmm_device - > device . devt = MKDEV ( MAJOR ( hmm_device_devt ) ,
hmm_device - > minor ) ;
hmm_device - > device . release = hmm_device_release ;
dev_set_drvdata ( & hmm_device - > device , drvdata ) ;
hmm_device - > device . class = hmm_device_class ;
device_initialize ( & hmm_device - > device ) ;
return hmm_device ;
}
EXPORT_SYMBOL ( hmm_device_new ) ;
void hmm_device_put ( struct hmm_device * hmm_device )
{
put_device ( & hmm_device - > device ) ;
}
EXPORT_SYMBOL ( hmm_device_put ) ;
/*
 * Initcall: allocate the character device region and create the device class
 * used by hmm_device_new(). Returns 0 on success or the error from
 * alloc_chrdev_region() / class_create(); the region is unregistered again
 * if the class cannot be created.
 */
static int __init hmm_init(void)
{
	int err;

	err = alloc_chrdev_region(&hmm_device_devt, 0, HMM_DEVICE_MAX,
				  " hmm_device ");
	if (err)
		return err;

	hmm_device_class = class_create(THIS_MODULE, " hmm_device ");
	if (!IS_ERR(hmm_device_class))
		return 0;

	unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
	return PTR_ERR(hmm_device_class);
}

device_initcall(hmm_init);
2017-09-09 02:12:24 +03:00
# endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */