2019-05-27 09:55:06 +03:00
// SPDX-License-Identifier: GPL-2.0-or-later
2017-09-09 02:11:23 +03:00
/*
* Copyright 2013 Red Hat Inc .
*
2018-10-31 01:04:06 +03:00
* Authors : Jérôme Glisse < jglisse @ redhat . com >
2017-09-09 02:11:23 +03:00
*/
/*
* Refer to include / linux / hmm . h for information about heterogeneous memory
* management or HMM for short .
*/
2019-08-28 17:19:53 +03:00
# include <linux/pagewalk.h>
2017-09-09 02:11:23 +03:00
# include <linux/hmm.h>
2017-09-09 02:12:02 +03:00
# include <linux/init.h>
2017-09-09 02:11:31 +03:00
# include <linux/rmap.h>
# include <linux/swap.h>
2017-09-09 02:11:23 +03:00
# include <linux/slab.h>
# include <linux/sched.h>
2017-09-09 02:11:58 +03:00
# include <linux/mmzone.h>
# include <linux/pagemap.h>
2017-09-09 02:11:31 +03:00
# include <linux/swapops.h>
# include <linux/hugetlb.h>
2017-09-09 02:11:58 +03:00
# include <linux/memremap.h>
2019-05-23 16:36:46 +03:00
# include <linux/sched/mm.h>
2017-09-09 02:11:46 +03:00
# include <linux/jump_label.h>
2019-05-14 03:20:28 +03:00
# include <linux/dma-mapping.h>
2017-09-09 02:11:27 +03:00
# include <linux/mmu_notifier.h>
2017-09-09 02:11:58 +03:00
# include <linux/memory_hotplug.h>
2017-09-09 02:11:35 +03:00
/*
 * Per-walk state handed to the page table walker callbacks through
 * mm_walk->private.
 */
struct hmm_vma_walk {
	/* Range being walked; results are written into range->pfns[]. */
	struct hmm_range	*range;
	/* dev_pagemap reference cached across get_dev_pagemap() calls. */
	struct dev_pagemap	*pgmap;
	/* Address the walk has progressed to; a restarted walk begins here. */
	unsigned long		last;
	/* HMM_FAULT_* flags passed to hmm_range_fault(). */
	unsigned int		flags;
};
2018-04-11 02:29:02 +03:00
static int hmm_vma_do_fault ( struct mm_walk * walk , unsigned long addr ,
bool write_fault , uint64_t * pfn )
2017-09-09 02:11:35 +03:00
{
2019-05-10 22:53:24 +03:00
unsigned int flags = FAULT_FLAG_REMOTE ;
2017-09-09 02:11:35 +03:00
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
2018-04-11 02:29:06 +03:00
struct hmm_range * range = hmm_vma_walk - > range ;
2017-09-09 02:11:35 +03:00
struct vm_area_struct * vma = walk - > vma ;
2018-08-18 01:44:47 +03:00
vm_fault_t ret ;
2017-09-09 02:11:35 +03:00
2019-08-24 01:17:52 +03:00
if ( ! vma )
goto err ;
2019-07-26 03:56:46 +03:00
if ( hmm_vma_walk - > flags & HMM_FAULT_ALLOW_RETRY )
flags | = FAULT_FLAG_ALLOW_RETRY ;
if ( write_fault )
flags | = FAULT_FLAG_WRITE ;
2018-08-18 01:44:47 +03:00
ret = handle_mm_fault ( vma , addr , flags ) ;
2019-07-24 09:52:58 +03:00
if ( ret & VM_FAULT_RETRY ) {
/* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
2019-05-14 03:19:58 +03:00
return - EAGAIN ;
2019-07-24 09:52:58 +03:00
}
2019-08-24 01:17:52 +03:00
if ( ret & VM_FAULT_ERROR )
goto err ;
2017-09-09 02:11:35 +03:00
2019-05-14 03:19:58 +03:00
return - EBUSY ;
2019-08-24 01:17:52 +03:00
err :
* pfn = range - > values [ HMM_PFN_ERROR ] ;
return - EFAULT ;
2017-09-09 02:11:35 +03:00
}
2019-11-05 01:21:40 +03:00
static int hmm_pfns_fill ( unsigned long addr , unsigned long end ,
struct hmm_range * range , enum hmm_pfn_value_e value )
2017-09-09 02:11:31 +03:00
{
2018-04-11 02:28:38 +03:00
uint64_t * pfns = range - > pfns ;
2017-09-09 02:11:31 +03:00
unsigned long i ;
i = ( addr - range - > start ) > > PAGE_SHIFT ;
for ( ; addr < end ; addr + = PAGE_SIZE , i + + )
2019-11-05 01:21:40 +03:00
pfns [ i ] = range - > values [ value ] ;
2017-09-09 02:11:31 +03:00
return 0 ;
}
2018-04-11 02:28:46 +03:00
/*
2019-07-26 03:56:45 +03:00
* hmm_vma_walk_hole_ ( ) - handle a range lacking valid pmd or pte ( s )
* @ addr : range virtual start address ( inclusive )
2018-04-11 02:28:46 +03:00
* @ end : range virtual end address ( exclusive )
2018-04-11 02:29:02 +03:00
* @ fault : should we fault or not ?
* @ write_fault : write fault ?
2018-04-11 02:28:46 +03:00
* @ walk : mm_walk structure
2019-05-07 02:29:39 +03:00
* Return : 0 on success , - EBUSY after page fault , or page fault error
2018-04-11 02:28:46 +03:00
*
* This function will be called whenever pmd_none ( ) or pte_none ( ) returns true ,
* or whenever there is no page directory covering the virtual address range .
*/
2018-04-11 02:29:02 +03:00
static int hmm_vma_walk_hole_ ( unsigned long addr , unsigned long end ,
bool fault , bool write_fault ,
struct mm_walk * walk )
2017-09-09 02:11:31 +03:00
{
2017-09-09 02:11:35 +03:00
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
2018-04-11 02:28:38 +03:00
uint64_t * pfns = range - > pfns ;
2019-08-06 19:05:45 +03:00
unsigned long i ;
2017-09-09 02:11:31 +03:00
2017-09-09 02:11:35 +03:00
hmm_vma_walk - > last = addr ;
2019-08-06 19:05:45 +03:00
i = ( addr - range - > start ) > > PAGE_SHIFT ;
2019-05-14 03:20:18 +03:00
2019-08-24 01:17:53 +03:00
if ( write_fault & & walk - > vma & & ! ( walk - > vma - > vm_flags & VM_WRITE ) )
return - EPERM ;
2019-08-06 19:05:45 +03:00
for ( ; addr < end ; addr + = PAGE_SIZE , i + + ) {
2018-04-11 02:29:06 +03:00
pfns [ i ] = range - > values [ HMM_PFN_NONE ] ;
2018-04-11 02:29:02 +03:00
if ( fault | | write_fault ) {
2017-09-09 02:11:35 +03:00
int ret ;
2017-09-09 02:11:31 +03:00
2018-04-11 02:29:02 +03:00
ret = hmm_vma_do_fault ( walk , addr , write_fault ,
& pfns [ i ] ) ;
2019-05-14 03:19:58 +03:00
if ( ret ! = - EBUSY )
2017-09-09 02:11:35 +03:00
return ret ;
}
}
2019-05-14 03:19:58 +03:00
return ( fault | | write_fault ) ? - EBUSY : 0 ;
2018-04-11 02:29:02 +03:00
}
static inline void hmm_pte_need_fault ( const struct hmm_vma_walk * hmm_vma_walk ,
uint64_t pfns , uint64_t cpu_flags ,
bool * fault , bool * write_fault )
{
2018-04-11 02:29:06 +03:00
struct hmm_range * range = hmm_vma_walk - > range ;
2019-07-26 03:56:47 +03:00
if ( hmm_vma_walk - > flags & HMM_FAULT_SNAPSHOT )
2018-04-11 02:29:02 +03:00
return ;
2019-05-14 03:20:05 +03:00
/*
* So we not only consider the individual per page request we also
* consider the default flags requested for the range . The API can
2019-07-26 03:56:45 +03:00
* be used 2 ways . The first one where the HMM user coalesces
* multiple page faults into one request and sets flags per pfn for
* those faults . The second one where the HMM user wants to pre -
2019-05-14 03:20:05 +03:00
* fault a range with specific flags . For the latter one it is a
* waste to have the user pre - fill the pfn arrays with a default
* flags value .
*/
pfns = ( pfns & range - > pfn_flags_mask ) | range - > default_flags ;
2018-04-11 02:29:02 +03:00
/* We aren't ask to do anything ... */
2018-04-11 02:29:06 +03:00
if ( ! ( pfns & range - > flags [ HMM_PFN_VALID ] ) )
2018-04-11 02:29:02 +03:00
return ;
2019-07-26 03:56:45 +03:00
/* If this is device memory then only fault if explicitly requested */
2018-04-11 02:29:06 +03:00
if ( ( cpu_flags & range - > flags [ HMM_PFN_DEVICE_PRIVATE ] ) ) {
/* Do we fault on device memory ? */
if ( pfns & range - > flags [ HMM_PFN_DEVICE_PRIVATE ] ) {
* write_fault = pfns & range - > flags [ HMM_PFN_WRITE ] ;
* fault = true ;
}
2018-04-11 02:29:02 +03:00
return ;
}
2018-04-11 02:29:06 +03:00
/* If CPU page table is not valid then we need to fault */
* fault = ! ( cpu_flags & range - > flags [ HMM_PFN_VALID ] ) ;
/* Need to write fault ? */
if ( ( pfns & range - > flags [ HMM_PFN_WRITE ] ) & &
! ( cpu_flags & range - > flags [ HMM_PFN_WRITE ] ) ) {
* write_fault = true ;
2018-04-11 02:29:02 +03:00
* fault = true ;
}
}
static void hmm_range_need_fault ( const struct hmm_vma_walk * hmm_vma_walk ,
const uint64_t * pfns , unsigned long npages ,
uint64_t cpu_flags , bool * fault ,
bool * write_fault )
{
unsigned long i ;
2019-07-26 03:56:47 +03:00
if ( hmm_vma_walk - > flags & HMM_FAULT_SNAPSHOT ) {
2018-04-11 02:29:02 +03:00
* fault = * write_fault = false ;
return ;
}
2019-05-14 03:20:01 +03:00
* fault = * write_fault = false ;
2018-04-11 02:29:02 +03:00
for ( i = 0 ; i < npages ; + + i ) {
hmm_pte_need_fault ( hmm_vma_walk , pfns [ i ] , cpu_flags ,
fault , write_fault ) ;
2019-05-14 03:20:01 +03:00
if ( ( * write_fault ) )
2018-04-11 02:29:02 +03:00
return ;
}
}
static int hmm_vma_walk_hole ( unsigned long addr , unsigned long end ,
2020-02-04 04:36:03 +03:00
__always_unused int depth , struct mm_walk * walk )
2018-04-11 02:29:02 +03:00
{
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
bool fault , write_fault ;
unsigned long i , npages ;
uint64_t * pfns ;
i = ( addr - range - > start ) > > PAGE_SHIFT ;
npages = ( end - addr ) > > PAGE_SHIFT ;
pfns = & range - > pfns [ i ] ;
hmm_range_need_fault ( hmm_vma_walk , pfns , npages ,
0 , & fault , & write_fault ) ;
return hmm_vma_walk_hole_ ( addr , end , fault , write_fault , walk ) ;
}
2018-04-11 02:29:06 +03:00
/*
 * Translate a pmd into HMM pfn flags: 0 for a protnone pmd, otherwise
 * VALID plus WRITE when the pmd is writable.
 */
static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
	uint64_t hmm_flags;

	if (pmd_protnone(pmd))
		return 0;

	hmm_flags = range->flags[HMM_PFN_VALID];
	if (pmd_write(pmd))
		hmm_flags |= range->flags[HMM_PFN_WRITE];

	return hmm_flags;
}
2019-05-14 03:20:21 +03:00
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * hmm_vma_handle_pmd() - fill pfn entries for a huge or devmap pmd
 * @walk: mm_walk structure
 * @addr: virtual start address (inclusive)
 * @end: virtual end address (exclusive)
 * @pfns: output entries, @pfns[0] corresponds to @addr
 * @pmd: snapshot of the pmd covering the span
 *
 * Return: 0 once every entry was filled, -EBUSY when get_dev_pagemap()
 * failed for a devmap pmd, or the result of hmm_vma_walk_hole_() when the
 * pmd is protnone or a fault is required.
 */
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, uint64_t *pfns, pmd_t pmd)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	const unsigned long npages = (end - addr) >> PAGE_SHIFT;
	const uint64_t cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
	bool fault, write_fault;
	unsigned long pfn;
	uint64_t *out;

	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
			     &fault, &write_fault);

	if (pmd_protnone(pmd) || fault || write_fault)
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

	/* pfn of the page backing @addr inside the huge mapping. */
	pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

	for (out = pfns; addr < end; addr += PAGE_SIZE, out++, pfn++) {
		if (pmd_devmap(pmd)) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap))
				return -EBUSY;
		}
		*out = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
	}

	/* Drop the pagemap reference cached while filling the entries. */
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}

	hmm_vma_walk->last = end;
	return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* Declaration only, so that the code below compiles. */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
		       unsigned long end, uint64_t *pfns, pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2018-04-11 02:28:59 +03:00
2018-04-11 02:29:06 +03:00
/*
 * Translate a pte into HMM pfn flags: 0 for a none, non-present or
 * protnone pte, otherwise VALID plus WRITE when the pte is writable.
 */
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
	uint64_t hmm_flags;

	if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
		return 0;

	hmm_flags = range->flags[HMM_PFN_VALID];
	if (pte_write(pte))
		hmm_flags |= range->flags[HMM_PFN_WRITE];

	return hmm_flags;
}
2018-04-11 02:28:59 +03:00
/*
 * hmm_vma_handle_pte() - fill one pfn entry from one pte
 * @walk: mm_walk structure
 * @addr: virtual address mapped by @ptep
 * @end: end of the span being walked, used when a fault is required
 * @pmdp: pmd covering @addr, needed to wait on a migration entry
 * @ptep: mapped pte for @addr
 * @pfn: in: the caller's request flags; out: the resulting HMM pfn entry
 *
 * Return: 0 when *@pfn was filled and @ptep is still mapped. Any non-zero
 * return (-EBUSY, -EFAULT or the result of hmm_vma_walk_hole_()) means
 * @ptep has been unmapped here, which is what the pte loop in
 * hmm_vma_walk_pmd() relies on.
 */
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
			      uint64_t *pfn)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	bool fault, write_fault;
	uint64_t cpu_flags;
	pte_t pte = *ptep;
	uint64_t orig_pfn = *pfn;
	int ret;

	*pfn = range->values[HMM_PFN_NONE];
	fault = write_fault = false;

	if (pte_none(pte)) {
		hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
				   &fault, &write_fault);
		if (fault || write_fault)
			goto fault;
		return 0;
	}

	if (!pte_present(pte)) {
		swp_entry_t entry = pte_to_swp_entry(pte);

		if (!non_swap_entry(entry)) {
			cpu_flags = pte_to_hmm_pfn_flags(range, pte);
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			return 0;
		}

		/*
		 * This is a special swap entry, ignore migration, use
		 * device and report anything else as error.
		 */
		if (is_device_private_entry(entry)) {
			cpu_flags = range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_DEVICE_PRIVATE];
			cpu_flags |= is_write_device_private_entry(entry) ?
				range->flags[HMM_PFN_WRITE] : 0;
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
					   &fault, &write_fault);
			if (fault || write_fault)
				goto fault;
			*pfn = hmm_device_entry_from_pfn(range,
					    swp_offset(entry));
			*pfn |= cpu_flags;
			return 0;
		}

		if (is_migration_entry(entry)) {
			/*
			 * The request has not been evaluated on this path
			 * yet. Without this call fault/write_fault are always
			 * false here and the wait below can never be reached.
			 * The entry is not present, so the CPU flags are 0.
			 */
			hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
					   &fault, &write_fault);
			if (fault || write_fault) {
				pte_unmap(ptep);
				hmm_vma_walk->last = addr;
				migration_entry_wait(walk->mm, pmdp, addr);
				return -EBUSY;
			}
			return 0;
		}

		/* Report error for everything else */
		*pfn = range->values[HMM_PFN_ERROR];
		ret = -EFAULT;
		goto err_unmap;
	}

	cpu_flags = pte_to_hmm_pfn_flags(range, pte);
	hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault)
		goto fault;

	if (pte_devmap(pte)) {
		hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
					      hmm_vma_walk->pgmap);
		if (unlikely(!hmm_vma_walk->pgmap)) {
			ret = -EBUSY;
			goto err_unmap;
		}
	} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
		if (!is_zero_pfn(pte_pfn(pte))) {
			*pfn = range->values[HMM_PFN_SPECIAL];
			ret = -EFAULT;
			goto err_unmap;
		}
		/*
		 * Since each architecture defines a struct page for the zero
		 * page, just fall through and treat it like a normal page.
		 */
	}

	*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
	return 0;

err_unmap:
	/* Every non-zero return must leave the pte unmapped. */
	pte_unmap(ptep);
	return ret;

fault:
	if (hmm_vma_walk->pgmap) {
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}
	pte_unmap(ptep);
	/* Fault any virtual address we were asked to fault */
	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}
2017-09-09 02:11:31 +03:00
/*
 * hmm_vma_walk_pmd() - pmd_entry callback of the page walker
 * @pmdp: pmd covering [@start, @end)
 * @start: virtual start address (inclusive)
 * @end: virtual end address (exclusive)
 * @walk: mm_walk structure
 *
 * Handles, in order: an empty pmd (treated as a hole), a pmd migration
 * entry, a non-present pmd, a huge or devmap pmd, a bad pmd and finally a
 * regular pmd whose ptes are processed one by one.
 *
 * Return: 0 on success, -EBUSY when the walk has to be restarted, or the
 * error reported by the helper that handled the entry.
 */
static int hmm_vma_walk_pmd(pmd_t *pmdp,
			    unsigned long start,
			    unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	unsigned long addr = start;
	pte_t *ptep;
	pmd_t pmd;

again:
	pmd = READ_ONCE(*pmdp);
	if (pmd_none(pmd))
		return hmm_vma_walk_hole(start, end, -1, walk);

	if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
		bool fault, write_fault;

		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     0, &fault, &write_fault);
		if (fault || write_fault) {
			hmm_vma_walk->last = addr;
			pmd_migration_entry_wait(walk->mm, pmdp);
			return -EBUSY;
		}
		/*
		 * No fault requested: still write the output entries, as
		 * the pte level does for a migration entry. Otherwise the
		 * caller reads back its own request flags as the result.
		 */
		return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
	}

	if (!pmd_present(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
		/*
		 * No need to take pmd_lock here, even if some other thread
		 * is splitting the huge pmd we will get that event through
		 * mmu_notifier callback.
		 *
		 * So just read pmd value and check again it's a transparent
		 * huge or device mapping one and compute corresponding pfn
		 * values.
		 */
		pmd = pmd_read_atomic(pmdp);
		barrier();
		if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
			goto again;

		return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
	}

	/*
	 * We have handled all the valid cases above ie either none, migration,
	 * huge or transparent huge. At this point either it is a valid pmd
	 * entry pointing to pte directory or it is a bad pmd that will not
	 * recover.
	 */
	if (pmd_bad(pmd))
		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);

	ptep = pte_offset_map(pmdp, addr);
	for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
		int r;

		r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
		if (r) {
			/*
			 * hmm_vma_handle_pte() is expected to have unmapped
			 * the pte directory. Drop any pagemap reference
			 * cached by earlier iterations so that it is not
			 * leaked when the walk is abandoned.
			 */
			if (hmm_vma_walk->pgmap) {
				put_dev_pagemap(hmm_vma_walk->pgmap);
				hmm_vma_walk->pgmap = NULL;
			}
			hmm_vma_walk->last = addr;
			return r;
		}
	}

	if (hmm_vma_walk->pgmap) {
		/*
		 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
		 * so that we can leverage get_dev_pagemap() optimization which
		 * will not re-take a reference on a pgmap if we already have
		 * one.
		 */
		put_dev_pagemap(hmm_vma_walk->pgmap);
		hmm_vma_walk->pgmap = NULL;
	}

	pte_unmap(ptep - 1);

	hmm_vma_walk->last = addr;
	return 0;
}
2019-08-06 19:05:48 +03:00
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
    defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Translate a pud into HMM pfn flags: 0 for a non-present pud, otherwise
 * VALID plus WRITE when the pud is writable.
 */
static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
	if (!pud_present(pud))
		return 0;
	return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
				range->flags[HMM_PFN_WRITE] :
				range->flags[HMM_PFN_VALID];
}

/*
 * hmm_vma_walk_pud() - pud_entry callback of the page walker
 * @pudp: pud covering [@start, @end)
 * @start: virtual start address (inclusive)
 * @end: virtual end address (exclusive)
 * @walk: mm_walk structure
 *
 * Fills the pfn entries directly for a huge devmap pud. For anything else
 * walk->action is set to ACTION_SUBTREE.
 *
 * The hole helpers can end up in handle_mm_fault(), so the pud lock is
 * always released before they are called.
 *
 * Return: 0 on success, -EBUSY when get_dev_pagemap() failed, or the result
 * of the hole helpers.
 */
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
			    struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	unsigned long addr = start;
	pud_t pud;
	int ret = 0;
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);

	if (!ptl)
		return 0;

	/* Normally we don't want to split the huge page */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_none(pud)) {
		spin_unlock(ptl);
		return hmm_vma_walk_hole(start, end, -1, walk);
	}

	if (pud_huge(pud) && pud_devmap(pud)) {
		unsigned long i, npages, pfn;
		uint64_t *pfns, cpu_flags;
		bool fault, write_fault;

		if (!pud_present(pud)) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole(start, end, -1, walk);
		}

		i = (addr - range->start) >> PAGE_SHIFT;
		npages = (end - addr) >> PAGE_SHIFT;
		pfns = &range->pfns[i];

		cpu_flags = pud_to_hmm_pfn_flags(range, pud);
		hmm_range_need_fault(hmm_vma_walk, pfns, npages,
				     cpu_flags, &fault, &write_fault);
		if (fault || write_fault) {
			spin_unlock(ptl);
			return hmm_vma_walk_hole_(addr, end, fault,
						  write_fault, walk);
		}

		/* pfn of the page backing @addr inside the huge mapping. */
		pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
		for (i = 0; i < npages; ++i, ++pfn) {
			hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
					      hmm_vma_walk->pgmap);
			if (unlikely(!hmm_vma_walk->pgmap)) {
				ret = -EBUSY;
				goto out_unlock;
			}
			pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
				  cpu_flags;
		}
		if (hmm_vma_walk->pgmap) {
			put_dev_pagemap(hmm_vma_walk->pgmap);
			hmm_vma_walk->pgmap = NULL;
		}
		hmm_vma_walk->last = end;
		goto out_unlock;
	}

	/* Ask for the PUD to be split */
	walk->action = ACTION_SUBTREE;

out_unlock:
	spin_unlock(ptl);
	return ret;
}
#else
#define hmm_vma_walk_pud	NULL
#endif
2019-05-14 03:20:21 +03:00
2019-08-06 19:05:50 +03:00
#ifdef CONFIG_HUGETLB_PAGE
/*
 * hmm_vma_walk_hugetlb_entry() - hugetlb_entry callback of the page walker
 * @pte: huge pte covering [@start, @end)
 * @hmask: mask of the huge page, used to find the offset of @start in it
 * @start: virtual start address (inclusive)
 * @end: virtual end address (exclusive)
 * @walk: mm_walk structure
 *
 * The fault decision is taken from the request flags of the first entry
 * only. When a fault is needed the huge pte lock is released and the span
 * is handed to hmm_vma_walk_hole_(); otherwise every entry is filled from
 * the huge pte under the lock.
 *
 * Return: 0 when the entries were filled, else the result of
 * hmm_vma_walk_hole_().
 */
static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
				      unsigned long start, unsigned long end,
				      struct mm_walk *walk)
{
	struct hmm_vma_walk *hmm_vma_walk = walk->private;
	struct hmm_range *range = hmm_vma_walk->range;
	struct vm_area_struct *vma = walk->vma;
	unsigned long addr = start, idx, pfn;
	uint64_t requested, cpu_flags;
	bool fault = false, write_fault = false;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
	entry = huge_ptep_get(pte);

	idx = (start - range->start) >> PAGE_SHIFT;
	requested = range->pfns[idx];
	range->pfns[idx] = range->values[HMM_PFN_NONE];

	cpu_flags = pte_to_hmm_pfn_flags(range, entry);
	hmm_pte_need_fault(hmm_vma_walk, requested, cpu_flags,
			   &fault, &write_fault);
	if (fault || write_fault) {
		/* Faulting is done without the huge pte lock held. */
		spin_unlock(ptl);
		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
	}

	/* pfn of the page backing @start inside the huge page. */
	pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
	while (addr < end) {
		range->pfns[idx] = hmm_device_entry_from_pfn(range, pfn) |
				   cpu_flags;
		addr += PAGE_SIZE;
		idx++;
		pfn++;
	}
	hmm_vma_walk->last = end;

	spin_unlock(ptl);
	return 0;
}
#else
#define hmm_vma_walk_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
2019-05-14 03:20:18 +03:00
2019-11-05 01:21:40 +03:00
static int hmm_vma_walk_test ( unsigned long start , unsigned long end ,
struct mm_walk * walk )
2018-04-11 02:28:54 +03:00
{
2019-11-05 01:21:40 +03:00
struct hmm_vma_walk * hmm_vma_walk = walk - > private ;
struct hmm_range * range = hmm_vma_walk - > range ;
struct vm_area_struct * vma = walk - > vma ;
/*
* Skip vma ranges that don ' t have struct page backing them or
* map I / O devices directly .
*/
if ( vma - > vm_flags & ( VM_IO | VM_PFNMAP | VM_MIXEDMAP ) )
return - EFAULT ;
/*
* If the vma does not allow read access , then assume that it does not
* allow write access either . HMM does not support architectures
* that allow write without read .
*/
if ( ! ( vma - > vm_flags & VM_READ ) ) {
bool fault , write_fault ;
/*
* Check to see if a fault is requested for any page in the
* range .
*/
hmm_range_need_fault ( hmm_vma_walk , range - > pfns +
( ( start - range - > start ) > > PAGE_SHIFT ) ,
( end - start ) > > PAGE_SHIFT ,
0 , & fault , & write_fault ) ;
if ( fault | | write_fault )
return - EFAULT ;
hmm_pfns_fill ( start , end , range , HMM_PFN_NONE ) ;
hmm_vma_walk - > last = end ;
/* Skip this vma and continue processing the next vma. */
return 1 ;
}
return 0 ;
2018-04-11 02:28:54 +03:00
}
2019-08-28 17:19:54 +03:00
static const struct mm_walk_ops hmm_walk_ops = {
. pud_entry = hmm_vma_walk_pud ,
. pmd_entry = hmm_vma_walk_pmd ,
. pte_hole = hmm_vma_walk_hole ,
. hugetlb_entry = hmm_vma_walk_hugetlb_entry ,
2019-11-05 01:21:40 +03:00
. test_walk = hmm_vma_walk_test ,
2019-08-28 17:19:54 +03:00
} ;
2019-07-26 03:56:46 +03:00
/**
* hmm_range_fault - try to fault some address in a virtual address range
* @ range : range being faulted
* @ flags : HMM_FAULT_ * flags
*
* Return : the number of valid pages in range - > pfns [ ] ( from range start
* address ) , which may be zero . On error one of the following status codes
* can be returned :
2019-05-14 03:19:58 +03:00
*
2019-07-26 03:56:46 +03:00
* - EINVAL : Invalid arguments or mm or virtual address is in an invalid vma
* ( e . g . , device file vma ) .
* - ENOMEM : Out of memory .
* - EPERM : Invalid permission ( e . g . , asking for write and range is read
* only ) .
* - EAGAIN : A page fault needs to be retried and mmap_sem was dropped .
* - EBUSY : The range has been invalidated and the caller needs to wait for
* the invalidation to finish .
* - EFAULT : Invalid ( i . e . , either no valid vma or it is illegal to access
* that range ) number of valid pages in range - > pfns [ ] ( from
* range start address ) .
2017-09-09 02:11:35 +03:00
*
* This is similar to a regular CPU page fault except that it will not trigger
2019-05-14 03:19:58 +03:00
* any memory migration if the memory being faulted is not accessible by CPUs
* and caller does not ask for migration .
2017-09-09 02:11:35 +03:00
*
2018-04-11 02:28:38 +03:00
* On error , for one virtual address in the range , the function will mark the
* corresponding HMM pfn entry with an error flag .
2017-09-09 02:11:35 +03:00
*/
2019-07-26 03:56:46 +03:00
long hmm_range_fault ( struct hmm_range * range , unsigned int flags )
2017-09-09 02:11:35 +03:00
{
2019-11-05 01:21:40 +03:00
struct hmm_vma_walk hmm_vma_walk = {
. range = range ,
. last = range - > start ,
. flags = flags ,
} ;
2019-11-12 23:22:30 +03:00
struct mm_struct * mm = range - > notifier - > mm ;
2017-09-09 02:11:35 +03:00
int ret ;
2019-11-12 23:22:20 +03:00
lockdep_assert_held ( & mm - > mmap_sem ) ;
2019-05-14 03:19:48 +03:00
2019-05-14 03:20:01 +03:00
do {
/* If range is no longer valid force retry. */
2019-11-12 23:22:30 +03:00
if ( mmu_interval_check_retry ( range - > notifier ,
range - > notifier_seq ) )
2019-07-24 09:52:52 +03:00
return - EBUSY ;
2019-11-05 01:21:40 +03:00
ret = walk_page_range ( mm , hmm_vma_walk . last , range - > end ,
& hmm_walk_ops , & hmm_vma_walk ) ;
} while ( ret = = - EBUSY ) ;
2017-09-09 02:11:35 +03:00
2019-11-05 01:21:40 +03:00
if ( ret )
return ret ;
2019-05-14 03:19:58 +03:00
return ( hmm_vma_walk . last - range - > start ) > > PAGE_SHIFT ;
2017-09-09 02:11:35 +03:00
}
2019-05-14 03:19:58 +03:00
EXPORT_SYMBOL ( hmm_range_fault ) ;