// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright 2013 Red Hat Inc.
 *
 * Authors: Jérôme Glisse <jglisse@redhat.com>
 */
/*
 * Refer to include/linux/hmm.h for information about heterogeneous memory
 * management or HMM for short.
 */
#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>

static const struct mmu_notifier_ops hmm_mmu_notifier_ops;

/**
 * hmm_get_or_create - register HMM against an mm (HMM internal)
 *
 * @mm: mm struct to attach to
 * Return: an HMM object, either by referencing the existing (per-process)
 *         object, or by creating a new one.
 *
 * This is not intended to be used directly by device drivers. If mm already
 * has an HMM struct then it gets a reference on it and returns it. Otherwise
 * it allocates an HMM struct, initializes it, associates it with the mm and
 * returns it.
 */
static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
        struct hmm *hmm;

        lockdep_assert_held_write(&mm->mmap_sem);

        /* Abuse the page_table_lock to also protect mm->hmm. */
        spin_lock(&mm->page_table_lock);
        hmm = mm->hmm;
        if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
                goto out_unlock;
        spin_unlock(&mm->page_table_lock);

        hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
        if (!hmm)
                return NULL;
        init_waitqueue_head(&hmm->wq);
        INIT_LIST_HEAD(&hmm->mirrors);
        init_rwsem(&hmm->mirrors_sem);
        hmm->mmu_notifier.ops = NULL;
        INIT_LIST_HEAD(&hmm->ranges);
        spin_lock_init(&hmm->ranges_lock);
        kref_init(&hmm->kref);
        hmm->notifiers = 0;
        hmm->mm = mm;

        hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
        if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
                kfree(hmm);
                return NULL;
        }

        mmgrab(hmm->mm);

        /*
         * We hold the exclusive mmap_sem here so we know that mm->hmm is
         * still NULL or 0 kref, and is safe to update.
         */
        spin_lock(&mm->page_table_lock);
        mm->hmm = hmm;

out_unlock:
        spin_unlock(&mm->page_table_lock);
        return hmm;
}

static void hmm_free_rcu(struct rcu_head *rcu)
{
        struct hmm *hmm = container_of(rcu, struct hmm, rcu);

        mmdrop(hmm->mm);
        kfree(hmm);
}

static void hmm_free(struct kref *kref)
{
        struct hmm *hmm = container_of(kref, struct hmm, kref);

        spin_lock(&hmm->mm->page_table_lock);
        if (hmm->mm->hmm == hmm)
                hmm->mm->hmm = NULL;
        spin_unlock(&hmm->mm->page_table_lock);

        mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
        mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
}

static inline void hmm_put(struct hmm *hmm)
{
        kref_put(&hmm->kref, hmm_free);
}

static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;

        /* Bail out if hmm is in the process of being freed */
        if (!kref_get_unless_zero(&hmm->kref))
                return;

        /*
         * Since hmm_range_register() holds the mmget() lock, hmm_release() is
         * prevented as long as a range exists.
         */
        WARN_ON(!list_empty_careful(&hmm->ranges));

        down_read(&hmm->mirrors_sem);
        list_for_each_entry(mirror, &hmm->mirrors, list) {
                /*
                 * Note: The driver is not allowed to trigger
                 * hmm_mirror_unregister() from this thread.
                 */
                if (mirror->ops->release)
                        mirror->ops->release(mirror);
        }
        up_read(&hmm->mirrors_sem);

        hmm_put(hmm);
}

static void notifiers_decrement(struct hmm *hmm)
{
        unsigned long flags;

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        hmm->notifiers--;
        if (!hmm->notifiers) {
                struct hmm_range *range;

                list_for_each_entry(range, &hmm->ranges, list) {
                        if (range->valid)
                                continue;
                        range->valid = true;
                }
                wake_up_all(&hmm->wq);
        }
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);
}

static int hmm_invalidate_range_start(struct mmu_notifier *mn,
                                      const struct mmu_notifier_range *nrange)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
        struct hmm_mirror *mirror;
        struct hmm_update update;
        struct hmm_range *range;
        unsigned long flags;
        int ret = 0;

        if (!kref_get_unless_zero(&hmm->kref))
                return 0;

        update.start = nrange->start;
        update.end = nrange->end;
        update.event = HMM_UPDATE_INVALIDATE;
        update.blockable = mmu_notifier_range_blockable(nrange);

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        hmm->notifiers++;
        list_for_each_entry(range, &hmm->ranges, list) {
                if (update.end < range->start || update.start >= range->end)
                        continue;

                range->valid = false;
        }
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        if (mmu_notifier_range_blockable(nrange))
                down_read(&hmm->mirrors_sem);
        else if (!down_read_trylock(&hmm->mirrors_sem)) {
                ret = -EAGAIN;
                goto out;
        }

        list_for_each_entry(mirror, &hmm->mirrors, list) {
                int rc;

                rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
                if (rc) {
                        if (WARN_ON(update.blockable || rc != -EAGAIN))
                                continue;
                        ret = -EAGAIN;
                        break;
                }
        }
        up_read(&hmm->mirrors_sem);

out:
        if (ret)
                notifiers_decrement(hmm);
        hmm_put(hmm);
        return ret;
}

static void hmm_invalidate_range_end(struct mmu_notifier *mn,
                                     const struct mmu_notifier_range *nrange)
{
        struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);

        if (!kref_get_unless_zero(&hmm->kref))
                return;

        notifiers_decrement(hmm);
        hmm_put(hmm);
}

static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
        .release = hmm_release,
        .invalidate_range_start = hmm_invalidate_range_start,
        .invalidate_range_end = hmm_invalidate_range_end,
};

/*
 * hmm_mirror_register() - register a mirror against an mm
 *
 * @mirror: new mirror struct to register
 * @mm: mm to register against
 * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
 *
 * To start mirroring a process address space, the device driver must register
 * an HMM mirror struct.
 */
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
        lockdep_assert_held_write(&mm->mmap_sem);

        /* Sanity check */
        if (!mm || !mirror || !mirror->ops)
                return -EINVAL;

        mirror->hmm = hmm_get_or_create(mm);
        if (!mirror->hmm)
                return -ENOMEM;

        down_write(&mirror->hmm->mirrors_sem);
        list_add(&mirror->list, &mirror->hmm->mirrors);
        up_write(&mirror->hmm->mirrors_sem);

        return 0;
}
EXPORT_SYMBOL(hmm_mirror_register);
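
/*
 * Illustrative usage sketch (not part of this file): a driver would embed a
 * struct hmm_mirror in its per-process state and register it while holding
 * mmap_sem for write. All my_* names below are hypothetical; only the HMM
 * calls and the hmm_mirror_ops layout come from this API.
 *
 *	static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
 *						 const struct hmm_update *update)
 *	{
 *		// Invalidate device page tables for
 *		// [update->start, update->end) before returning.
 *		return 0;
 *	}
 *
 *	static const struct hmm_mirror_ops my_mirror_ops = {
 *		.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
 *	};
 *
 *	static int my_attach(struct my_state *state, struct mm_struct *mm)
 *	{
 *		int ret;
 *
 *		state->mirror.ops = &my_mirror_ops;
 *		down_write(&mm->mmap_sem);
 *		ret = hmm_mirror_register(&state->mirror, mm);
 *		up_write(&mm->mmap_sem);
 *		return ret;
 *	}
 */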

/*
 * hmm_mirror_unregister() - unregister a mirror
 *
 * @mirror: mirror struct to unregister
 *
 * Stop mirroring a process address space, and cleanup.
 */
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
        struct hmm *hmm = mirror->hmm;

        down_write(&hmm->mirrors_sem);
        list_del(&mirror->list);
        up_write(&hmm->mirrors_sem);
        hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);

struct hmm_vma_walk {
        struct hmm_range        *range;
        struct dev_pagemap      *pgmap;
        unsigned long           last;
        bool                    fault;
        bool                    block;
};

static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
                            bool write_fault, uint64_t *pfn)
{
        unsigned int flags = FAULT_FLAG_REMOTE;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        vm_fault_t ret;

        flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
        flags |= write_fault ? FAULT_FLAG_WRITE : 0;
        ret = handle_mm_fault(vma, addr, flags);
        if (ret & VM_FAULT_RETRY)
                return -EAGAIN;
        if (ret & VM_FAULT_ERROR) {
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        }

        return -EBUSY;
}

static int hmm_pfns_bad(unsigned long addr,
                        unsigned long end,
                        struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i;

        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, i++)
                pfns[i] = range->values[HMM_PFN_ERROR];

        return 0;
}

/*
 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
 * @start: range virtual start address (inclusive)
 * @end: range virtual end address (exclusive)
 * @fault: should we fault or not ?
 * @write_fault: write fault ?
 * @walk: mm_walk structure
 * Return: 0 on success, -EBUSY after page fault, or page fault error
 *
 * This function will be called whenever pmd_none() or pte_none() returns true,
 * or whenever there is no page directory covering the virtual address range.
 */
static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
                              bool fault, bool write_fault,
                              struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        uint64_t *pfns = range->pfns;
        unsigned long i, page_size;

        hmm_vma_walk->last = addr;
        page_size = hmm_range_page_size(range);
        i = (addr - range->start) >> range->page_shift;

        for (; addr < end; addr += page_size, i++) {
                pfns[i] = range->values[HMM_PFN_NONE];
                if (fault || write_fault) {
                        int ret;

                        ret = hmm_vma_do_fault(walk, addr, write_fault,
                                               &pfns[i]);
                        if (ret != -EBUSY)
                                return ret;
                }
        }

        return (fault || write_fault) ? -EBUSY : 0;
}

static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                      uint64_t pfns, uint64_t cpu_flags,
                                      bool *fault, bool *write_fault)
{
        struct hmm_range *range = hmm_vma_walk->range;

        if (!hmm_vma_walk->fault)
                return;

        /*
         * We not only consider the individual per-page request, we also
         * consider the default flags requested for the range. The API can
         * be used in two fashions. The first one is where the HMM user
         * coalesces multiple page faults into one request and sets flags
         * per pfn for those faults. The second one is where the HMM user
         * wants to pre-fault a range with specific flags. For the latter
         * one it is a waste to have the user pre-fill the pfn array with a
         * default flags value.
         */
        pfns = (pfns & range->pfn_flags_mask) | range->default_flags;

        /* We aren't asked to do anything ... */
        if (!(pfns & range->flags[HMM_PFN_VALID]))
                return;
        /* If this is device memory then only fault if explicitly requested */
        if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
                /* Do we fault on device memory ? */
                if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
                        *write_fault = pfns & range->flags[HMM_PFN_WRITE];
                        *fault = true;
                }
                return;
        }

        /* If CPU page table is not valid then we need to fault */
        *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]);
        /* Need to write fault ? */
        if ((pfns & range->flags[HMM_PFN_WRITE]) &&
            !(cpu_flags & range->flags[HMM_PFN_WRITE])) {
                *write_fault = true;
                *fault = true;
        }
}
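
/*
 * Sketch of the second fashion described above (pre-faulting a whole range
 * with one set of flags instead of pre-filling range->pfns[]). This is only
 * an illustration of how default_flags and pfn_flags_mask combine through
 * the masking above; the flag indices are the ones a driver supplies via
 * range->flags[]:
 *
 *	// Request that every page in the range be valid and writable,
 *	// ignoring whatever per-pfn values are currently in range->pfns[].
 *	range->default_flags = range->flags[HMM_PFN_VALID] |
 *			       range->flags[HMM_PFN_WRITE];
 *	range->pfn_flags_mask = 0;
 */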

static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
                                 const uint64_t *pfns, unsigned long npages,
                                 uint64_t cpu_flags, bool *fault,
                                 bool *write_fault)
{
        unsigned long i;

        if (!hmm_vma_walk->fault) {
                *fault = *write_fault = false;
                return;
        }

        *fault = *write_fault = false;
        for (i = 0; i < npages; ++i) {
                hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
                                   fault, write_fault);
                if ((*write_fault))
                        return;
        }
}

static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
                             struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        bool fault, write_fault;
        unsigned long i, npages;
        uint64_t *pfns;

        i = (addr - range->start) >> PAGE_SHIFT;
        npages = (end - addr) >> PAGE_SHIFT;
        pfns = &range->pfns[i];
        hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                             0, &fault, &write_fault);
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
{
        if (pmd_protnone(pmd))
                return 0;
        return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
{
        if (!pud_present(pud))
                return 0;
        return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pmd(struct mm_walk *walk,
                              unsigned long addr,
                              unsigned long end,
                              uint64_t *pfns,
                              pmd_t pmd)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long pfn, npages, i;
        bool fault, write_fault;
        uint64_t cpu_flags;

        npages = (end - addr) >> PAGE_SHIFT;
        cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
        hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
                             &fault, &write_fault);

        if (pmd_protnone(pmd) || fault || write_fault)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        pfn = pmd_pfn(pmd) + pte_index(addr);
        for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
                if (pmd_devmap(pmd)) {
                        hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
                                              hmm_vma_walk->pgmap);
                        if (unlikely(!hmm_vma_walk->pgmap))
                                return -EBUSY;
                }
                pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
        }
        if (hmm_vma_walk->pgmap) {
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        hmm_vma_walk->last = end;
        return 0;
#else
        /* If THP is not enabled then we should never reach this code! */
        return -EINVAL;
#endif
}

static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
        if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
                return 0;
        return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_WRITE] :
                                range->flags[HMM_PFN_VALID];
}

static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
                              unsigned long end, pmd_t *pmdp, pte_t *ptep,
                              uint64_t *pfn)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        bool fault, write_fault;
        uint64_t cpu_flags;
        pte_t pte = *ptep;
        uint64_t orig_pfn = *pfn;

        *pfn = range->values[HMM_PFN_NONE];
        fault = write_fault = false;

        if (pte_none(pte)) {
                hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
                                   &fault, &write_fault);
                if (fault || write_fault)
                        goto fault;
                return 0;
        }

        if (!pte_present(pte)) {
                swp_entry_t entry = pte_to_swp_entry(pte);

                if (!non_swap_entry(entry)) {
                        if (fault || write_fault)
                                goto fault;
                        return 0;
                }

                /*
                 * This is a special swap entry, ignore migration, use
                 * device and report anything else as error.
                 */
                if (is_device_private_entry(entry)) {
                        cpu_flags = range->flags[HMM_PFN_VALID] |
                                range->flags[HMM_PFN_DEVICE_PRIVATE];
                        cpu_flags |= is_write_device_private_entry(entry) ?
                                range->flags[HMM_PFN_WRITE] : 0;
                        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                           &fault, &write_fault);
                        if (fault || write_fault)
                                goto fault;
                        *pfn = hmm_device_entry_from_pfn(range,
                                            swp_offset(entry));
                        *pfn |= cpu_flags;
                        return 0;
                }

                if (is_migration_entry(entry)) {
                        if (fault || write_fault) {
                                pte_unmap(ptep);
                                hmm_vma_walk->last = addr;
                                migration_entry_wait(vma->vm_mm,
                                                     pmdp, addr);
                                return -EBUSY;
                        }
                        return 0;
                }

                /* Report error for everything else */
                *pfn = range->values[HMM_PFN_ERROR];
                return -EFAULT;
        } else {
                cpu_flags = pte_to_hmm_pfn_flags(range, pte);
                hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                                   &fault, &write_fault);
        }

        if (fault || write_fault)
                goto fault;

        if (pte_devmap(pte)) {
                hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
                                              hmm_vma_walk->pgmap);
                if (unlikely(!hmm_vma_walk->pgmap))
                        return -EBUSY;
        } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
                *pfn = range->values[HMM_PFN_SPECIAL];
                return -EFAULT;
        }

        *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
        return 0;

fault:
        if (hmm_vma_walk->pgmap) {
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        pte_unmap(ptep);
        /* Fault any virtual address we were asked to fault */
        return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
}

static int hmm_vma_walk_pmd(pmd_t *pmdp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        uint64_t *pfns = range->pfns;
        unsigned long addr = start, i;
        pte_t *ptep;
        pmd_t pmd;

again:
        pmd = READ_ONCE(*pmdp);
        if (pmd_none(pmd))
                return hmm_vma_walk_hole(start, end, walk);

        if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
                return hmm_pfns_bad(start, end, walk);

        if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
                bool fault, write_fault;
                unsigned long npages;
                uint64_t *pfns;

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                pfns = &range->pfns[i];

                hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                                     0, &fault, &write_fault);
                if (fault || write_fault) {
                        hmm_vma_walk->last = addr;
                        pmd_migration_entry_wait(vma->vm_mm, pmdp);
                        return -EBUSY;
                }
                return 0;
        } else if (!pmd_present(pmd))
                return hmm_pfns_bad(start, end, walk);

        if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
                /*
                 * No need to take pmd_lock here, even if some other thread
                 * is splitting the huge pmd we will get that event through
                 * the mmu_notifier callback.
                 *
                 * So just read the pmd value and check again that it is a
                 * transparent huge or device mapping one, and compute the
                 * corresponding pfn values.
                 */
                pmd = pmd_read_atomic(pmdp);
                barrier();
                if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
                        goto again;

                i = (addr - range->start) >> PAGE_SHIFT;
                return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
        }

        /*
         * We have handled all the valid cases above, i.e. either none,
         * migration, huge or transparent huge. At this point it is either a
         * valid pmd entry pointing to a pte directory or a bad pmd that will
         * not recover.
         */
        if (pmd_bad(pmd))
                return hmm_pfns_bad(start, end, walk);

        ptep = pte_offset_map(pmdp, addr);
        i = (addr - range->start) >> PAGE_SHIFT;
        for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
                int r;

                r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]);
                if (r) {
                        /* hmm_vma_handle_pte() did unmap pte directory */
                        hmm_vma_walk->last = addr;
                        return r;
                }
        }
        if (hmm_vma_walk->pgmap) {
                /*
                 * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
                 * so that we can leverage the get_dev_pagemap() optimization
                 * which will not re-take a reference on a pgmap if we already
                 * have one.
                 */
                put_dev_pagemap(hmm_vma_walk->pgmap);
                hmm_vma_walk->pgmap = NULL;
        }
        pte_unmap(ptep - 1);

        hmm_vma_walk->last = addr;
        return 0;
}

static int hmm_vma_walk_pud(pud_t *pudp,
                            unsigned long start,
                            unsigned long end,
                            struct mm_walk *walk)
{
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        unsigned long addr = start, next;
        pmd_t *pmdp;
        pud_t pud;
        int ret;

again:
        pud = READ_ONCE(*pudp);
        if (pud_none(pud))
                return hmm_vma_walk_hole(start, end, walk);

        if (pud_huge(pud) && pud_devmap(pud)) {
                unsigned long i, npages, pfn;
                uint64_t *pfns, cpu_flags;
                bool fault, write_fault;

                if (!pud_present(pud))
                        return hmm_vma_walk_hole(start, end, walk);

                i = (addr - range->start) >> PAGE_SHIFT;
                npages = (end - addr) >> PAGE_SHIFT;
                pfns = &range->pfns[i];

                cpu_flags = pud_to_hmm_pfn_flags(range, pud);
                hmm_range_need_fault(hmm_vma_walk, pfns, npages,
                                     cpu_flags, &fault, &write_fault);
                if (fault || write_fault)
                        return hmm_vma_walk_hole_(addr, end, fault,
                                                  write_fault, walk);

                pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
                for (i = 0; i < npages; ++i, ++pfn) {
                        hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
                                              hmm_vma_walk->pgmap);
                        if (unlikely(!hmm_vma_walk->pgmap))
                                return -EBUSY;
                        pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
                                  cpu_flags;
                }
                if (hmm_vma_walk->pgmap) {
                        put_dev_pagemap(hmm_vma_walk->pgmap);
                        hmm_vma_walk->pgmap = NULL;
                }
                hmm_vma_walk->last = end;
                return 0;
        }

        split_huge_pud(walk->vma, pudp, addr);
        if (pud_none(*pudp))
                goto again;

        pmdp = pmd_offset(pudp, addr);
        do {
                next = pmd_addr_end(addr, end);
                ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
                if (ret)
                        return ret;
        } while (pmdp++, addr = next, addr != end);

        return 0;
}

static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
                                      unsigned long start, unsigned long end,
                                      struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        unsigned long addr = start, i, pfn, mask, size, pfn_inc;
        struct hmm_vma_walk *hmm_vma_walk = walk->private;
        struct hmm_range *range = hmm_vma_walk->range;
        struct vm_area_struct *vma = walk->vma;
        struct hstate *h = hstate_vma(vma);
        uint64_t orig_pfn, cpu_flags;
        bool fault, write_fault;
        spinlock_t *ptl;
        pte_t entry;
        int ret = 0;

        size = 1UL << huge_page_shift(h);
        mask = size - 1;
        if (range->page_shift != PAGE_SHIFT) {
                /* Make sure we are looking at a full page. */
                if (start & mask)
                        return -EINVAL;
                if (end < (start + size))
                        return -EINVAL;
                pfn_inc = size >> PAGE_SHIFT;
        } else {
                pfn_inc = 1;
                size = PAGE_SIZE;
        }

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(pte);

        i = (start - range->start) >> range->page_shift;
        orig_pfn = range->pfns[i];
        range->pfns[i] = range->values[HMM_PFN_NONE];
        cpu_flags = pte_to_hmm_pfn_flags(range, entry);
        fault = write_fault = false;
        hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
                           &fault, &write_fault);
        if (fault || write_fault) {
                ret = -ENOENT;
                goto unlock;
        }

        pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
        for (; addr < end; addr += size, i++, pfn += pfn_inc)
                range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
                                 cpu_flags;
        hmm_vma_walk->last = end;

unlock:
        spin_unlock(ptl);

        if (ret == -ENOENT)
                return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);

        return ret;
#else /* CONFIG_HUGETLB_PAGE */
        return -EINVAL;
#endif
}

static void hmm_pfns_clear(struct hmm_range *range,
                           uint64_t *pfns,
                           unsigned long addr,
                           unsigned long end)
{
        for (; addr < end; addr += PAGE_SIZE, pfns++)
                *pfns = range->values[HMM_PFN_NONE];
}

/*
 * hmm_range_register() - start tracking change to CPU page table over a range
 * @range: range
 * @mirror: the mirror against which the range is registered
 * @start: start virtual address (inclusive)
 * @end: end virtual address (exclusive)
 * @page_shift: expected page shift for the range
 * Return: 0 on success, -EFAULT if the address space is no longer valid
 *
 * Track updates to the CPU page table; see include/linux/hmm.h
 */
int hmm_range_register(struct hmm_range *range,
                       struct hmm_mirror *mirror,
                       unsigned long start,
                       unsigned long end,
                       unsigned page_shift)
{
        unsigned long mask = ((1UL << page_shift) - 1UL);
        struct hmm *hmm = mirror->hmm;
        unsigned long flags;

        range->valid = false;
        range->hmm = NULL;

        if ((start & mask) || (end & mask))
                return -EINVAL;
        if (start >= end)
                return -EINVAL;

        range->page_shift = page_shift;
        range->start = start;
        range->end = end;

        /* Prevent hmm_release() from running while the range is valid */
        if (!mmget_not_zero(hmm->mm))
                return -EFAULT;

        /* Initialize range to track CPU page table updates. */
        spin_lock_irqsave(&hmm->ranges_lock, flags);

        range->hmm = hmm;
        kref_get(&hmm->kref);
        list_add(&range->list, &hmm->ranges);

        /*
         * If there are any concurrent notifiers we have to wait for them for
         * the range to be valid (see hmm_range_wait_until_valid()).
         */
        if (!hmm->notifiers)
                range->valid = true;
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        return 0;
}
EXPORT_SYMBOL(hmm_range_register);
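
/*
 * Illustrative lifecycle sketch (not part of this file): a driver allocates
 * range->pfns itself (and supplies its range->flags/values tables, omitted
 * here), registers the range against its mirror, waits for it to become
 * valid, and unregisters it when done. The my_* names are hypothetical;
 * HMM_RANGE_DEFAULT_TIMEOUT and hmm_range_wait_until_valid() come from
 * include/linux/hmm.h.
 *
 *	struct hmm_range range = { .pfns = my_pfns_array, };
 *	int ret;
 *
 *	ret = hmm_range_register(&range, &my_state->mirror,
 *				 my_start, my_end, PAGE_SHIFT);
 *	if (ret)
 *		return ret;
 *	if (!hmm_range_wait_until_valid(&range, HMM_RANGE_DEFAULT_TIMEOUT)) {
 *		hmm_range_unregister(&range);
 *		return -EBUSY;
 *	}
 *	// ... snapshot or fault the range, copy out range->pfns[] ...
 *	hmm_range_unregister(&range);
 */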

/*
 * hmm_range_unregister() - stop tracking change to CPU page table over a range
 * @range: range
 *
 * Range struct is used to track updates to the CPU page table after a call to
 * hmm_range_register(). See include/linux/hmm.h for how to use it.
 */
void hmm_range_unregister(struct hmm_range *range)
{
        struct hmm *hmm = range->hmm;
        unsigned long flags;

        spin_lock_irqsave(&hmm->ranges_lock, flags);
        list_del_init(&range->list);
        spin_unlock_irqrestore(&hmm->ranges_lock, flags);

        /* Drop reference taken by hmm_range_register() */
        mmput(hmm->mm);
        hmm_put(hmm);

        /*
         * The range is now invalid and the ref on the hmm is dropped, so
         * poison the pointer. Leave other fields in place, for the caller's
         * use.
         */
        range->valid = false;
        memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
}
EXPORT_SYMBOL(hmm_range_unregister);

/*
 * hmm_range_snapshot() - snapshot CPU page table for a range
 * @range: range
 * Return: number of valid pages in range->pfns[] (from range start address)
 *          on success. Otherwise: -EINVAL if invalid argument, -ENOMEM if out
 *          of memory, -EPERM if invalid permission (for instance asking for
 *          write and range is read only), -EBUSY if you need to retry, or
 *          -EFAULT if the range is invalid (ie either no valid vma or it is
 *          illegal to access that range).
 *
 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
 * validity is tracked by range struct. See include/linux/hmm.h for an example
 * of how to use it.
 */
long hmm_range_snapshot(struct hmm_range *range)
{
        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
        unsigned long start = range->start, end;
        struct hmm_vma_walk hmm_vma_walk;
        struct hmm *hmm = range->hmm;
        struct vm_area_struct *vma;
        struct mm_walk mm_walk;

        lockdep_assert_held(&hmm->mm->mmap_sem);
        do {
                /* If range is no longer valid force retry. */
                if (!range->valid)
                        return -EBUSY;

                vma = find_vma(hmm->mm, start);
                if (vma == NULL || (vma->vm_flags & device_vma))
                        return -EFAULT;

                if (is_vm_hugetlb_page(vma)) {
                        if (huge_page_shift(hstate_vma(vma)) !=
                                    range->page_shift &&
                            range->page_shift != PAGE_SHIFT)
                                return -EINVAL;
                } else {
                        if (range->page_shift != PAGE_SHIFT)
                                return -EINVAL;
                }

                if (!(vma->vm_flags & VM_READ)) {
                        /*
                         * If the vma does not allow read access, then assume
                         * that it does not allow write access either. HMM
                         * does not support architectures that allow write
                         * without read.
                         */
                        hmm_pfns_clear(range, range->pfns,
                                       range->start, range->end);
                        return -EPERM;
                }

                range->vma = vma;
                hmm_vma_walk.pgmap = NULL;
                hmm_vma_walk.last = start;
                hmm_vma_walk.fault = false;
                hmm_vma_walk.range = range;
                mm_walk.private = &hmm_vma_walk;
                end = min(range->end, vma->vm_end);

                mm_walk.vma = vma;
                mm_walk.mm = vma->vm_mm;
                mm_walk.pte_entry = NULL;
                mm_walk.test_walk = NULL;
                mm_walk.hugetlb_entry = NULL;
                mm_walk.pud_entry = hmm_vma_walk_pud;
                mm_walk.pmd_entry = hmm_vma_walk_pmd;
                mm_walk.pte_hole = hmm_vma_walk_hole;
                mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

                walk_page_range(start, end, &mm_walk);
                start = end;
        } while (start < range->end);

        return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_snapshot);
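
/*
 * Sketch of the retry pattern hmm_range_snapshot() expects (illustrative
 * only): the caller holds mmap_sem for read and retries while the range is
 * being invalidated. The out label and surrounding driver code are
 * hypothetical.
 *
 *	long ret;
 *
 *	down_read(&mm->mmap_sem);
 * again:
 *	ret = hmm_range_snapshot(&range);
 *	if (ret == -EBUSY) {
 *		up_read(&mm->mmap_sem);
 *		if (!hmm_range_wait_until_valid(&range,
 *						HMM_RANGE_DEFAULT_TIMEOUT))
 *			goto out;	// give up
 *		down_read(&mm->mmap_sem);
 *		goto again;
 *	}
 *	up_read(&mm->mmap_sem);
 */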

/*
 * hmm_range_fault() - try to fault some address in a virtual address range
 * @range: range being faulted
 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
 * Return: number of valid pages in range->pfns[] (from range start
 *          address). This may be zero. If the return value is negative,
 *          then one of the following values may be returned:
 *
 *           -EINVAL: invalid arguments or mm or virtual address is in an
 *                    invalid vma (for instance device file vma).
 *           -ENOMEM: Out of memory.
 *           -EPERM:  Invalid permission (for instance asking for write and
 *                    range is read only).
 *           -EAGAIN: If you need to retry and mmap_sem was dropped. This can
 *                    only happen if the block argument is false.
 *           -EBUSY:  If the range is being invalidated and you should wait
 *                    for invalidation to finish.
 *           -EFAULT: Invalid (ie either no valid vma or it is illegal to
 *                    access that range), number of valid pages in
 *                    range->pfns[] (from range start address).
 *
 * This is similar to a regular CPU page fault except that it will not trigger
 * any memory migration if the memory being faulted is not accessible by CPUs
 * and caller does not ask for migration.
 *
 * On error, for one virtual address in the range, the function will mark the
 * corresponding HMM pfn entry with an error flag.
 */
long hmm_range_fault(struct hmm_range *range, bool block)
{
        const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
        unsigned long start = range->start, end;
        struct hmm_vma_walk hmm_vma_walk;
        struct hmm *hmm = range->hmm;
        struct vm_area_struct *vma;
        struct mm_walk mm_walk;
        int ret;

        lockdep_assert_held(&hmm->mm->mmap_sem);

        do {
                /* If range is no longer valid force retry. */
                if (!range->valid)
                        return -EBUSY;

                vma = find_vma(hmm->mm, start);
                if (vma == NULL || (vma->vm_flags & device_vma))
                        return -EFAULT;

                if (is_vm_hugetlb_page(vma)) {
                        if (huge_page_shift(hstate_vma(vma)) !=
                            range->page_shift &&
                            range->page_shift != PAGE_SHIFT)
                                return -EINVAL;
                } else {
                        if (range->page_shift != PAGE_SHIFT)
                                return -EINVAL;
                }

                if (!(vma->vm_flags & VM_READ)) {
                        /*
                         * If the vma does not allow read access, then assume
                         * that it does not allow write access either. HMM
                         * does not support architectures that allow write
                         * without read.
                         */
                        hmm_pfns_clear(range, range->pfns,
                                       range->start, range->end);
                        return -EPERM;
                }

                range->vma = vma;
                hmm_vma_walk.pgmap = NULL;
                hmm_vma_walk.last = start;
                hmm_vma_walk.fault = true;
                hmm_vma_walk.block = block;
                hmm_vma_walk.range = range;
                mm_walk.private = &hmm_vma_walk;
                end = min(range->end, vma->vm_end);

                mm_walk.vma = vma;
                mm_walk.mm = vma->vm_mm;
                mm_walk.pte_entry = NULL;
                mm_walk.test_walk = NULL;
                mm_walk.hugetlb_entry = NULL;
                mm_walk.pud_entry = hmm_vma_walk_pud;
                mm_walk.pmd_entry = hmm_vma_walk_pmd;
                mm_walk.pte_hole = hmm_vma_walk_hole;
                mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;

                do {
                        ret = walk_page_range(start, end, &mm_walk);
                        start = hmm_vma_walk.last;

                        /* Keep trying while the range is valid. */
                } while (ret == -EBUSY && range->valid);

                if (ret) {
                        unsigned long i;

                        i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
                        hmm_pfns_clear(range, &range->pfns[i],
                                       hmm_vma_walk.last, range->end);
                        return ret;
                }
                start = end;

        } while (start < range->end);

        return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
EXPORT_SYMBOL(hmm_range_fault);
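
/*
 * Illustrative call (not part of this file): fault in the whole registered
 * range with blocking allowed, while holding mmap_sem for read. A negative
 * return is an error; otherwise it is the number of valid pages written to
 * range->pfns[]. The surrounding mm/range variables are assumed to come from
 * the registration sketch above.
 *
 *	long ret;
 *
 *	down_read(&mm->mmap_sem);
 *	ret = hmm_range_fault(&range, true);
 *	up_read(&mm->mmap_sem);
 *	if (ret < 0)
 *		return ret;
 */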

/**
 * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
 * @range: range being faulted
 * @device: device to dma map pages against
 * @daddrs: dma address of mapped pages
 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
 * Return: number of pages mapped on success, -EAGAIN if mmap_sem has been
 *          dropped and you need to try again, some other error value otherwise
 *
 * Note same usage pattern as hmm_range_fault().
 */
long hmm_range_dma_map(struct hmm_range *range,
                       struct device *device,
                       dma_addr_t *daddrs,
                       bool block)
{
        unsigned long i, npages, mapped;
        long ret;

        ret = hmm_range_fault(range, block);
        if (ret <= 0)
                return ret ? ret : -EBUSY;

        npages = (range->end - range->start) >> PAGE_SHIFT;
        for (i = 0, mapped = 0; i < npages; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                /*
                 * FIXME need to update DMA API to provide invalid DMA address
                 * value instead of a function to test dma address value. This
                 * would remove a lot of dumb code duplicated across many archs.
                 *
                 * For now setting it to 0 here is good enough as the pfns[]
                 * value is what is used to check what is valid and what isn't.
                 */
                daddrs[i] = 0;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                /* Check if range is being invalidated */
                if (!range->valid) {
                        ret = -EBUSY;
                        goto unmap;
                }

                /* If it is read and write then map bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
                        dir = DMA_BIDIRECTIONAL;

                daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
                if (dma_mapping_error(device, daddrs[i])) {
                        ret = -EFAULT;
                        goto unmap;
                }

                mapped++;
        }

        return mapped;

unmap:
        for (npages = i, i = 0; (i < npages) && mapped; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                if (dma_mapping_error(device, daddrs[i]))
                        continue;

                /* If it is read and write then map bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
                        dir = DMA_BIDIRECTIONAL;

                dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
                mapped--;
        }

        return ret;
}
EXPORT_SYMBOL(hmm_range_dma_map);

/**
 * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
 * @range: range being unmapped
 * @vma: the vma against which the range was mapped (optional)
 * @device: device against which dma map was done
 * @daddrs: dma address of mapped pages
 * @dirty: dirty page if it had the write flag set
 * Return: number of pages unmapped on success, -EINVAL otherwise
 *
 * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
 * by the sync_cpu_device_pagetables() callback so that it is safe here to
 * call set_page_dirty(). Caller must also take appropriate locks to avoid
 * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
 */
long hmm_range_dma_unmap(struct hmm_range *range,
                         struct vm_area_struct *vma,
                         struct device *device,
                         dma_addr_t *daddrs,
                         bool dirty)
{
        unsigned long i, npages;
        long cpages = 0;

        /* Sanity check. */
        if (range->end <= range->start)
                return -EINVAL;
        if (!daddrs)
                return -EINVAL;
        if (!range->pfns)
                return -EINVAL;

        npages = (range->end - range->start) >> PAGE_SHIFT;
        for (i = 0; i < npages; ++i) {
                enum dma_data_direction dir = DMA_TO_DEVICE;
                struct page *page;

                page = hmm_device_entry_to_page(range, range->pfns[i]);
                if (page == NULL)
                        continue;

                /* If it is read and write then it was mapped bi-directional. */
                if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
                        dir = DMA_BIDIRECTIONAL;

                        /*
                         * See comments in function description on why it is
                         * safe here to call set_page_dirty()
                         */
                        if (dirty)
                                set_page_dirty(page);
                }

                /* Unmap and clear pfns/dma address */
                dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
                range->pfns[i] = range->values[HMM_PFN_NONE];
                /* FIXME see comments in hmm_range_dma_map() */
                daddrs[i] = 0;
                cpages++;
        }

        return cpages;
}
EXPORT_SYMBOL(hmm_range_dma_unmap);
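
/*
 * Illustrative pairing of the two helpers above (not part of this file): the
 * daddrs array is allocated by the caller with one entry per page in the
 * range, and my_dev is a hypothetical struct device. The map call must be
 * made with mmap_sem held for read, same as hmm_range_fault().
 *
 *	long mapped;
 *
 *	down_read(&mm->mmap_sem);
 *	mapped = hmm_range_dma_map(&range, my_dev, daddrs, true);
 *	up_read(&mm->mmap_sem);
 *	if (mapped < 0)
 *		return mapped;
 *	// ... program the device with daddrs[] ...
 *	hmm_range_dma_unmap(&range, NULL, my_dev, daddrs, true);
 */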