#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>

#include <asm/pgtable.h>

#include "internal.h"

static struct page *no_page_table(struct vm_area_struct *vma,
                unsigned int flags)
{
        /*
         * When core dumping an enormous anonymous area that nobody
         * has touched so far, we don't want to allocate unnecessary pages or
         * page tables.  Return error instead of NULL to skip handle_mm_fault,
         * then get_dump_page() will return NULL to leave a hole in the dump.
         * But we can only make this optimization where a hole would surely
         * be zero-filled if handle_mm_fault() actually did handle it.
         */
        if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
                return ERR_PTR(-EFAULT);
        return NULL;
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;

retry:
        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);

        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
        if (!pte_present(pte)) {
                swp_entry_t entry;
                /*
                 * KSM's break_ksm() relies upon recognizing a ksm page
                 * even while it is being migrated, so for that case we
                 * need migration_entry_wait().
                 */
                if (likely(!(flags & FOLL_MIGRATION)))
                        goto no_page;
                if (pte_none(pte) || pte_file(pte))
                        goto no_page;
                entry = pte_to_swp_entry(pte);
                if (!is_migration_entry(entry))
                        goto no_page;
                pte_unmap_unlock(ptep, ptl);
                migration_entry_wait(mm, pmd, address);
                goto retry;
        }
        if ((flags & FOLL_NUMA) && pte_numa(pte))
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte)) {
                pte_unmap_unlock(ptep, ptl);
                return NULL;
        }

        page = vm_normal_page(vma, address, pte);
        if (unlikely(!page)) {
                if ((flags & FOLL_DUMP) ||
                    !is_zero_pfn(pte_pfn(pte)))
                        goto bad_page;
                page = pte_page(pte);
        }

        if (flags & FOLL_GET)
                get_page_foll(page);
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
                        set_page_dirty(page);
                /*
                 * pte_mkyoung() would be more correct here, but atomic care
                 * is needed to avoid losing the dirty bit: it is easier to use
                 * mark_page_accessed().
                 */
                mark_page_accessed(page);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
                 * which might bounce very badly if there is contention.
                 *
                 * If the page is already locked, we don't need to
                 * handle it now - vmscan will handle it later if and
                 * when it attempts to reclaim the page.
                 */
                if (page->mapping && trylock_page(page)) {
                        lru_add_drain();        /* push cached pages to LRU */
                        /*
                         * Because we lock page here, and migration is
                         * blocked by the pte's page reference, and we
                         * know the page is still mapped, we don't even
                         * need to check for file-cache page truncation.
                         */
                        mlock_vma_page(page);
                        unlock_page(page);
                }
        }
        pte_unmap_unlock(ptep, ptl);
        return page;
bad_page:
        pte_unmap_unlock(ptep, ptl);
        return ERR_PTR(-EFAULT);

no_page:
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return NULL;
        return no_page_table(vma, flags);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
                              unsigned int *page_mask)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;

        *page_mask = 0;

        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
        if (!IS_ERR(page)) {
                BUG_ON(flags & FOLL_GET);
                return page;
        }

        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return no_page_table(vma, flags);

        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return no_page_table(vma, flags);
        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
                if (flags & FOLL_GET)
                        return NULL;
                page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                return page;
        }
        if (unlikely(pud_bad(*pud)))
                return no_page_table(vma, flags);

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return no_page_table(vma, flags);
        if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
                page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                if (flags & FOLL_GET) {
                        /*
                         * Refcounts on tail pages are not well-defined and
                         * shouldn't be taken. The caller should handle a NULL
                         * return when trying to follow tail pages.
                         */
                        if (PageHead(page))
                                get_page(page);
                        else
                                page = NULL;
                }
                return page;
        }
        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
                return no_page_table(vma, flags);
        if (pmd_trans_huge(*pmd)) {
                if (flags & FOLL_SPLIT) {
                        split_huge_page_pmd(vma, address, pmd);
                        return follow_page_pte(vma, address, pmd, flags);
                }
                ptl = pmd_lock(mm, pmd);
                if (likely(pmd_trans_huge(*pmd))) {
                        if (unlikely(pmd_trans_splitting(*pmd))) {
                                spin_unlock(ptl);
                                wait_split_huge_page(vma->anon_vma, pmd);
                        } else {
                                page = follow_trans_huge_pmd(vma, address,
                                                             pmd, flags);
                                spin_unlock(ptl);
                                *page_mask = HPAGE_PMD_NR - 1;
                                return page;
                        }
                } else
                        spin_unlock(ptl);
        }
        return follow_page_pte(vma, address, pmd, flags);
}
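
/*
 * Illustrative sketch only (not part of the original file): the calling
 * pattern for follow_page_mask() described in the kernel-doc above.  The
 * helper name lookup_one_user_page() is hypothetical; real callers normally
 * go through the follow_page() wrapper and must hold mmap_sem for read.
 */
#if 0
static struct page *lookup_one_user_page(struct vm_area_struct *vma,
                                         unsigned long addr)
{
        unsigned int page_mask;
        struct page *page;

        /* caller holds down_read(&vma->vm_mm->mmap_sem) */
        page = follow_page_mask(vma, addr, FOLL_GET | FOLL_TOUCH, &page_mask);
        if (!page || IS_ERR(page))
                return NULL;    /* no mapping, or not backed by a page descriptor */
        return page;            /* release with put_page() when done */
}
#endif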

static int get_gate_page(struct mm_struct *mm, unsigned long address,
                unsigned int gup_flags, struct vm_area_struct **vma,
                struct page **page)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret = -EFAULT;

        /* user gate pages are read-only */
        if (gup_flags & FOLL_WRITE)
                return -EFAULT;
        if (address > TASK_SIZE)
                pgd = pgd_offset_k(address);
        else
                pgd = pgd_offset_gate(mm, address);
        BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, address);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return -EFAULT;
        VM_BUG_ON(pmd_trans_huge(*pmd));
        pte = pte_offset_map(pmd, address);
        if (pte_none(*pte))
                goto unmap;
        *vma = get_gate_vma(mm);
        if (!page)
                goto out;
        *page = vm_normal_page(*vma, address, *pte);
        if (!*page) {
                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
                        goto unmap;
                *page = pte_page(*pte);
        }
        get_page(*page);
out:
        ret = 0;
unmap:
        pte_unmap(pte);
        return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, int *nonblocking)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned int fault_flags = 0;
        int ret;

        /* For mlock, just skip the stack guard page. */
        if ((*flags & FOLL_MLOCK) &&
                        (stack_guard_page_start(vma, address) ||
                         stack_guard_page_end(vma, address + PAGE_SIZE)))
                return -ENOENT;
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
        if (nonblocking)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
        if (*flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        if (*flags & FOLL_TRIED) {
                VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
                fault_flags |= FAULT_FLAG_TRIED;
        }

        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
                if (ret & VM_FAULT_SIGBUS)
                        return -EFAULT;
                BUG();
        }

        if (tsk) {
                if (ret & VM_FAULT_MAJOR)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
        }

        if (ret & VM_FAULT_RETRY) {
                if (nonblocking)
                        *nonblocking = 0;
                return -EBUSY;
        }

        /*
         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
         * necessary, even if maybe_mkwrite decided not to set pte_write. We
         * can thus safely do subsequent page lookups as if they were reads.
         * But only do so when looping for pte_write is futile: in some cases
         * userspace may also be wanting to write to the gotten user page,
         * which a read fault here might prevent (a readonly page might get
         * reCOWed by userspace write).
         */
        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
                *flags &= ~FOLL_WRITE;
        return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
        vm_flags_t vm_flags = vma->vm_flags;

        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;

        if (gup_flags & FOLL_WRITE) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
                                return -EFAULT;
                        /*
                         * We used to let the write,force case do COW in a
                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
                         * set a breakpoint in a read-only mapping of an
                         * executable, without corrupting the file (yet only
                         * when that file had been opened for writing!).
                         * Anon pages in shared mappings are surprising: now
                         * just reject it.
                         */
                        if (!is_cow_mapping(vm_flags)) {
                                WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
                                return -EFAULT;
                        }
                }
        } else if (!(vm_flags & VM_READ)) {
                if (!(gup_flags & FOLL_FORCE))
                        return -EFAULT;
                /*
                 * Is there actually any vma we can reach here which does not
                 * have VM_MAYREAD set?
                 */
                if (!(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }
        return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk: task_struct of target task
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying pin behaviour
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long. Or NULL, if caller
 *         only intends to ensure the pages are faulted in.
 * @vmas: array of pointers to vmas corresponding to each page.
 *        Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *nonblocking)
{
        long i = 0;
        unsigned int page_mask;
        struct vm_area_struct *vma = NULL;

        if (!nr_pages)
                return 0;

        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

        /*
         * If FOLL_FORCE is set then do not force a full fault as the hinting
         * fault information is unrelated to the reference behaviour of a task
         * using the address space
         */
        if (!(gup_flags & FOLL_FORCE))
                gup_flags |= FOLL_NUMA;

        do {
                struct page *page;
                unsigned int foll_flags = gup_flags;
                unsigned int page_increm;

                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
                        vma = find_extend_vma(mm, start);
                        if (!vma && in_gate_area(mm, start)) {
                                int ret;
                                ret = get_gate_page(mm, start & PAGE_MASK,
                                                gup_flags, &vma,
                                                pages ? &pages[i] : NULL);
                                if (ret)
                                        return i ? : ret;
                                page_mask = 0;
                                goto next_page;
                        }

                        if (!vma || check_vma_flags(vma, gup_flags))
                                return i ? : -EFAULT;
                        if (is_vm_hugetlb_page(vma)) {
                                i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &nr_pages, i,
                                                gup_flags);
                                continue;
                        }
                }
retry:
                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (unlikely(fatal_signal_pending(current)))
                        return i ? i : -ERESTARTSYS;
                cond_resched();
                page = follow_page_mask(vma, start, foll_flags, &page_mask);
                if (!page) {
                        int ret;
                        ret = faultin_page(tsk, vma, start, &foll_flags,
                                        nonblocking);
                        switch (ret) {
                        case 0:
                                goto retry;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
                                return i ? i : ret;
                        case -EBUSY:
                                return i;
                        case -ENOENT:
                                goto next_page;
                        }
                        BUG();
                }
                if (IS_ERR(page))
                        return i ? i : PTR_ERR(page);
                if (pages) {
                        pages[i] = page;
                        flush_anon_page(vma, page, start);
                        flush_dcache_page(page);
                        page_mask = 0;
                }
next_page:
                if (vmas) {
                        vmas[i] = vma;
                        page_mask = 0;
                }
                page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
        return i;
}
EXPORT_SYMBOL(__get_user_pages);
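
/*
 * Illustrative sketch only (not part of the original file): one way a caller
 * can use the @nonblocking protocol documented above.  The helper name
 * pin_pages_allow_blocking() is hypothetical; the local variable passed as
 * @nonblocking drops to 0 when __get_user_pages had to block, in which case
 * the mmap_sem has already been released and must not be up_read() again.
 */
#if 0
static long pin_pages_allow_blocking(struct task_struct *tsk,
                                     struct mm_struct *mm,
                                     unsigned long start,
                                     unsigned long nr_pages,
                                     struct page **pages)
{
        int locked = 1;
        long ret;

        down_read(&mm->mmap_sem);
        ret = __get_user_pages(tsk, mm, start, nr_pages,
                               FOLL_TOUCH | FOLL_GET, pages, NULL, &locked);
        if (locked)
                up_read(&mm->mmap_sem);
        /* if !locked, __get_user_pages already dropped mmap_sem for us */
        return ret;
}
#endif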

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk: the task_struct to use for page fault accounting, or
 *       NULL if faults are not to be recorded.
 * @mm: mm_struct of target mm
 * @address: user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * This is meant to be called in the specific scenario where for locking
 * reasons we try to access user memory in atomic context (within a
 * pagefault_disable() section); that access returns -EFAULT, and we want to
 * resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags)
{
        struct vm_area_struct *vma;
        vm_flags_t vm_flags;
        int ret;

        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;

        vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
        if (!(vm_flags & vma->vm_flags))
                return -EFAULT;

        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return -EHWPOISON;
                if (ret & VM_FAULT_SIGBUS)
                        return -EFAULT;
                BUG();
        }
        if (tsk) {
                if (ret & VM_FAULT_MAJOR)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
        }
        return 0;
}
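
/*
 * Illustrative sketch only (not part of the original file): the futex-style
 * pattern described in the comment above.  read_user_word() is a
 * hypothetical helper; fault_flags of 0 requests a read fault, and mmap_sem
 * is held for read across the call, as with filemap_fault().
 */
#if 0
static int read_user_word(u32 __user *uaddr, u32 *val)
{
        int ret;

        for (;;) {
                pagefault_disable();
                ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
                pagefault_enable();
                if (!ret)
                        return 0;

                /* the atomic access faulted: resolve it the slow way and retry */
                down_read(&current->mm->mmap_sem);
                ret = fixup_user_fault(current, current->mm,
                                       (unsigned long)uaddr, 0);
                up_read(&current->mm->mmap_sem);
                if (ret)
                        return ret;
        }
}
#endif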

/*
 * get_user_pages() - pin user pages in memory
 * @tsk: the task_struct to use for page fault accounting, or
 *       NULL if faults are not to be recorded.
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @write: whether pages will be written to by the caller
 * @force: whether to force access even when user mapping is currently
 *         protected (but never forces write access to shared mapping).
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long. Or NULL, if caller
 *         only intends to ensure the pages are faulted in.
 * @vmas: array of pointers to vmas corresponding to each page.
 *        Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If write=0, the page must not be written to. If the page is written to,
 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
 * after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 */
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages, int write,
                int force, struct page **pages, struct vm_area_struct **vmas)
{
        int flags = FOLL_TOUCH;

        if (pages)
                flags |= FOLL_GET;
        if (write)
                flags |= FOLL_WRITE;
        if (force)
                flags |= FOLL_FORCE;
        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
                                NULL);
}
EXPORT_SYMBOL(get_user_pages);
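
/*
 * Illustrative sketch only (not part of the original file): a typical
 * get_user_pages() caller pinning a user buffer for kernel-side access and
 * releasing it afterwards.  pin_user_buffer()/unpin_user_buffer() are
 * hypothetical helper names.
 */
#if 0
static long pin_user_buffer(unsigned long start, unsigned long nr_pages,
                            int write, struct page **pages)
{
        struct mm_struct *mm = current->mm;
        long got;

        down_read(&mm->mmap_sem);
        got = get_user_pages(current, mm, start, nr_pages,
                             write, 0 /* force */, pages, NULL);
        up_read(&mm->mmap_sem);
        return got;     /* number pinned, or -errno */
}

static void unpin_user_buffer(struct page **pages, long nr, int dirtied)
{
        long i;

        for (i = 0; i < nr; i++) {
                if (dirtied)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
}
#endif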

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by page_cache_release() or put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;

        if (__get_user_pages(current, current->mm, addr, 1,
                             FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
                             NULL) < 1)
                return NULL;
        flush_cache_page(vma, addr, page_to_pfn(page));
        return page;
}
#endif /* CONFIG_ELF_CORE */
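
/*
 * Illustrative sketch only (not part of the original file): how a core-dump
 * writer might use get_dump_page(), treating a NULL return as a hole.  The
 * emit_page_to_corefile()/emit_hole_to_corefile() helpers are hypothetical
 * stand-ins for the real binfmt_elf dump routines.
 */
#if 0
static int dump_user_range(unsigned long start, unsigned long end)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                struct page *page = get_dump_page(addr);

                if (page) {
                        int ok = emit_page_to_corefile(page);
                        put_page(page);
                        if (!ok)
                                return 0;
                } else if (!emit_hole_to_corefile(PAGE_SIZE)) {
                        return 0;
                }
        }
        return 1;
}
#endif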

/*
 * Generic RCU Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *      pages containing page tables.
 *
 *  *) THP splits will broadcast an IPI, this can be achieved by overriding
 *      pmdp_splitting_flush.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper
 * functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP

#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                         int write, struct page **pages, int *nr)
{
        pte_t *ptep, *ptem;
        int ret = 0;

        ptem = ptep = pte_offset_map(&pmd, addr);
        do {
                /*
                 * In the line below we are assuming that the pte can be read
                 * atomically. If this is not the case for your architecture,
                 * please wrap this in a helper function!
                 *
                 * for an example see gup_get_pte in arch/x86/mm/gup.c
                 */
                pte_t pte = ACCESS_ONCE(*ptep);
                struct page *page;

                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
                 * path
                 */
                if (!pte_present(pte) || pte_special(pte) ||
                        pte_numa(pte) || (write && !pte_write(pte)))
                        goto pte_unmap;

                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);

                if (!page_cache_get_speculative(page))
                        goto pte_unmap;

                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        goto pte_unmap;
                }

                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        ret = 1;

pte_unmap:
        pte_unmap(ptem);
        return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * __get_user_pages_fast implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                         int write, struct page **pages, int *nr)
{
        return 0;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */

static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        struct page *head, *page, *tail;
        int refs;

        if (write && !pmd_write(orig))
                return 0;

        refs = 0;
        head = pmd_page(orig);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        /*
         * Any tail pages need their mapcount reference taken before we
         * return. (This allows the THP code to bump their ref count when
         * they are split into base pages).
         */
        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        struct page *head, *page, *tail;
        int refs;

        if (write && !pud_write(orig))
                return 0;

        refs = 0;
        head = pud_page(orig);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                        unsigned long end, int write,
                        struct page **pages, int *nr)
{
        int refs;
        struct page *head, *page, *tail;

        if (write && !pgd_write(orig))
                return 0;

        refs = 0;
        head = pgd_page(orig);
        page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = ACCESS_ONCE(*pmdp);

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;

                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
                        /*
                         * NUMA hinting faults need to be handled in the GUP
                         * slowpath for accounting purposes and so that they
                         * can be serialised against THP migration.
                         */
                        if (pmd_numa(pmd))
                                return 0;

                        if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
                                          pages, nr))
                                return 0;

                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
                        /*
                         * Architectures can use a different pmd format for
                         * hugetlbfs pages than for THP pages.
                         */
                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
                                         PMD_SHIFT, next, write, pages, nr))
                                return 0;
                } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                         int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = READ_ONCE(*pudp);

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_huge(pud))) {
                        if (!gup_huge_pud(pud, pudp, addr, next, write,
                                          pages, nr))
                                return 0;
                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
                                         PUD_SHIFT, next, write, pages, nr))
                                return 0;
                } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP. It will only return non-negative values.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next, flags;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;

        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        start, len)))
                return 0;

        /*
         * Disable interrupts.  We use the nested form as we can already have
         * interrupts disabled by get_futex_key.
         *
         * With interrupts disabled, we block page table pages from being
         * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
         * for more details.
         *
         * We do not adopt an rcu_read_lock(.) here as we also want to
         * block IPIs that come from THPs splitting.
         */
        local_irq_save(flags);
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = ACCESS_ONCE(*pgdp);

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        break;
                if (unlikely(pgd_huge(pgd))) {
                        if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
                                          pages, &nr))
                                break;
                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
                                         PGDIR_SHIFT, next, write, pages, &nr))
                                break;
                } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        break;
        } while (pgdp++, addr = next, addr != end);
        local_irq_restore(flags);

        return nr;
}

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @write: whether pages will be written to
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        int nr, ret;

        start &= PAGE_MASK;
        nr = __get_user_pages_fast(start, nr_pages, write, pages);
        ret = nr;

        if (nr < nr_pages) {
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                                     nr_pages - nr, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }
        }

        return ret;
}

#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
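
/*
 * Illustrative sketch only (not part of the original file): a typical
 * get_user_pages_fast() caller, e.g. preparing a user buffer for direct IO.
 * map_user_pages_for_io() is a hypothetical helper name; on a partial pin it
 * releases what was pinned and reports failure.
 */
#if 0
static int map_user_pages_for_io(unsigned long uaddr, size_t len, int write,
                                 struct page **pages)
{
        unsigned long first = uaddr >> PAGE_SHIFT;
        unsigned long last = (uaddr + len - 1) >> PAGE_SHIFT;
        int nr_pages = last - first + 1;
        int pinned;

        pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
        if (pinned < nr_pages) {
                /* drop any partial pin (loop is a no-op if pinned <= 0) */
                while (pinned > 0)
                        put_page(pages[--pinned]);
                return -EFAULT;
        }
        return nr_pages;        /* caller put_page()s each page when IO completes */
}
#endif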