2005-04-17 02:20:36 +04:00
/*
* linux / mm / filemap . c
*
* Copyright ( C ) 1994 - 1999 Linus Torvalds
*/
/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
# include <linux/config.h>
# include <linux/module.h>
# include <linux/slab.h>
# include <linux/compiler.h>
# include <linux/fs.h>
# include <linux/aio.h>
# include <linux/kernel_stat.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/mman.h>
# include <linux/pagemap.h>
# include <linux/file.h>
# include <linux/uio.h>
# include <linux/hash.h>
# include <linux/writeback.h>
# include <linux/pagevec.h>
# include <linux/blkdev.h>
# include <linux/security.h>
# include <linux/syscalls.h>
/*
* This is needed for the following functions :
* - try_to_release_page
* - block_invalidatepage
* - generic_osync_inode
*
* FIXME : remove all knowledge of the buffer layer from the core VM
*/
# include <linux/buffer_head.h> /* for generic_osync_inode */
# include <asm/uaccess.h>
# include <asm/mman.h>
/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */
/*
 * Lock ordering:
 *
 *  ->i_mmap_lock		(vmtruncate)
 *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
 *      ->swap_list_lock
 *        ->swap_device_lock	(exclusive_swap_page, others)
 *          ->mapping->tree_lock
 *
 *  ->i_sem
 *    ->i_mmap_lock		(truncate->unmap_mapping_range)
 *
 *  ->mmap_sem
 *    ->i_mmap_lock
 *      ->page_table_lock	(various places, mainly in mmap.c)
 *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_sem
 *    ->lock_page		(access_process_vm)
 *
 *  ->mmap_sem
 *    ->i_sem			(msync)
 *
 *  ->i_sem
 *    ->i_alloc_sem		(various)
 *
 *  ->inode_lock
 *    ->sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
 *  ->i_mmap_lock
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock
 *    ->swap_device_lock	(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->tree_lock		(try_to_unmap_one)
 *    ->zone.lru_lock		(follow_page->mark_page_accessed)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->tree_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
 *  ->task->proc_lock
 *    ->dcache_lock		(proc_pid_lookup)
 */
/*
* Remove a page from the page cache and free it . Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe . The caller must hold a write_lock on the mapping ' s tree_lock .
*/
void __remove_from_page_cache ( struct page * page )
{
struct address_space * mapping = page - > mapping ;
radix_tree_delete ( & mapping - > page_tree , page - > index ) ;
page - > mapping = NULL ;
mapping - > nrpages - - ;
pagecache_acct ( - 1 ) ;
}
void remove_from_page_cache ( struct page * page )
{
struct address_space * mapping = page - > mapping ;
if ( unlikely ( ! PageLocked ( page ) ) )
PAGE_BUG ( page ) ;
write_lock_irq ( & mapping - > tree_lock ) ;
__remove_from_page_cache ( page ) ;
write_unlock_irq ( & mapping - > tree_lock ) ;
}
static int sync_page ( void * word )
{
struct address_space * mapping ;
struct page * page ;
page = container_of ( ( page_flags_t * ) word , struct page , flags ) ;
/*
* FIXME , fercrissake . What is this barrier here for ?
*/
smp_mb ( ) ;
mapping = page_mapping ( page ) ;
if ( mapping & & mapping - > a_ops & & mapping - > a_ops - > sync_page )
mapping - > a_ops - > sync_page ( page ) ;
io_schedule ( ) ;
return 0 ;
}
/**
 * __filemap_fdatawrite_range - start writeback against all of a mapping's
 * dirty pages that lie within the byte offsets <start, end>
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends
 * @sync_mode:	writeback mode stored in the writeback_control
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Returns 0 without doing anything when the mapping is not capable of
 * writeback of dirty pages, otherwise the result of do_writepages().
 */
static int __filemap_fdatawrite_range(struct address_space *mapping,
	loff_t start, loff_t end, int sync_mode)
{
	/* Budget twice the number of pages currently in the mapping. */
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = mapping->nrpages * 2,
		.start = start,
		.end = end,
	};

	if (mapping_cap_writeback_dirty(mapping))
		return do_writepages(mapping, &wbc);

	return 0;
}
/*
 * Start writeback on @mapping in the given @sync_mode, passing 0 for both
 * ends of the byte range to __filemap_fdatawrite_range().
 */
static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	int err = __filemap_fdatawrite_range(mapping, 0, 0, sync_mode);

	return err;
}
/*
 * Start WB_SYNC_ALL ("data integrity") writeback against @mapping.
 * Returns whatever __filemap_fdatawrite() returns.
 */
int filemap_fdatawrite(struct address_space *mapping)
{
	int err = __filemap_fdatawrite(mapping, WB_SYNC_ALL);

	return err;
}
EXPORT_SYMBOL(filemap_fdatawrite);
/*
 * Start WB_SYNC_ALL writeback against the byte range <start, end> of
 * @mapping.
 */
static int filemap_fdatawrite_range(struct address_space *mapping,
	loff_t start, loff_t end)
{
	int err = __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);

	return err;
}
/*
 * Kick off WB_SYNC_NONE writeback against @mapping.
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 */
int filemap_flush(struct address_space *mapping)
{
	int err = __filemap_fdatawrite(mapping, WB_SYNC_NONE);

	return err;
}
EXPORT_SYMBOL(filemap_flush);
/*
 * Wait for writeback to complete against pages indexed by start->end
 * inclusive.
 *
 * Returns 0 straight away when @end lies before @start.  Otherwise the
 * pages tagged PAGECACHE_TAG_WRITEBACK are looked up in batches and waited
 * upon; a page found with PageError set yields -EIO.  Finally the mapping's
 * AS_ENOSPC and AS_EIO flags are tested and cleared; since AS_EIO is checked
 * last, -EIO wins when both are set.
 */
static int wait_on_page_writeback_range(struct address_space *mapping,
				pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	int nr_pages;
	int ret = 0;

	if (end < start)
		return 0;

	pagevec_init(&pvec, 0);

	for (;;) {
		unsigned i;

		if (index > end)
			break;

		/* Never ask for more than a pagevec's worth, nor beyond @end. */
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
				PAGECACHE_TAG_WRITEBACK,
				min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index <= end) {
				wait_on_page_writeback(page);
				if (PageError(page))
					ret = -EIO;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/* Check for outstanding write errors */
	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;

	return ret;
}
/*
 * Write and wait upon all the pages in the passed range.  This is a "data
 * integrity" operation.  It waits upon in-flight writeout before starting and
 * waiting upon new writeout.  If there was an IO error, return it.
 *
 * We need to re-take i_sem during the generic_osync_inode list walk because
 * it is otherwise livelockable.
 *
 * Returns 0 immediately when @count is zero or the mapping cannot write
 * back dirty pages; otherwise the first non-zero result of the
 * write / osync / wait sequence, or 0.
 */
int sync_page_range(struct inode *inode, struct address_space *mapping,
			loff_t pos, size_t count)
{
	pgoff_t first_page = pos >> PAGE_CACHE_SHIFT;
	pgoff_t last_page = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int err;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err != 0)
		return err;

	down(&inode->i_sem);
	err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
	up(&inode->i_sem);
	if (err != 0)
		return err;

	return wait_on_page_writeback_range(mapping, first_page, last_page);
}
EXPORT_SYMBOL(sync_page_range);
/*
 * Same sequence as sync_page_range(), but i_sem is not taken around the
 * generic_osync_inode() call.
 *
 * Note: Holding i_sem across sync_page_range_nolock is not a good idea
 * as it forces O_SYNC writers to different parts of the same file
 * to be serialised right until io completion.
 */
int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
			loff_t pos, size_t count)
{
	pgoff_t first_page = pos >> PAGE_CACHE_SHIFT;
	pgoff_t last_page = (pos + count - 1) >> PAGE_CACHE_SHIFT;
	int err;

	if (!mapping_cap_writeback_dirty(mapping) || !count)
		return 0;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err != 0)
		return err;

	err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
	if (err != 0)
		return err;

	return wait_on_page_writeback_range(mapping, first_page, last_page);
}
EXPORT_SYMBOL(sync_page_range_nolock);
/**
* filemap_fdatawait - walk the list of under - writeback pages of the given
* address space and wait for all of them .
*
* @ mapping : address space structure to wait for
*/
int filemap_fdatawait ( struct address_space * mapping )
{
loff_t i_size = i_size_read ( mapping - > host ) ;
if ( i_size = = 0 )
return 0 ;
return wait_on_page_writeback_range ( mapping , 0 ,
( i_size - 1 ) > > PAGE_CACHE_SHIFT ) ;
}
EXPORT_SYMBOL ( filemap_fdatawait ) ;
int filemap_write_and_wait ( struct address_space * mapping )
{
int retval = 0 ;
if ( mapping - > nrpages ) {
retval = filemap_fdatawrite ( mapping ) ;
if ( retval = = 0 )
retval = filemap_fdatawait ( mapping ) ;
}
return retval ;
}
/*
 * Ranged flavour of filemap_write_and_wait(): start WB_SYNC_ALL writeback
 * on the byte range <lstart, lend> and, if that succeeded, wait on the
 * pages covering that range.  Returns 0 when the mapping holds no pages.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err;

	if (!mapping->nrpages)
		return 0;

	err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL);
	if (err != 0)
		return err;

	return wait_on_page_writeback_range(mapping,
					    lstart >> PAGE_CACHE_SHIFT,
					    lend >> PAGE_CACHE_SHIFT);
}
/*
 * This function is used to add newly allocated pagecache pages:
 * the page is new, so we can just run SetPageLocked() against it.
 * The other page state flags were set by rmqueue().
 *
 * On successful insertion into the mapping's radix tree an extra page
 * reference is taken, the page is locked and bound to (@mapping, @offset),
 * and the page counters are bumped.  Returns 0 on success or the error
 * from radix_tree_preload() / radix_tree_insert().
 *
 * This function does not add the page to the LRU.  The caller must do that.
 */
int add_to_page_cache(struct page *page, struct address_space *mapping,
		pgoff_t offset, int gfp_mask)
{
	int rc = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);

	if (rc != 0)
		return rc;

	write_lock_irq(&mapping->tree_lock);
	rc = radix_tree_insert(&mapping->page_tree, offset, page);
	if (rc == 0) {
		page_cache_get(page);
		SetPageLocked(page);
		page->mapping = mapping;
		page->index = offset;
		mapping->nrpages++;
		pagecache_acct(1);
	}
	write_unlock_irq(&mapping->tree_lock);
	radix_tree_preload_end();

	return rc;
}
EXPORT_SYMBOL(add_to_page_cache);
/*
 * add_to_page_cache() followed, on success only, by lru_cache_add().
 * Returns the result of add_to_page_cache().
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, int gfp_mask)
{
	int err = add_to_page_cache(page, mapping, offset, gfp_mask);

	if (err != 0)
		return err;

	lru_cache_add(page);
	return 0;
}
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages . By using a hash table of
* waitqueues where the bucket discipline is to maintain all
* waiters on the same queue and wake all when any of the pages
* become available , and for the woken contexts to check to be
* sure the appropriate page became available , this saves space
* at a cost of " thundering herd " phenomena during rare hash
* collisions .
*/
static wait_queue_head_t * page_waitqueue ( struct page * page )
{
const struct zone * zone = page_zone ( page ) ;
return & zone - > wait_table [ hash_ptr ( page , zone - > wait_table_bits ) ] ;
}
static inline void wake_up_page ( struct page * page , int bit )
{
__wake_up_bit ( page_waitqueue ( page ) , & page - > flags , bit ) ;
}
/*
 * If bit @bit_nr is set in @page's flags, sleep uninterruptibly on the
 * page's waitqueue via __wait_on_bit(), using sync_page() as the action
 * routine.  Returns immediately when the bit is already clear.
 */
void fastcall wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(waiter, &page->flags, bit_nr);

	if (!test_bit(bit_nr, &page->flags))
		return;

	__wait_on_bit(page_waitqueue(page), &waiter, sync_page,
			TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
/**
 * unlock_page() - unlock a locked page
 *
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The first mb is necessary to safely close the critical section opened by the
 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 * parallel wait_on_page_locked()).
 *
 * It is a BUG() to call this on a page whose lock bit was not set.
 */
void fastcall unlock_page(struct page *page)
{
	int was_locked;

	smp_mb__before_clear_bit();
	was_locked = TestClearPageLocked(page);
	if (!was_locked)
		BUG();
	smp_mb__after_clear_bit();

	wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
/*
 * End writeback against a page and wake anyone waiting on PG_writeback.
 *
 * The writeback bit is test-and-cleared here (BUG() if it was not set)
 * unless the page had its reclaim flag set and rotate_reclaimable_page()
 * returned 0 for it.
 * NOTE(review): presumably rotate_reclaimable_page() clears the writeback
 * bit itself in that case - confirm against its definition.
 */
void end_page_writeback(struct page *page)
{
	int clear_here = 1;

	if (TestClearPageReclaim(page) && !rotate_reclaimable_page(page))
		clear_here = 0;

	if (clear_here) {
		if (!test_clear_page_writeback(page))
			BUG();
	}

	smp_mb__after_clear_bit();
	wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
/*
 * Get a lock on the page, assuming we need to sleep to get it.
 *
 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void fastcall __lock_page(struct page *page)
{
	DEFINE_WAIT_BIT(waiter, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &waiter, sync_page,
				TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
/*
* a rather lightweight function , finding and getting a reference to a
* hashed page atomically .
*/
struct page * find_get_page ( struct address_space * mapping , unsigned long offset )
{
struct page * page ;
read_lock_irq ( & mapping - > tree_lock ) ;
page = radix_tree_lookup ( & mapping - > page_tree , offset ) ;
if ( page )
page_cache_get ( page ) ;
read_unlock_irq ( & mapping - > tree_lock ) ;
return page ;
}
EXPORT_SYMBOL ( find_get_page ) ;
/*
* Same as above , but trylock it instead of incrementing the count .
*/
struct page * find_trylock_page ( struct address_space * mapping , unsigned long offset )
{
struct page * page ;
read_lock_irq ( & mapping - > tree_lock ) ;
page = radix_tree_lookup ( & mapping - > page_tree , offset ) ;
if ( page & & TestSetPageLocked ( page ) )
page = NULL ;
read_unlock_irq ( & mapping - > tree_lock ) ;
return page ;
}
EXPORT_SYMBOL ( find_trylock_page ) ;
/**
* find_lock_page - locate , pin and lock a pagecache page
*
* @ mapping - the address_space to search
* @ offset - the page index
*
* Locates the desired pagecache page , locks it , increments its reference
* count and returns its address .
*
* Returns zero if the page was not present . find_lock_page ( ) may sleep .
*/
struct page * find_lock_page ( struct address_space * mapping ,
unsigned long offset )
{
struct page * page ;
read_lock_irq ( & mapping - > tree_lock ) ;
repeat :
page = radix_tree_lookup ( & mapping - > page_tree , offset ) ;
if ( page ) {
page_cache_get ( page ) ;
if ( TestSetPageLocked ( page ) ) {
read_unlock_irq ( & mapping - > tree_lock ) ;
lock_page ( page ) ;
read_lock_irq ( & mapping - > tree_lock ) ;
/* Has the page been truncated while we slept? */
if ( page - > mapping ! = mapping | | page - > index ! = offset ) {
unlock_page ( page ) ;
page_cache_release ( page ) ;
goto repeat ;
}
}
}
read_unlock_irq ( & mapping - > tree_lock ) ;
return page ;
}
EXPORT_SYMBOL ( find_lock_page ) ;
/**
 * find_or_create_page - locate or add a pagecache page
 *
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Locates a page in the pagecache.  If the page is not present, a new page
 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
 * LRU list.  The returned page is locked and has its reference count
 * incremented.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an atomic
 * allocation!
 *
 * find_or_create_page() returns the desired page's address, or zero on
 * memory exhaustion.
 */
struct page *find_or_create_page(struct address_space *mapping,
		unsigned long index, unsigned int gfp_mask)
{
	struct page *page;
	struct page *spare = NULL;	/* allocated but not (yet) inserted */
	int err;

	for (;;) {
		page = find_lock_page(mapping, index);
		if (page)
			break;

		if (!spare) {
			spare = alloc_page(gfp_mask);
			if (!spare)
				return NULL;
		}

		err = add_to_page_cache_lru(spare, mapping, index, gfp_mask);
		if (err == -EEXIST)
			continue;	/* lost a race: look the page up again */
		if (!err) {
			page = spare;
			spare = NULL;
		}
		break;
	}

	if (spare)
		page_cache_release(spare);
	return page;
}
EXPORT_SYMBOL(find_or_create_page);
/**
 * find_get_pages - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @nr_pages:	The maximum number of pages
 * @pages:	Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
			    unsigned int nr_pages, struct page **pages)
{
	unsigned int found;
	unsigned int n;

	read_lock_irq(&mapping->tree_lock);
	found = radix_tree_gang_lookup(&mapping->page_tree,
				(void **)pages, start, nr_pages);
	n = 0;
	while (n < found) {
		page_cache_get(pages[n]);
		n++;
	}
	read_unlock_irq(&mapping->tree_lock);

	return found;
}
/*
 * Like find_get_pages, except we only return pages which are tagged with
 * `tag'.  We update *index to index the next page for the traversal; it is
 * left untouched when no page was found.
 */
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
			int tag, unsigned int nr_pages, struct page **pages)
{
	unsigned int found;
	unsigned int n;

	read_lock_irq(&mapping->tree_lock);
	found = radix_tree_gang_lookup_tag(&mapping->page_tree,
				(void **)pages, *index, nr_pages, tag);
	n = 0;
	while (n < found) {
		page_cache_get(pages[n]);
		n++;
	}
	if (found != 0)
		*index = pages[found - 1]->index + 1;
	read_unlock_irq(&mapping->tree_lock);

	return found;
}
/*
 * Same as grab_cache_page, but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 *
 * Returns the locked page, or NULL when the cached page is already locked,
 * when allocation fails, or when the new page could not be inserted.
 */
struct page *
grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
	struct page *page = find_get_page(mapping, index);
	unsigned int gfp_mask;

	if (page) {
		/* Already cached: succeed only if the lock is free right now. */
		if (TestSetPageLocked(page)) {
			page_cache_release(page);
			return NULL;
		}
		return page;
	}

	gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS;
	page = alloc_pages(gfp_mask, 0);
	if (!page)
		return NULL;

	if (add_to_page_cache_lru(page, mapping, index, gfp_mask)) {
		page_cache_release(page);
		return NULL;
	}
	return page;
}
EXPORT_SYMBOL(grab_cache_page_nowait);
/*
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level
 * stuff.
 *
 * Starting at *ppos, pages of @mapping are looked up (or created and read
 * in) one at a time and handed to @actor together with the in-page offset
 * and the number of valid bytes.  The loop ends when the actor consumes
 * less than it was offered, when desc->count drops to zero, when the end
 * of the file is reached, or on error (reported through desc->error).
 *
 * On exit the caller's readahead state *_ra and *ppos are updated, and
 * file_accessed() is called when @filp is not NULL.
 *
 * This is really ugly.  But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Note the struct file* is only passed for the use of readpage.  It may be
 * NULL.
 */
void do_generic_mapping_read ( struct address_space * mapping ,
struct file_ra_state * _ra ,
struct file * filp ,
loff_t * ppos ,
read_descriptor_t * desc ,
read_actor_t actor )
{
struct inode * inode = mapping - > host ;
unsigned long index ;
unsigned long end_index ;
unsigned long offset ;
unsigned long last_index ;
unsigned long next_index ;
unsigned long prev_index ;
loff_t isize ;
struct page * cached_page ;
int error ;
/* Work on a private copy of the readahead state; written back at "out". */
struct file_ra_state ra = * _ra ;
/* cached_page: a page allocated here but not yet inserted in the cache. */
cached_page = NULL ;
/* index/offset: page index and in-page byte offset of *ppos. */
index = * ppos > > PAGE_CACHE_SHIFT ;
next_index = index ;
prev_index = ra . prev_page ;
/* last_index: end of the request, rounded up to a page index. */
last_index = ( * ppos + desc - > count + PAGE_CACHE_SIZE - 1 ) > > PAGE_CACHE_SHIFT ;
offset = * ppos & ~ PAGE_CACHE_MASK ;
isize = i_size_read ( inode ) ;
/* Empty file: nothing to read. */
if ( ! isize )
goto out ;
/* end_index: index of the page holding the last byte of the file. */
end_index = ( isize - 1 ) > > PAGE_CACHE_SHIFT ;
for ( ; ; ) {
struct page * page ;
unsigned long nr , ret ;
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE ;
if ( index > = end_index ) {
if ( index > end_index )
goto out ;
/* Last page of the file: only the bytes up to i_size are valid. */
nr = ( ( isize - 1 ) & ~ PAGE_CACHE_MASK ) + 1 ;
if ( nr < = offset ) {
goto out ;
}
}
nr = nr - offset ;
cond_resched ( ) ;
/* Invoke readahead only when we reach the index it last returned. */
if ( index = = next_index )
next_index = page_cache_readahead ( mapping , & ra , filp ,
index , last_index - index ) ;
find_page :
page = find_get_page ( mapping , index ) ;
if ( unlikely ( page = = NULL ) ) {
handle_ra_miss ( mapping , & ra , index ) ;
goto no_cached_page ;
}
if ( ! PageUptodate ( page ) )
goto page_not_up_to_date ;
page_ok :
/* If users can be writing to this page using arbitrary
 * virtual addresses, take care about potential aliasing
 * before reading the page on the kernel side.
 */
if ( mapping_writably_mapped ( mapping ) )
flush_dcache_page ( page ) ;
/*
 * When (part of) the same page is read multiple times
 * in succession, only mark it as accessed the first time.
 */
if ( prev_index ! = index )
mark_page_accessed ( page ) ;
prev_index = index ;
/*
 * Ok, we have the page, and it's up-to-date, so
 * now we can copy it to user space...
 *
 * The actor routine returns how many bytes were actually used..
 * NOTE! This may not be the same as how much of a user buffer
 * we filled up (we may be padding etc), so we can only update
 * "pos" here (the actor routine has to update the user buffer
 * pointers and the remaining count).
 */
ret = actor ( desc , page , offset , nr ) ;
/* Advance the position; carry a full page of offset into index. */
offset + = ret ;
index + = offset > > PAGE_CACHE_SHIFT ;
offset & = ~ PAGE_CACHE_MASK ;
page_cache_release ( page ) ;
/* Keep going only if the actor took everything and more is wanted. */
if ( ret = = nr & & desc - > count )
continue ;
goto out ;
page_not_up_to_date :
/* Get exclusive access to the page ... */
lock_page ( page ) ;
/* Did it get unhashed before we got the lock? */
if ( ! page - > mapping ) {
unlock_page ( page ) ;
page_cache_release ( page ) ;
continue ;
}
/* Did somebody else fill it already? */
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto page_ok ;
}
readpage :
/* Start the actual read. The read will unlock the page. */
error = mapping - > a_ops - > readpage ( filp , page ) ;
if ( unlikely ( error ) )
goto readpage_error ;
/* Not up to date yet: take the page lock and look again. */
if ( ! PageUptodate ( page ) ) {
lock_page ( page ) ;
if ( ! PageUptodate ( page ) ) {
if ( page - > mapping = = NULL ) {
/*
 * invalidate_inode_pages got it
 */
unlock_page ( page ) ;
page_cache_release ( page ) ;
goto find_page ;
}
/* Still in the cache but not up to date: report an I/O error. */
unlock_page ( page ) ;
error = - EIO ;
goto readpage_error ;
}
unlock_page ( page ) ;
}
/*
 * i_size must be checked after we have done ->readpage.
 *
 * Checking i_size after the readpage allows us to calculate
 * the correct value for "nr", which means the zero-filled
 * part of the page is not copied back to userspace (unless
 * another truncate extends the file - this is desired though).
 */
isize = i_size_read ( inode ) ;
end_index = ( isize - 1 ) > > PAGE_CACHE_SHIFT ;
if ( unlikely ( ! isize | | index > end_index ) ) {
page_cache_release ( page ) ;
goto out ;
}
/* nr is the maximum number of bytes to copy from this page */
nr = PAGE_CACHE_SIZE ;
if ( index = = end_index ) {
nr = ( ( isize - 1 ) & ~ PAGE_CACHE_MASK ) + 1 ;
if ( nr < = offset ) {
page_cache_release ( page ) ;
goto out ;
}
}
nr = nr - offset ;
goto page_ok ;
readpage_error :
/* UHHUH! A synchronous read error occurred. Report it */
desc - > error = error ;
page_cache_release ( page ) ;
goto out ;
no_cached_page :
/*
 * Ok, it wasn't cached, so we need to create a new
 * page..
 */
if ( ! cached_page ) {
cached_page = page_cache_alloc_cold ( mapping ) ;
if ( ! cached_page ) {
desc - > error = - ENOMEM ;
goto out ;
}
}
error = add_to_page_cache_lru ( cached_page , mapping ,
index , GFP_KERNEL ) ;
if ( error ) {
/* -EEXIST: a page appeared at this index meanwhile; look it up. */
if ( error = = - EEXIST )
goto find_page ;
desc - > error = error ;
goto out ;
}
/* The spare page is now in the page cache; go and read it in. */
page = cached_page ;
cached_page = NULL ;
goto readpage ;
}
out :
/* Hand the updated readahead state and file position back. */
* _ra = ra ;
* ppos = ( ( loff_t ) index < < PAGE_CACHE_SHIFT ) + offset ;
/* Drop a spare page that was allocated but never inserted. */
if ( cached_page )
page_cache_release ( cached_page ) ;
if ( filp )
file_accessed ( filp ) ;
}
EXPORT_SYMBOL ( do_generic_mapping_read ) ;
/*
 * read_actor_t that copies up to @size bytes (clamped to desc->count) from
 * @page, starting at @offset, to the user buffer in desc->arg.buf.
 *
 * An atomic-kmap copy is tried first; if the destination could not be
 * faulted in, or the atomic copy left bytes uncopied, the copy is redone
 * through a regular kmap().  Bytes that still cannot be copied set
 * desc->error to -EFAULT.  desc->count, desc->written and desc->arg.buf
 * are advanced by the number of bytes copied, which is also returned.
 */
int file_read_actor(read_descriptor_t *desc, struct page *page,
			unsigned long offset, unsigned long size)
{
	unsigned long count = desc->count;
	/* Non-zero until the fast path has copied everything. */
	unsigned long left = 1;
	char *kaddr;

	if (size > count)
		size = count;

	/*
	 * Faults on the destination of a read are common, so do it before
	 * taking the kmap.
	 */
	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
		kaddr = kmap_atomic(page, KM_USER0);
		left = __copy_to_user_inatomic(desc->arg.buf,
						kaddr + offset, size);
		kunmap_atomic(kaddr, KM_USER0);
	}

	if (left != 0) {
		/* Do it the slow way */
		kaddr = kmap(page);
		left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
		kunmap(page);

		if (left) {
			size -= left;
			desc->error = -EFAULT;
		}
	}

	desc->count = count - size;
	desc->written += size;
	desc->arg.buf += size;
	return size;
}
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 *
 * @iocb:    kiocb describing the request; the file is iocb->ki_filp
 * @iov:     array of user buffers to fill, in order
 * @nr_segs: number of entries in @iov
 * @ppos:    file position to read from; advanced by the amount read
 *
 * Returns the number of bytes read, 0 at end of file, or a negative errno:
 * -EINVAL if a segment length or the cumulative length is negative,
 * -EFAULT if the very first segment fails access_ok().  If a later segment
 * fails access_ok(), the request is truncated to the segments before it.
 *
 * For O_DIRECT files the request is handed to generic_file_direct_IO().
 * Otherwise each segment is filled through do_generic_file_read().  The
 * buffered loop stops at the first segment that reports an error or is
 * not filled completely, so a later segment is never filled while an
 * earlier one has a gap; an error is returned only if nothing at all was
 * read.
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t *ppos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg;
	size_t count;

	count = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		count += iv->iov_len;
		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		nr_segs = seg;
		count -= iv->iov_len;	/* This segment is no good */
		break;
	}

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (filp->f_flags & O_DIRECT) {
		loff_t pos = *ppos, size;
		struct address_space *mapping;
		struct inode *inode;

		mapping = filp->f_mapping;
		inode = mapping->host;
		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		if (pos < size) {
			retval = generic_file_direct_IO(READ, iocb,
						iov, pos, nr_segs);
			if (retval >= 0 && !is_sync_kiocb(iocb))
				retval = -EIOCBQUEUED;
			if (retval > 0)
				*ppos = pos + retval;
		}
		file_accessed(filp);
		goto out;
	}

	retval = 0;
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;
			desc.count = iov[seg].iov_len;
			if (desc.count == 0)
				continue;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);
			retval += desc.written;

			/*
			 * Stop at the first error.  It is reported only when
			 * no data at all has been transferred; otherwise the
			 * partial byte count is returned.
			 */
			if (desc.error) {
				if (!retval)
					retval = desc.error;
				break;
			}

			/*
			 * Short read (e.g. end of file): this segment was not
			 * filled completely, so do not move on to the next.
			 */
			if (desc.count > 0)
				break;
		}
	}
out:
	return retval;
}
EXPORT_SYMBOL(__generic_file_aio_read);
/*
 * Single-buffer AIO read entry point: wraps @buf/@count in a one-element
 * iovec and reads at iocb->ki_pos, which must equal @pos (BUG otherwise).
 */
ssize_t
generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
{
	struct iovec vec;

	vec.iov_base = buf;
	vec.iov_len = count;

	BUG_ON(iocb->ki_pos != pos);
	return __generic_file_aio_read(iocb, &vec, 1, &iocb->ki_pos);
}
EXPORT_SYMBOL(generic_file_aio_read);
/*
 * Synchronous read: runs __generic_file_aio_read() on an on-stack sync
 * kiocb and, should the request come back as -EIOCBQUEUED, waits for it
 * to complete with wait_on_sync_kiocb().
 */
ssize_t
generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	struct kiocb sync_iocb;
	struct iovec vec;
	ssize_t nread;

	vec.iov_base = buf;
	vec.iov_len = count;

	init_sync_kiocb(&sync_iocb, filp);
	nread = __generic_file_aio_read(&sync_iocb, &vec, 1, ppos);
	if (nread != -EIOCBQUEUED)
		return nread;

	return wait_on_sync_kiocb(&sync_iocb);
}
EXPORT_SYMBOL(generic_file_read);
/*
 * read_actor_t that pushes up to @size bytes (clamped to desc->count) of
 * @page to the file in desc->arg.data through its ->sendpage() operation.
 * The final argument to ->sendpage() is non-zero when more data remains
 * in the descriptor after this chunk.
 *
 * A negative ->sendpage() result is stored in desc->error and treated as
 * zero bytes sent.  Returns the number of bytes sent.
 */
int file_send_actor(read_descriptor_t *desc, struct page *page, unsigned long offset, unsigned long size)
{
	struct file *out = desc->arg.data;
	unsigned long remaining = desc->count;
	ssize_t sent;

	if (size > remaining)
		size = remaining;

	sent = out->f_op->sendpage(out, page, offset,
				   size, &out->f_pos, size < remaining);
	if (sent < 0) {
		desc->error = sent;
		sent = 0;
	}

	desc->count = remaining - sent;
	desc->written += sent;
	return sent;
}
/*
 * Feed up to @count bytes of @in_file, starting at *ppos, to @actor with
 * @target as the descriptor's opaque data.  Returns the number of bytes
 * the actor accounted as written if that is non-zero, otherwise the
 * descriptor's error code (0 when there was none).  A zero @count returns
 * 0 without touching the file.
 */
ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
			 size_t count, read_actor_t actor, void *target)
{
	read_descriptor_t desc;

	if (count == 0)
		return 0;

	desc.error = 0;
	desc.written = 0;
	desc.count = count;
	desc.arg.data = target;

	do_generic_file_read(in_file, ppos, &desc, actor);

	if (!desc.written)
		return desc.error;
	return desc.written;
}
EXPORT_SYMBOL(generic_file_sendfile);
static ssize_t
do_readahead ( struct address_space * mapping , struct file * filp ,
unsigned long index , unsigned long nr )
{
if ( ! mapping | | ! mapping - > a_ops | | ! mapping - > a_ops - > readpage )
return - EINVAL ;
force_page_cache_readahead ( mapping , filp , index ,
max_sane_readahead ( nr ) ) ;
return 0 ;
}
/*
 * readahead() system call: request readahead of the pages covering
 * @count bytes from @offset in the file open on @fd.
 *
 * Returns -EBADF when @fd is not an open file or the file was not opened
 * for reading, otherwise the result of do_readahead().
 */
asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
{
	struct file *file = fget(fd);
	ssize_t ret = -EBADF;

	if (!file)
		return ret;

	if (file->f_mode & FMODE_READ) {
		struct address_space *mapping = file->f_mapping;
		unsigned long first = offset >> PAGE_CACHE_SHIFT;
		unsigned long last = (offset + count - 1) >> PAGE_CACHE_SHIFT;

		ret = do_readahead(mapping, file, first, last - first + 1);
	}
	fput(file);

	return ret;
}
# ifdef CONFIG_MMU
/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 *
 * Returns -ENOMEM if no page could be allocated, the ->readpage() result
 * if the new page was inserted, 0 if a page already existed at @offset,
 * or the insertion error otherwise.
 */
static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
static int fastcall page_cache_read(struct file * file, unsigned long offset)
{
	struct address_space *mapping = file->f_mapping;
	struct page *fresh;
	int err;

	fresh = page_cache_alloc_cold(mapping);
	if (fresh == NULL)
		return -ENOMEM;

	err = add_to_page_cache_lru(fresh, mapping, offset, GFP_KERNEL);
	if (err == 0) {
		err = mapping->a_ops->readpage(file, fresh);
	} else if (err == -EEXIST) {
		/*
		 * Someone raced with us and added a page at this offset
		 * first; that is not an error.
		 */
		err = 0;
	}
	/* Any other error: e.g. out of memory for radix-tree nodes. */

	page_cache_release(fresh);
	return err;
}
# define MMAP_LOTSAMISS (100)
/*
* filemap_nopage ( ) is invoked via the vma operations vector for a
* mapped memory region to read in file data during a page fault .
*
* @ area : the vma that faulted ; its vm_file supplies the mapping
* @ address : faulting address , converted to a page - cache index using
* area - > vm_start and area - > vm_pgoff
* @ type : if non - NULL , set to VM_FAULT_MINOR or VM_FAULT_MAJOR on success
*
* Returns the up - to - date page with a reference held , NULL if the offset
* lies beyond i_size ( for the owning mm ) or the read failed , or NOPAGE_OOM
* if page_cache_read ( ) reported - ENOMEM .
*
* The goto ' s are kind of ugly , but this streamlines the normal case of having
* it in the page cache , and handles the special cases reasonably without
* having a lot of duplicated code .
*/
struct page * filemap_nopage ( struct vm_area_struct * area ,
unsigned long address , int * type )
{
int error ;
struct file * file = area - > vm_file ;
struct address_space * mapping = file - > f_mapping ;
struct file_ra_state * ra = & file - > f_ra ;
struct inode * inode = mapping - > host ;
struct page * page ;
unsigned long size , pgoff ;
int did_readaround = 0 , majmin = VM_FAULT_MINOR ;
/* Page - cache index of the faulting address . */
pgoff = ( ( address - area - > vm_start ) > > PAGE_CACHE_SHIFT ) + area - > vm_pgoff ;
retry_all :
/* i_size is re - read on every retry , rounded up to whole pages . */
size = ( i_size_read ( inode ) + PAGE_CACHE_SIZE - 1 ) > > PAGE_CACHE_SHIFT ;
if ( pgoff > = size )
goto outside_data_content ;
/* If we don't want any read-ahead, don't bother */
if ( VM_RandomReadHint ( area ) )
goto no_cached_page ;
/*
* The readahead code wants to be told about each and every page
* so it can build and shrink its windows appropriately
*
* For sequential accesses , we use the generic readahead logic .
*/
if ( VM_SequentialReadHint ( area ) )
page_cache_readahead ( mapping , ra , file , pgoff , 1 ) ;
/*
* Do we have something in the page cache already ?
*/
retry_find :
page = find_get_page ( mapping , pgoff ) ;
if ( ! page ) {
unsigned long ra_pages ;
if ( VM_SequentialReadHint ( area ) ) {
handle_ra_miss ( mapping , ra , pgoff ) ;
goto no_cached_page ;
}
ra - > mmap_miss + + ;
/*
* Do we miss much more than hit in this file ? If so ,
* stop bothering with read - ahead . It will only hurt .
*/
if ( ra - > mmap_miss > ra - > mmap_hit + MMAP_LOTSAMISS )
goto no_cached_page ;
/*
* To keep the pgmajfault counter straight , we need to
* check did_readaround , as this is an inner loop .
*/
if ( ! did_readaround ) {
majmin = VM_FAULT_MAJOR ;
inc_page_state ( pgmajfault ) ;
}
did_readaround = 1 ;
ra_pages = max_sane_readahead ( file - > f_ra . ra_pages ) ;
if ( ra_pages ) {
/* Read a window of ra_pages roughly centred on pgoff , clamped at 0 . */
pgoff_t start = 0 ;
if ( pgoff > ra_pages / 2 )
start = pgoff - ra_pages / 2 ;
do_page_cache_readahead ( mapping , file , start , ra_pages ) ;
}
page = find_get_page ( mapping , pgoff ) ;
if ( ! page )
goto no_cached_page ;
}
/* Only a lookup that needed no read - around counts as a hit . */
if ( ! did_readaround )
ra - > mmap_hit + + ;
/*
* Ok , found a page in the page cache , now we need to check
* that it ' s up - to - date .
*/
if ( ! PageUptodate ( page ) )
goto page_not_uptodate ;
success :
/*
* Found the page and have a reference on it .
*/
mark_page_accessed ( page ) ;
if ( type )
* type = majmin ;
return page ;
outside_data_content :
/*
* An external ptracer can access pages that normally aren ' t
* accessible . . The owning mm itself just gets NULL here .
*/
if ( area - > vm_mm = = current - > mm )
return NULL ;
/* Fall through to the non-read-ahead case */
no_cached_page :
/*
* We ' re only likely to ever get here if MADV_RANDOM is in
* effect .
*/
error = page_cache_read ( file , pgoff ) ;
grab_swap_token ( ) ;
/*
* The page we want has now been added to the page cache .
* In the unlikely event that someone removed it in the
* meantime , we ' ll just come back here and read it again .
*/
if ( error > = 0 )
goto retry_find ;
/*
* An error return from page_cache_read can result if the
* system is low on memory , or a problem occurs while trying
* to schedule I / O .
*/
if ( error = = - ENOMEM )
return NOPAGE_OOM ;
return NULL ;
page_not_uptodate :
/* Reading the page ourselves counts as a major fault , unless already counted . */
if ( ! did_readaround ) {
majmin = VM_FAULT_MAJOR ;
inc_page_state ( pgmajfault ) ;
}
lock_page ( page ) ;
/* Did it get unhashed while we waited for it? */
if ( ! page - > mapping ) {
unlock_page ( page ) ;
page_cache_release ( page ) ;
goto retry_all ;
}
/* Did somebody else get it up-to-date? */
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto success ;
}
/* First read attempt : a zero return means the read was started . */
if ( ! mapping - > a_ops - > readpage ( file , page ) ) {
wait_on_page_locked ( page ) ;
if ( PageUptodate ( page ) )
goto success ;
}
/*
* Umm , take care of errors if the page isn ' t up - to - date .
* Try to re - read it _once_ . We do this synchronously ,
* because there really aren ' t any performance issues here
* and we need to check for errors .
*/
lock_page ( page ) ;
/* Somebody truncated the page on us? */
if ( ! page - > mapping ) {
unlock_page ( page ) ;
page_cache_release ( page ) ;
goto retry_all ;
}
/* Somebody else successfully read it in? */
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto success ;
}
ClearPageError ( page ) ;
if ( ! mapping - > a_ops - > readpage ( file , page ) ) {
wait_on_page_locked ( page ) ;
if ( PageUptodate ( page ) )
goto success ;
}
/*
* Things didn ' t work out . Return zero to tell the
* mm layer so , possibly freeing the page cache page first .
*/
page_cache_release ( page ) ;
return NULL ;
}
EXPORT_SYMBOL ( filemap_nopage ) ;
static struct page * filemap_getpage ( struct file * file , unsigned long pgoff ,
int nonblock )
{
struct address_space * mapping = file - > f_mapping ;
struct page * page ;
int error ;
/*
* Do we have something in the page cache already ?
*/
retry_find :
page = find_get_page ( mapping , pgoff ) ;
if ( ! page ) {
if ( nonblock )
return NULL ;
goto no_cached_page ;
}
/*
* Ok , found a page in the page cache , now we need to check
* that it ' s up - to - date .
*/
2005-04-17 02:24:05 +04:00
if ( ! PageUptodate ( page ) ) {
if ( nonblock ) {
page_cache_release ( page ) ;
return NULL ;
}
2005-04-17 02:20:36 +04:00
goto page_not_uptodate ;
2005-04-17 02:24:05 +04:00
}
2005-04-17 02:20:36 +04:00
success :
/*
* Found the page and have a reference on it .
*/
mark_page_accessed ( page ) ;
return page ;
no_cached_page :
error = page_cache_read ( file , pgoff ) ;
/*
* The page we want has now been added to the page cache .
* In the unlikely event that someone removed it in the
* meantime , we ' ll just come back here and read it again .
*/
if ( error > = 0 )
goto retry_find ;
/*
* An error return from page_cache_read can result if the
* system is low on memory , or a problem occurs while trying
* to schedule I / O .
*/
return NULL ;
page_not_uptodate :
lock_page ( page ) ;
/* Did it get unhashed while we waited for it? */
if ( ! page - > mapping ) {
unlock_page ( page ) ;
goto err ;
}
/* Did somebody else get it up-to-date? */
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto success ;
}
if ( ! mapping - > a_ops - > readpage ( file , page ) ) {
wait_on_page_locked ( page ) ;
if ( PageUptodate ( page ) )
goto success ;
}
/*
* Umm , take care of errors if the page isn ' t up - to - date .
* Try to re - read it _once_ . We do this synchronously ,
* because there really aren ' t any performance issues here
* and we need to check for errors .
*/
lock_page ( page ) ;
/* Somebody truncated the page on us? */
if ( ! page - > mapping ) {
unlock_page ( page ) ;
goto err ;
}
/* Somebody else successfully read it in? */
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto success ;
}
ClearPageError ( page ) ;
if ( ! mapping - > a_ops - > readpage ( file , page ) ) {
wait_on_page_locked ( page ) ;
if ( PageUptodate ( page ) )
goto success ;
}
/*
* Things didn ' t work out . Return zero to tell the
* mm layer so , possibly freeing the page cache page first .
*/
err :
page_cache_release ( page ) ;
return NULL ;
}
/*
 * Map @len bytes of @vma's file, starting at file page @pgoff, at user
 * address @addr, one page per iteration.
 *
 * Pages obtained from filemap_getpage() are installed with install_page().
 * In @nonblock mode a page that could not be obtained is represented by a
 * file pte (install_file_pte()) instead; in blocking mode a missing page
 * yields -ENOMEM.  Returns -EINVAL as soon as the remaining range extends
 * past i_size (re-read on every iteration), 0 on success, or the error
 * from the install helpers.
 */
int filemap_populate(struct vm_area_struct *vma, unsigned long addr,
		unsigned long len, pgprot_t prot, unsigned long pgoff,
		int nonblock)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct mm_struct *mm = vma->vm_mm;

	/* Blocking callers get the whole range queued for reading up front. */
	if (!nonblock)
		force_page_cache_readahead(mapping, vma->vm_file,
					pgoff, len >> PAGE_CACHE_SHIFT);

	do {
		unsigned long file_pages;
		struct page *page;
		int ret;

		file_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1)
						>> PAGE_CACHE_SHIFT;
		if (pgoff + (len >> PAGE_CACHE_SHIFT) > file_pages)
			return -EINVAL;

		page = filemap_getpage(file, pgoff, nonblock);
		if (page) {
			ret = install_page(mm, vma, addr, page, prot);
			if (ret) {
				page_cache_release(page);
				return ret;
			}
		} else {
			if (!nonblock)
				return -ENOMEM;
			ret = install_file_pte(mm, vma, addr, pgoff, prot);
			if (ret)
				return ret;
		}

		len -= PAGE_SIZE;
		addr += PAGE_SIZE;
		pgoff++;
	} while (len);

	return 0;
}
/*
 * VM operations installed by generic_file_mmap ( ) : page faults are served
 * by filemap_nopage ( ) and populate requests by filemap_populate ( ) .
 */
struct vm_operations_struct generic_file_vm_ops = {
. nopage = filemap_nopage ,
. populate = filemap_populate ,
} ;
/* This is used for a general mmap of a disk file */
int generic_file_mmap ( struct file * file , struct vm_area_struct * vma )
{
struct address_space * mapping = file - > f_mapping ;
if ( ! mapping - > a_ops - > readpage )
return - ENOEXEC ;
file_accessed ( file ) ;
vma - > vm_ops = & generic_file_vm_ops ;
return 0 ;
}
EXPORT_SYMBOL ( filemap_populate ) ;
/*
* This is for filesystems which do not implement - > writepage .
*/
int generic_file_readonly_mmap ( struct file * file , struct vm_area_struct * vma )
{
if ( ( vma - > vm_flags & VM_SHARED ) & & ( vma - > vm_flags & VM_MAYWRITE ) )
return - EINVAL ;
return generic_file_mmap ( file , vma ) ;
}
# else
/* Variant built when CONFIG_MMU is not defined : always fails with - ENOSYS . */
int generic_file_mmap ( struct file * file , struct vm_area_struct * vma )
{
return - ENOSYS ;
}
/* Variant built when CONFIG_MMU is not defined : always fails with - ENOSYS . */
int generic_file_readonly_mmap ( struct file * file , struct vm_area_struct * vma )
{
return - ENOSYS ;
}
# endif /* CONFIG_MMU */
EXPORT_SYMBOL ( generic_file_mmap ) ;
EXPORT_SYMBOL ( generic_file_readonly_mmap ) ;
/*
 * Find the page at @index in @mapping, or create it and fill it with
 * @filler(@data, page).
 *
 * Returns the page with a reference held (an already cached page is
 * returned as found, without checking whether it is up to date), or an
 * ERR_PTR(): -ENOMEM when allocation fails, the error from the page cache
 * insertion, or a negative result from @filler.
 */
static inline struct page *__read_cache_page(struct address_space *mapping,
				unsigned long index,
				int (*filler)(void *, struct page *),
				void *data)
{
	struct page *spare = NULL;	/* allocated but not yet inserted */
	struct page *page;
	int err;

	for (;;) {
		page = find_get_page(mapping, index);
		if (page)
			break;

		if (!spare) {
			spare = page_cache_alloc_cold(mapping);
			if (!spare)
				return ERR_PTR(-ENOMEM);
		}

		err = add_to_page_cache_lru(spare, mapping, index, GFP_KERNEL);
		if (err == -EEXIST)
			continue;	/* lost a race; look it up again */
		if (err < 0) {
			/* Presumably ENOMEM for radix tree node */
			page_cache_release(spare);
			return ERR_PTR(err);
		}

		page = spare;
		spare = NULL;
		err = filler(data, page);
		if (err < 0) {
			page_cache_release(page);
			page = ERR_PTR(err);
		}
		break;
	}

	/* A spare left over from a lost race is no longer needed. */
	if (spare)
		page_cache_release(spare);
	return page;
}
/*
* Read into the page cache . If a page already exists ,
* and PageUptodate ( ) is not set , try to fill the page .
*/
struct page * read_cache_page ( struct address_space * mapping ,
unsigned long index ,
int ( * filler ) ( void * , struct page * ) ,
void * data )
{
struct page * page ;
int err ;
retry :
page = __read_cache_page ( mapping , index , filler , data ) ;
if ( IS_ERR ( page ) )
goto out ;
mark_page_accessed ( page ) ;
if ( PageUptodate ( page ) )
goto out ;
lock_page ( page ) ;
if ( ! page - > mapping ) {
unlock_page ( page ) ;
page_cache_release ( page ) ;
goto retry ;
}
if ( PageUptodate ( page ) ) {
unlock_page ( page ) ;
goto out ;
}
err = filler ( data , page ) ;
if ( err < 0 ) {
page_cache_release ( page ) ;
page = ERR_PTR ( err ) ;
}
out :
return page ;
}
EXPORT_SYMBOL ( read_cache_page ) ;
/*
 * Return the locked page at @index, creating it if necessary.  This
 * function is specifically for generic_file_write().
 *
 * A newly created page comes from *@cached_page (allocated here when the
 * slot is empty).  Once it has been inserted into the page cache it gets
 * an extra reference and is added to the caller's lru-buffering pagevec
 * @lru_pvec, and *@cached_page is cleared.  Returns NULL when allocation
 * fails or when the insertion fails with anything other than -EEXIST; in
 * the latter case *@cached_page is left set.
 */
static inline struct page *
__grab_cache_page(struct address_space *mapping, unsigned long index,
			struct page **cached_page, struct pagevec *lru_pvec)
{
	struct page *page;
	int err;

	while ((page = find_lock_page(mapping, index)) == NULL) {
		if (*cached_page == NULL) {
			*cached_page = page_cache_alloc(mapping);
			if (*cached_page == NULL)
				return NULL;
		}

		err = add_to_page_cache(*cached_page, mapping,
					index, GFP_KERNEL);
		if (err == -EEXIST)
			continue;	/* someone beat us to it; look again */
		if (err != 0)
			return NULL;

		page = *cached_page;
		page_cache_get(page);
		/* Flush the pagevec to the LRU when this add leaves no room. */
		if (!pagevec_add(lru_pvec, page))
			__pagevec_lru_add(lru_pvec);
		*cached_page = NULL;
		break;
	}
	return page;
}
/*
* The logic we want is
*
* if suid or ( sgid and xgrp )
* remove privs
*/
int remove_suid ( struct dentry * dentry )
{
mode_t mode = dentry - > d_inode - > i_mode ;
int kill = 0 ;
int result = 0 ;
/* suid always must be killed */
if ( unlikely ( mode & S_ISUID ) )
kill = ATTR_KILL_SUID ;
/*
* sgid without any exec bits is just a mandatory locking mark ; leave
* it alone . If some exec bits are set , it ' s a real sgid ; kill it .
*/
if ( unlikely ( ( mode & S_ISGID ) & & ( mode & S_IXGRP ) ) )
kill | = ATTR_KILL_SGID ;
if ( unlikely ( kill & & ! capable ( CAP_FSETID ) ) ) {
struct iattr newattrs ;
newattrs . ia_valid = ATTR_FORCE | kill ;
result = notify_change ( dentry , & newattrs ) ;
}
return result ;
}
EXPORT_SYMBOL ( remove_suid ) ;
/*
 * Copy as much as we can of @bytes bytes from user buffer @buf into @page
 * at @offset, and return the number of bytes which were successfully
 * copied.
 *
 * The copy is first attempted under an atomic kmap; if that leaves any
 * bytes uncopied, the whole copy is redone under a regular kmap().
 */
static inline size_t
filemap_copy_from_user(struct page *page, unsigned long offset,
			const char __user *buf, unsigned bytes)
{
	char *map;
	int uncopied;

	map = kmap_atomic(page, KM_USER0);
	uncopied = __copy_from_user_inatomic(map + offset, buf, bytes);
	kunmap_atomic(map, KM_USER0);
	if (uncopied == 0)
		return bytes;

	/* Do it the slow way */
	map = kmap(page);
	uncopied = __copy_from_user(map + offset, buf, bytes);
	kunmap(page);

	return bytes - uncopied;
}
/*
 * Copy @bytes bytes into @vaddr from the user segments starting at @iov,
 * beginning @base bytes into the first segment.
 *
 * If a segment copy faults, the part of the destination belonging to later
 * segments is zeroed and copying stops.  Returns the number of bytes
 * copied before the fault (or @bytes when nothing faulted).
 */
static size_t
__filemap_copy_from_user_iovec(char *vaddr,
			const struct iovec *iov, size_t base, size_t bytes)
{
	size_t done = 0;
	size_t uncopied = 0;

	while (bytes) {
		char __user *src = iov->iov_base + base;
		int chunk = min(bytes, iov->iov_len - base);

		uncopied = __copy_from_user_inatomic(vaddr, src, chunk);

		/* Only the first segment starts at a non-zero offset. */
		base = 0;
		iov++;
		done += chunk;
		bytes -= chunk;
		vaddr += chunk;

		if (likely(!uncopied))
			continue;

		/* zero the rest of the target like __copy_from_user */
		if (bytes)
			memset(vaddr, 0, bytes);
		break;
	}

	return done - uncopied;
}
/*
 * Multi-segment counterpart of filemap_copy_from_user(): same side effects
 * and return value.  The difference is that on a fault the remainder of
 * the page (out to offset+bytes) has to be cleared explicitly, which
 * __filemap_copy_from_user_iovec() does, to emulate the single-segment
 * behaviour.
 *
 * The copy is tried under an atomic kmap first and redone under a regular
 * kmap() if it came up short.
 */
static inline size_t
filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
			const struct iovec *iov, size_t base, size_t bytes)
{
	char *map;
	size_t done;

	map = kmap_atomic(page, KM_USER0);
	done = __filemap_copy_from_user_iovec(map + offset, iov, base, bytes);
	kunmap_atomic(map, KM_USER0);
	if (done == bytes)
		return done;

	map = kmap(page);
	done = __filemap_copy_from_user_iovec(map + offset, iov, base, bytes);
	kunmap(page);

	return done;
}
/*
 * Advance an iovec cursor by @bytes bytes.
 *
 * @iovp:  in/out, the current segment
 * @basep: in/out, the byte offset already consumed within *@iovp
 * @bytes: how far to advance
 *
 * Whenever the offset reaches the end of a segment the cursor moves to the
 * start of the next one, so zero-length segments met along the way are
 * stepped over.  The caller must ensure that at least @bytes bytes remain
 * in the array; the cursor may end up one element past the last segment.
 *
 * The per-segment step is kept in a size_t: a narrower signed type would
 * turn negative for advances above INT_MAX and corrupt the cursor.
 */
static inline void
filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t room = iov->iov_len - base;
		size_t copy = bytes < room ? bytes : room;

		bytes -= copy;
		base += copy;
		if (iov->iov_len == base) {
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}
/*
* Performs necessary checks before doing a write
*
* Can adjust writing position or amount of bytes to write .
* Returns appropriate error code that caller should return or
* zero in case that write should be allowed .
*
* @ file : file being written
* @ pos : in / out write position ; set to i_size for O_APPEND on non - block files
* @ count : in / out byte count ; may be shortened to fit a limit
* @ isblk : non - zero when the target is a block device
*
* Errors visible here : - EINVAL for a negative position , a pending
* file - > f_error ( which is cleared ) , - EFBIG together with SIGXFSZ when a
* size limit is already reached , - EPERM for a read - only block device and
* - ENOSPC for a write at or past the end of a block device .
*/
inline int generic_write_checks ( struct file * file , loff_t * pos , size_t * count , int isblk )
{
struct inode * inode = file - > f_mapping - > host ;
unsigned long limit = current - > signal - > rlim [ RLIMIT_FSIZE ] . rlim_cur ;
if ( unlikely ( * pos < 0 ) )
return - EINVAL ;
/* Report and clear an error recorded earlier on this file . */
if ( unlikely ( file - > f_error ) ) {
int err = file - > f_error ;
file - > f_error = 0 ;
return err ;
}
if ( ! isblk ) {
/* FIXME: this is for backwards compatibility with 2.4 */
if ( file - > f_flags & O_APPEND )
* pos = i_size_read ( inode ) ;
/* RLIMIT_FSIZE : fail at or beyond the limit , otherwise shorten the write . */
if ( limit ! = RLIM_INFINITY ) {
if ( * pos > = limit ) {
send_sig ( SIGXFSZ , current , 0 ) ;
return - EFBIG ;
}
if ( * count > limit - ( typeof ( limit ) ) * pos ) {
* count = limit - ( typeof ( limit ) ) * pos ;
}
}
}
/*
* LFS rule : without O_LARGEFILE the write may not extend past MAX_NON_LFS
*/
if ( unlikely ( * pos + * count > MAX_NON_LFS & &
! ( file - > f_flags & O_LARGEFILE ) ) ) {
if ( * pos > = MAX_NON_LFS ) {
send_sig ( SIGXFSZ , current , 0 ) ;
return - EFBIG ;
}
if ( * count > MAX_NON_LFS - ( unsigned long ) * pos ) {
* count = MAX_NON_LFS - ( unsigned long ) * pos ;
}
}
/*
* Are we about to exceed the fs block limit ?
*
* If we have written data it becomes a short write . If we have
* exceeded without writing data we send a signal and return EFBIG .
* Linus ' frestrict idea will clean these up nicely . .
*/
if ( likely ( ! isblk ) ) {
if ( unlikely ( * pos > = inode - > i_sb - > s_maxbytes ) ) {
if ( * count | | * pos > inode - > i_sb - > s_maxbytes ) {
send_sig ( SIGXFSZ , current , 0 ) ;
return - EFBIG ;
}
/* zero-length writes at ->s_maxbytes are OK */
}
if ( unlikely ( * pos + * count > inode - > i_sb - > s_maxbytes ) )
* count = inode - > i_sb - > s_maxbytes - * pos ;
} else {
/* Block device : the write is bounded by the device size ( i_size ) . */
loff_t isize ;
if ( bdev_read_only ( I_BDEV ( inode ) ) )
return - EPERM ;
isize = i_size_read ( inode ) ;
if ( * pos > = isize ) {
/* A zero - length write exactly at the end is allowed . */
if ( * count | | * pos > isize )
return - ENOSPC ;
}
if ( * pos + * count > isize )
* count = isize - * pos ;
}
return 0 ;
}
EXPORT_SYMBOL ( generic_write_checks ) ;
/*
 * Perform the direct-IO part of a write.
 *
 * @iocb:    the kiocb describing the request
 * @iov:     user segments; shortened in place to @count bytes when the
 *           limit checks reduced the request (@count != @ocount)
 * @nr_segs: in/out number of segments, updated when @iov is shortened
 * @pos:     file position to write at
 * @ppos:    updated to the end of the written range on progress
 * @count:   bytes to write after limit checks
 * @ocount:  bytes originally requested
 *
 * Returns the number of bytes written, a negative error, or -EIOCBQUEUED
 * when everything was written on behalf of an asynchronous kiocb.  For a
 * synchronous file (O_SYNC or IS_SYNC inode) the metadata is synced and a
 * failure of that sync is returned instead of the byte count.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
		size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t written;

	if (count != ocount)
		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);

	written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
	if (written > 0) {
		loff_t end = pos + written;

		/* Extend i_size for regular files; block devices keep theirs. */
		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, end);
			mark_inode_dirty(inode);
		}
		*ppos = end;
	}

	/*
	 * Sync the fs metadata but not the minor inode changes and
	 * of course not the data as we did direct DMA for the IO.
	 * i_sem is held, which protects generic_osync_inode() from
	 * livelocking.
	 *
	 * Honour IS_SYNC() inodes like the other write paths do, and do
	 * not swallow a failed sync: report it to the caller.
	 */
	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);

		if (err < 0)
			written = err;
	}

	if (written == count && !is_sync_kiocb(iocb))
		written = -EIOCBQUEUED;
	return written;
}
EXPORT_SYMBOL ( generic_file_direct_write ) ;
/*
 * Write @count bytes from the user segments @iov into the page cache of
 * the file behind @iocb, starting at file position @pos.
 *
 * @nr_segs: number of segments in @iov
 * @ppos:    receives the file position reached when the loop ends
 * @written: bytes of @iov already consumed by the caller (a partial
 *           direct-IO write); copying resumes that far into @iov, and the
 *           value is included in the return count
 *
 * Each page is obtained with __grab_cache_page(), prepared with
 * ->prepare_write(), filled from user space and committed with
 * ->commit_write().  Returns the total number of bytes written if that is
 * non-zero, otherwise the last status (0 or a negative error).
 */
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
		unsigned long nr_segs, loff_t pos, loff_t *ppos,
		size_t count, ssize_t written)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct address_space_operations *a_ops = mapping->a_ops;
	struct inode *inode = mapping->host;
	long status = 0;
	struct page *page;
	struct page *cached_page = NULL;
	size_t bytes;
	struct pagevec lru_pvec;
	const struct iovec *cur_iov = iov;	/* current iovec */
	size_t iov_base = 0;		/* offset in the current iovec */
	char __user *buf;

	pagevec_init(&lru_pvec, 0);

	/*
	 * handle partial DIO write.  Adjust cur_iov if needed.
	 */
	if (likely(nr_segs == 1))
		buf = iov->iov_base + written;
	else {
		filemap_set_next_iovec(&cur_iov, &iov_base, written);
		/*
		 * The cursor may have moved past the first segment, so the
		 * user address must come from cur_iov, not from iov.
		 */
		buf = cur_iov->iov_base + iov_base;
	}

	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;

		offset = (pos & (PAGE_CACHE_SIZE - 1));	/* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		fault_in_pages_readable(buf, bytes);

		page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
		if (!page) {
			status = -ENOMEM;
			break;
		}

		status = a_ops->prepare_write(file, page, offset, offset + bytes);
		if (unlikely(status)) {
			loff_t isize = i_size_read(inode);
			/*
			 * prepare_write() may have instantiated a few blocks
			 * outside i_size.  Trim these off again.
			 */
			unlock_page(page);
			page_cache_release(page);
			if (pos + bytes > isize)
				vmtruncate(inode, isize);
			break;
		}

		if (likely(nr_segs == 1))
			copied = filemap_copy_from_user(page, offset,
							buf, bytes);
		else
			copied = filemap_copy_from_user_iovec(page, offset,
						cur_iov, iov_base, bytes);
		flush_dcache_page(page);
		status = a_ops->commit_write(file, page, offset, offset + bytes);
		if (likely(copied > 0)) {
			if (!status)
				status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
				if (unlikely(nr_segs > 1)) {
					filemap_set_next_iovec(&cur_iov,
							&iov_base, status);
					/*
					 * Crossing a segment boundary makes
					 * "buf += status" meaningless, so
					 * re-derive buf from the cursor.
					 * With nothing left to write cur_iov
					 * may be one past the array and must
					 * not be dereferenced.
					 */
					if (count)
						buf = cur_iov->iov_base +
							iov_base;
				}
			}
		}
		/* A short copy means the user buffer faulted. */
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		unlock_page(page);
		mark_page_accessed(page);
		page_cache_release(page);
		if (status < 0)
			break;
		balance_dirty_pages_ratelimited(mapping);
		cond_resched();
	} while (count);
	*ppos = pos;

	if (cached_page)
		page_cache_release(cached_page);

	/*
	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
	 */
	if (likely(status >= 0)) {
		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
			if (!a_ops->writepage || !is_sync_kiocb(iocb))
				status = generic_osync_inode(inode, mapping,
						OSYNC_METADATA | OSYNC_DATA);
		}
	}

	/*
	 * If we get here for O_DIRECT writes then we must have fallen through
	 * to buffered writes (block instantiation inside i_size).  So we sync
	 * the file data here, to try to honour O_DIRECT expectations.
	 */
	if (unlikely(file->f_flags & O_DIRECT) && written)
		status = filemap_write_and_wait(mapping);

	pagevec_lru_add(&lru_pvec);
	return written ? written : status;
}
EXPORT_SYMBOL ( generic_file_buffered_write ) ;
/*
 * Core of the generic write path.  The wrappers in this file that take
 * i_sem do so around this call.
 *
 * Validates the user segments, applies generic_write_checks(), strips
 * set-id bits, updates the inode times and then writes: O_DIRECT files go
 * through generic_file_direct_write() first, and whatever remains (or
 * everything, for other files) through generic_file_buffered_write().
 *
 * Returns the number of bytes written if non-zero, otherwise the error
 * (or 0) from the checks.  -EINVAL / -EFAULT are returned directly for a
 * bad segment length or an inaccessible first segment.
 */
ssize_t
__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	size_t ocount = 0;	/* original count */
	size_t count;		/* after file limit checks */
	unsigned long seg;
	loff_t pos;
	ssize_t written;
	ssize_t err;

	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount | iv->iov_len) < 0))
			return -EINVAL;

		if (!access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) {
			if (seg == 0)
				return -EFAULT;
			/* Write only the segments before the bad one. */
			nr_segs = seg;
			ocount -= iv->iov_len;	/* This segment is no good */
			break;
		}
	}

	count = ocount;
	pos = *ppos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;
	written = 0;

	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err || count == 0)
		goto out;

	err = remove_suid(file->f_dentry);
	if (err)
		goto out;

	inode_update_time(inode, 1);

	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
	if (unlikely(file->f_flags & O_DIRECT)) {
		written = generic_file_direct_write(iocb, iov,
				&nr_segs, pos, ppos, count, ocount);
		if (written < 0 || written == count)
			goto out;
		/*
		 * direct-io write to a hole: fall through to buffered I/O
		 * for completing the rest of the request.
		 */
		pos += written;
		count -= written;
	}

	written = generic_file_buffered_write(iocb, iov, nr_segs,
			pos, ppos, count, written);
out:
	current->backing_dev_info = NULL;
	if (written)
		return written;
	return err;
}
EXPORT_SYMBOL ( generic_file_aio_write_nolock ) ;
/*
 * __generic_file_aio_write_nolock() plus O_SYNC handling: after a
 * successful write to an O_SYNC file or IS_SYNC inode, the written range
 * is synced with sync_page_range_nolock().  Returns the byte count, or the
 * sync error if that failed.
 */
ssize_t
generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	loff_t start = *ppos;	/* position before the write advances it */
	ssize_t nwritten;
	int err;

	nwritten = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos);
	if (nwritten <= 0)
		return nwritten;
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
		return nwritten;

	err = sync_page_range_nolock(inode, mapping, start, nwritten);
	if (err < 0)
		return err;
	return nwritten;
}
/*
 * Synchronous front end to __generic_file_aio_write_nolock(): runs the
 * write on an on-stack sync kiocb and waits for it if it was queued.
 */
ssize_t
__generic_file_write_nolock(struct file *file, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb kiocb;
	ssize_t result;

	init_sync_kiocb(&kiocb, file);
	result = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
	if (result != -EIOCBQUEUED)
		return result;
	return wait_on_sync_kiocb(&kiocb);
}
/*
 * Synchronous front end to generic_file_aio_write_nolock(): runs the write
 * on an on-stack sync kiocb and waits for it if it was queued.
 */
ssize_t
generic_file_write_nolock(struct file *file, const struct iovec *iov,
				unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb kiocb;
	ssize_t result;

	init_sync_kiocb(&kiocb, file);
	result = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
	if (result != -EIOCBQUEUED)
		return result;
	return wait_on_sync_kiocb(&kiocb);
}
EXPORT_SYMBOL ( generic_file_write_nolock ) ;
/*
 * Single-buffer aio write.  @pos must equal iocb->ki_pos (BUG otherwise).
 *
 * The write itself runs under i_sem.  For an O_SYNC file or IS_SYNC inode
 * the written range is then synced with sync_page_range() after i_sem has
 * been dropped.  Returns the byte count, or the sync error if that failed.
 */
ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
			       size_t count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec local_iov = { .iov_base = (void __user *)buf,
					.iov_len = count };
	ssize_t nwritten;
	ssize_t err;

	BUG_ON(iocb->ki_pos != pos);

	down(&inode->i_sem);
	nwritten = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
						&iocb->ki_pos);
	up(&inode->i_sem);

	if (nwritten <= 0)
		return nwritten;
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
		return nwritten;

	err = sync_page_range(inode, mapping, pos, nwritten);
	if (err < 0)
		return err;
	return nwritten;
}
EXPORT_SYMBOL ( generic_file_aio_write ) ;
/*
 * Plain write(2)-style entry point: write @count bytes from @buf at *@ppos.
 *
 * The write runs under i_sem.  For an O_SYNC file or IS_SYNC inode the
 * range just written (*@ppos - ret .. *@ppos) is synced afterwards with
 * sync_page_range().  Returns the byte count, or the sync error if that
 * failed.
 */
ssize_t generic_file_write(struct file *file, const char __user *buf,
			   size_t count, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct iovec local_iov = { .iov_base = (void __user *)buf,
					.iov_len = count };
	ssize_t nwritten;
	ssize_t err;

	down(&inode->i_sem);
	nwritten = __generic_file_write_nolock(file, &local_iov, 1, ppos);
	up(&inode->i_sem);

	if (nwritten <= 0)
		return nwritten;
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
		return nwritten;

	err = sync_page_range(inode, mapping, *ppos - nwritten, nwritten);
	if (err < 0)
		return err;
	return nwritten;
}
EXPORT_SYMBOL ( generic_file_write ) ;
/*
 * Synchronous vectored read: runs __generic_file_aio_read() on an on-stack
 * sync kiocb and waits for it if it was queued.
 */
ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
			unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb kiocb;
	ssize_t result;

	init_sync_kiocb(&kiocb, filp);
	result = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos);
	if (result != -EIOCBQUEUED)
		return result;
	return wait_on_sync_kiocb(&kiocb);
}
EXPORT_SYMBOL ( generic_file_readv ) ;
/*
 * Vectored write entry point: write the @nr_segs segments of @iov at *@ppos.
 *
 * The write runs under i_sem.  For an O_SYNC file or IS_SYNC inode the
 * range just written is synced afterwards with sync_page_range().  Returns
 * the byte count, or the sync error if that failed.
 */
ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
			unsigned long nr_segs, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t nwritten;
	int err;

	down(&inode->i_sem);
	nwritten = __generic_file_write_nolock(file, iov, nr_segs, ppos);
	up(&inode->i_sem);

	if (nwritten <= 0)
		return nwritten;
	if (!(file->f_flags & O_SYNC) && !IS_SYNC(inode))
		return nwritten;

	err = sync_page_range(inode, mapping, *ppos - nwritten, nwritten);
	if (err < 0)
		return err;
	return nwritten;
}
EXPORT_SYMBOL ( generic_file_writev ) ;
/*
 * Issue a direct-IO request through ->direct_IO().
 *
 * Called under i_sem for writes to S_ISREG files.  Dirty page cache data
 * is written back and waited upon first; if that fails its error is
 * returned and no I/O is issued.  After a write, any cached pages covering
 * the written range are invalidated, and an error from that invalidation
 * replaces the I/O result.
 */
ssize_t
generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
	loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	size_t write_len = 0;
	ssize_t retval;
	pgoff_t first, last;
	int err;

	/*
	 * If it's a write, unmap all mmappings of the file up-front.  This
	 * will cause any pte dirty bits to be propagated into the pageframes
	 * for the subsequent filemap_write_and_wait().
	 */
	if (rw == WRITE) {
		write_len = iov_length(iov, nr_segs);
		if (mapping_mapped(mapping))
			unmap_mapping_range(mapping, offset, write_len, 0);
	}

	retval = filemap_write_and_wait(mapping);
	if (retval != 0)
		return retval;

	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);

	/* Only writes against a mapping with cached pages need a shootdown. */
	if (rw != WRITE || !mapping->nrpages)
		return retval;

	last = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
	first = offset >> PAGE_CACHE_SHIFT;
	err = invalidate_inode_pages2_range(mapping, first, last);
	if (err)
		retval = err;
	return retval;
}
EXPORT_SYMBOL_GPL ( generic_file_direct_IO ) ;