2016-06-21 09:23:11 +10:00
/*
* Copyright ( C ) 2010 Red Hat , Inc .
* Copyright ( c ) 2016 Christoph Hellwig .
*
* This program is free software ; you can redistribute it and / or modify it
* under the terms and conditions of the GNU General Public License ,
* version 2 , as published by the Free Software Foundation .
*
* This program is distributed in the hope it will be useful , but WITHOUT
* ANY WARRANTY ; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE . See the GNU General Public License for
* more details .
*/
# include <linux/module.h>
# include <linux/compiler.h>
# include <linux/fs.h>
# include <linux/iomap.h>
# include <linux/uaccess.h>
# include <linux/gfp.h>
# include <linux/mm.h>
# include <linux/swap.h>
# include <linux/pagemap.h>
# include <linux/file.h>
# include <linux/uio.h>
# include <linux/backing-dev.h>
# include <linux/buffer_head.h>
2016-11-30 14:36:01 +11:00
# include <linux/task_io_accounting_ops.h>
2016-06-21 09:31:39 +10:00
# include <linux/dax.h>
2016-06-21 09:23:11 +10:00
# include "internal.h"
/*
* Execute a iomap write on a segment of the mapping that spans a
* contiguous range of pages that have identical block mapping state .
*
* This avoids the need to map pages individually , do individual allocations
* for each page and most importantly avoid the need for filesystem specific
* locking per page . Instead , all the operations are amortised over the entire
* range of pages . It is assumed that the filesystems will lock whatever
* resources they require in the iomap_begin call , and release them in the
* iomap_end call .
*/
2016-09-19 11:24:49 +10:00
loff_t
2016-06-21 09:23:11 +10:00
iomap_apply ( struct inode * inode , loff_t pos , loff_t length , unsigned flags ,
struct iomap_ops * ops , void * data , iomap_actor_t actor )
{
struct iomap iomap = { 0 } ;
loff_t written = 0 , ret ;
/*
* Need to map a range from start position for length bytes . This can
* span multiple pages - it is only guaranteed to return a range of a
* single type of pages ( e . g . all into a hole , all mapped or all
* unwritten ) . Failure at this point has nothing to undo .
*
* If allocation is required for this range , reserve the space now so
* that the allocation is guaranteed to succeed later on . Once we copy
* the data into the page cache pages , then we cannot fail otherwise we
* expose transient stale data . If the reserve fails , we can safely
* back out at this point as there is nothing to undo .
*/
ret = ops - > iomap_begin ( inode , pos , length , flags , & iomap ) ;
if ( ret )
return ret ;
if ( WARN_ON ( iomap . offset > pos ) )
return - EIO ;
/*
* Cut down the length to the one actually provided by the filesystem ,
* as it might not be able to give us the whole size that we requested .
*/
if ( iomap . offset + iomap . length < pos + length )
length = iomap . offset + iomap . length - pos ;
/*
* Now that we have guaranteed that the space allocation will succeed .
* we can do the copy - in page by page without having to worry about
* failures exposing transient data .
*/
written = actor ( inode , pos , length , data , & iomap ) ;
/*
* Now the data has been copied , commit the range we ' ve copied . This
* should not fail unless the filesystem has had a fatal error .
*/
2016-08-17 08:42:34 +10:00
if ( ops - > iomap_end ) {
ret = ops - > iomap_end ( inode , pos , length ,
written > 0 ? written : 0 ,
flags , & iomap ) ;
}
2016-06-21 09:23:11 +10:00
return written ? written : ret ;
}
/*
 * Undo the page cache side of a failed or short write of @len bytes at @pos.
 *
 * Only page cache beyond the current inode size is dropped, even if the
 * write itself started inside the existing inode size.
 */
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t isize = i_size_read(inode);
	loff_t write_end = pos + len;

	/* Entirely within EOF: nothing newly instantiated to throw away. */
	if (write_end <= isize)
		return;

	truncate_pagecache_range(inode, max(pos, isize), write_end);
}
/*
 * Grab and prepare the page cache page backing [@pos, @pos + @len) for a
 * write described by @iomap.
 *
 * On success returns 0 with the page stored in *@pagep.  If the page cannot
 * be obtained, -ENOMEM is returned and *@pagep is left untouched.  If block
 * preparation fails, the page is unlocked and released, the failed write is
 * cleaned up, *@pagep is set to NULL and the error is returned.
 *
 * The range must lie within @iomap; anything else is a BUG.
 */
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap)
{
	struct page *pg;
	int err;

	BUG_ON(pos + len > iomap->offset + iomap->length);

	pg = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
					 flags);
	if (!pg)
		return -ENOMEM;

	err = __block_write_begin_int(pg, pos, len, NULL, iomap);
	if (likely(!err)) {
		*pagep = pg;
		return 0;
	}

	unlock_page(pg);
	put_page(pg);
	iomap_write_failed(inode, pos, len);

	*pagep = NULL;
	return err;
}
/*
 * Finish a page cache write started with iomap_write_begin().
 *
 * Passes the range to generic_write_end() and, when that reports fewer than
 * @len bytes, cleans up after the partial write.  Returns whatever
 * generic_write_end() returned.
 */
static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	int done = generic_write_end(NULL, inode->i_mapping, pos, len,
				     copied, page, NULL);

	if (done < len)
		iomap_write_failed(inode, pos, len);

	return done;
}
/*
 * iomap_apply() actor for buffered writes: copy data from the iov_iter in
 * @data into the page cache, one page at a time, for at most @length bytes
 * starting at @pos.
 *
 * Returns the number of bytes written if any were, otherwise the last
 * status (0 or a negative errno).
 */
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *from = data;
	unsigned int aop_flags = AOP_FLAG_NOFS;
	ssize_t total = 0;
	long rc = 0;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(from))
		aop_flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long in_page;	/* Offset into pagecache page */
		unsigned long chunk;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		in_page = pos & (PAGE_SIZE - 1);
		chunk = min_t(unsigned long, PAGE_SIZE - in_page,
			      iov_iter_count(from));
retry:
		/* Never go past the extent we were handed. */
		if (chunk > length)
			chunk = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(from, chunk))) {
			rc = -EFAULT;
			break;
		}

		rc = iomap_write_begin(inode, pos, chunk, aop_flags, &page,
				       iomap);
		if (unlikely(rc))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, from, in_page,
							chunk);

		flush_dcache_page(page);

		rc = iomap_write_end(inode, pos, chunk, copied, page);
		if (unlikely(rc < 0))
			break;
		copied = rc;

		cond_resched();

		iov_iter_advance(from, copied);
		if (unlikely(!copied)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			chunk = min_t(unsigned long, PAGE_SIZE - in_page,
				      iov_iter_single_seg_count(from));
			goto retry;
		}

		pos += copied;
		total += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(from) && length);

	return total ? total : rc;
}
ssize_t
iomap_file_buffered_write ( struct kiocb * iocb , struct iov_iter * iter ,
struct iomap_ops * ops )
{
struct inode * inode = iocb - > ki_filp - > f_mapping - > host ;
loff_t pos = iocb - > ki_pos , ret = 0 , written = 0 ;
while ( iov_iter_count ( iter ) ) {
ret = iomap_apply ( inode , pos , iov_iter_count ( iter ) ,
IOMAP_WRITE , ops , iter , iomap_write_actor ) ;
if ( ret < = 0 )
break ;
pos + = ret ;
written + = ret ;
}
return written ? written : ret ;
}
EXPORT_SYMBOL_GPL ( iomap_file_buffered_write ) ;
2016-09-19 10:12:45 +10:00
/*
 * Read the page cache page containing byte @offset of @inode.
 *
 * Returns the page with a reference held, the ERR_PTR from
 * read_mapping_page() on failure, or ERR_PTR(-EIO) if the page came back
 * not up to date (in which case the reference is dropped again).
 */
static struct page *
__iomap_read_page(struct inode *inode, loff_t offset)
{
	struct page *pg;

	pg = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
	if (!IS_ERR(pg) && !PageUptodate(pg)) {
		put_page(pg);
		pg = ERR_PTR(-EIO);
	}

	return pg;
}
/*
 * iomap_apply() actor that re-dirties existing file data: each page in
 * [@pos, @pos + @length) is read in, then run through the write begin/end
 * path with its full length reported as copied.
 *
 * Returns the number of bytes processed, or a negative errno.  A write_end
 * that reports zero bytes is unexpected and turned into -EIO.
 */
static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page, *rpage;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */

		offset = (pos & (PAGE_SIZE - 1));
		/*
		 * Compare as loff_t: narrowing @length to unsigned long first
		 * would truncate it where that type is only 32 bits wide.  The
		 * result never exceeds PAGE_SIZE, so it fits in @bytes.
		 */
		bytes = min_t(loff_t, PAGE_SIZE - offset, length);

		rpage = __iomap_read_page(inode, pos);
		if (IS_ERR(rpage))
			return PTR_ERR(rpage);

		status = iomap_write_begin(inode, pos, bytes,
				AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
				&page, iomap);
		/* The read reference is only needed until write_begin is done. */
		put_page(rpage);
		if (unlikely(status))
			return status;

		WARN_ON_ONCE(!PageUptodate(page));

		status = iomap_write_end(inode, pos, bytes, bytes, page);
		if (unlikely(status <= 0)) {
			if (WARN_ON_ONCE(status == 0))
				return -EIO;
			return status;
		}

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (length);

	return written;
}
/*
 * Re-dirty @len bytes of @inode starting at @pos by applying
 * iomap_dirty_actor() extent by extent.
 *
 * Returns 0 once the whole range has been processed, or the first
 * non-positive result from iomap_apply().
 */
int
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
		struct iomap_ops *ops)
{
	loff_t done;

	for (; len; pos += done, len -= done) {
		done = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
				   iomap_dirty_actor);
		if (done <= 0)
			return done;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_dirty);
2016-06-21 09:23:11 +10:00
/*
 * Zero @bytes bytes at @offset within the page cache page that backs file
 * position @pos, going through the regular write begin/end path.
 *
 * Returns the result of iomap_write_end(), or the error from
 * iomap_write_begin().
 */
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap)
{
	struct page *pg;
	int err;

	err = iomap_write_begin(inode, pos, bytes,
				AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS,
				&pg, iomap);
	if (err)
		return err;

	zero_user(pg, offset, bytes);
	mark_page_accessed(pg);

	return iomap_write_end(inode, pos, bytes, bytes, pg);
}
2016-06-21 09:31:39 +10:00
/*
 * DAX flavour of iomap_zero(): zero @bytes bytes at @offset within the page
 * containing @pos directly on the block device of @iomap.
 *
 * The target sector is that of the page-aligned position, derived from the
 * mapping's starting block number and file offset (512-byte units).
 */
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	sector_t sector;

	sector = iomap->blkno +
		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
}
2016-06-21 09:23:11 +10:00
/*
 * iomap_apply() actor that zeroes [@pos, @pos + @count) page by page.
 *
 * @data is an optional bool that is set to true once something has actually
 * been zeroed.  Holes and unwritten extents already read back as zeroes, so
 * they are reported as fully handled without touching anything.
 *
 * Returns the number of bytes covered, or a negative errno.
 */
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed?  we're done. */
	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = pos & (PAGE_SIZE - 1); /* Within page */
		/*
		 * Compare as loff_t: narrowing @count to unsigned first would
		 * truncate it, and a count that is a multiple of 4GiB would
		 * then give bytes == 0 and a loop that never ends.  The result
		 * never exceeds PAGE_SIZE, so it fits in @bytes.
		 */
		bytes = min_t(loff_t, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = iomap_dax_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}
/*
 * Zero @len bytes of @inode starting at @pos by applying
 * iomap_zero_range_actor() extent by extent.  @did_zero is passed through
 * to the actor.
 *
 * Returns 0 once the whole range has been covered, or the first
 * non-positive result from iomap_apply().
 */
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		struct iomap_ops *ops)
{
	loff_t done;

	for (; len > 0; pos += done, len -= done) {
		done = iomap_apply(inode, pos, len, IOMAP_ZERO,
				   ops, did_zero, iomap_zero_range_actor);
		if (done <= 0)
			return done;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);
/*
 * Zero from @pos to the end of the filesystem block containing it.
 *
 * The block size is taken from inode->i_blkbits.  Returns 0 without doing
 * anything when @pos already sits on a block boundary, otherwise the result
 * of iomap_zero_range().
 */
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		struct iomap_ops *ops)
{
	unsigned bsize = (1 << inode->i_blkbits);
	unsigned in_block = pos & (bsize - 1);

	/* Block boundary? Nothing to do */
	return in_block ?
		iomap_zero_range(inode, pos, bsize - in_block, did_zero, ops) :
		0;
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
/*
 * iomap_apply() actor for page_mkwrite: prepare the blocks backing
 * [@pos, @pos + @length) of the page passed in @data and commit them.
 *
 * Returns @length on success or the error from __block_write_begin_int().
 */
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct page *page = data;
	int err = __block_write_begin_int(page, pos, length, NULL, iomap);

	if (err)
		return err;

	block_commit_write(page, 0, length);
	return length;
}
int iomap_page_mkwrite ( struct vm_area_struct * vma , struct vm_fault * vmf ,
struct iomap_ops * ops )
{
struct page * page = vmf - > page ;
struct inode * inode = file_inode ( vma - > vm_file ) ;
unsigned long length ;
loff_t offset , size ;
ssize_t ret ;
lock_page ( page ) ;
size = i_size_read ( inode ) ;
if ( ( page - > mapping ! = inode - > i_mapping ) | |
( page_offset ( page ) > size ) ) {
/* We overload EFAULT to mean page got truncated */
ret = - EFAULT ;
goto out_unlock ;
}
/* page is wholly or partially inside EOF */
if ( ( ( page - > index + 1 ) < < PAGE_SHIFT ) > size )
length = size & ~ PAGE_MASK ;
else
length = PAGE_SIZE ;
offset = page_offset ( page ) ;
while ( length > 0 ) {
2016-11-10 10:26:50 +11:00
ret = iomap_apply ( inode , offset , length ,
IOMAP_WRITE | IOMAP_FAULT , ops , page ,
iomap_page_mkwrite_actor ) ;
2016-06-21 09:23:11 +10:00
if ( unlikely ( ret < = 0 ) )
goto out_unlock ;
offset + = ret ;
length - = ret ;
}
set_page_dirty ( page ) ;
wait_for_stable_page ( page ) ;
return 0 ;
out_unlock :
unlock_page ( page ) ;
return ret ;
}
EXPORT_SYMBOL_GPL ( iomap_page_mkwrite ) ;
2016-06-21 09:38:45 +10:00
/*
 * State carried across iomap_apply() calls while servicing a FIEMAP request.
 */
struct fiemap_ctx {
/* extent info structure that reported extents are filled into */
struct fiemap_extent_info * fi ;
/*
 * Most recent non-hole mapping seen.  It is reported one step late (when
 * the next mapping arrives, or at the end with FIEMAP_EXTENT_LAST); a type
 * of IOMAP_HOLE means there is nothing pending.
 */
struct iomap prev ;
} ;
/*
 * Report one iomap as a fiemap extent.
 *
 * Holes are skipped (returns 0).  Otherwise the iomap type and flags are
 * translated into FIEMAP_EXTENT_* bits on top of the caller's @flags and
 * the extent is added with fiemap_fill_next_extent(), whose result is
 * returned.  The physical address is the block number in 512-byte units
 * converted to bytes, or 0 when the mapping has no block.
 */
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
		struct iomap *iomap, u32 flags)
{
	u64 phys = 0;

	/* skip holes */
	if (iomap->type == IOMAP_HOLE)
		return 0;

	if (iomap->type == IOMAP_DELALLOC)
		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
	else if (iomap->type == IOMAP_UNWRITTEN)
		flags |= FIEMAP_EXTENT_UNWRITTEN;

	if (iomap->flags & IOMAP_F_MERGED)
		flags |= FIEMAP_EXTENT_MERGED;
	if (iomap->flags & IOMAP_F_SHARED)
		flags |= FIEMAP_EXTENT_SHARED;

	if (iomap->blkno != IOMAP_NULL_BLOCK)
		phys = iomap->blkno << 9;

	return fiemap_fill_next_extent(fi, iomap->offset, phys,
				       iomap->length, flags);
}
/*
 * iomap_apply() actor for FIEMAP.
 *
 * Holes are skipped.  For anything else the previously remembered mapping
 * is reported and the current one takes its place in the context (@data is
 * a struct fiemap_ctx).  Returns @length to continue, 0 when the extent
 * array is full, or a negative errno.
 */
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct fiemap_ctx *ctx = data;
	loff_t rc;

	if (iomap->type == IOMAP_HOLE)
		return length;

	rc = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	ctx->prev = *iomap;

	if (rc == 0)		/* success */
		return length;
	if (rc == 1)		/* extent array full */
		return 0;
	return rc;
}
/*
 * FIEMAP implementation on top of iomap: report the extents of @inode in
 * [@start, @start + @len) into @fi.
 *
 * Only FIEMAP_FLAG_SYNC is accepted; when set, dirty page cache is written
 * back first.  Returns 0 on success or a negative errno.
 */
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		loff_t start, loff_t len, struct iomap_ops *ops)
{
	struct fiemap_ctx ctx;
	loff_t ret;

	memset(&ctx, 0, sizeof(ctx));
	ctx.fi = fi;
	ctx.prev.type = IOMAP_HOLE;	/* nothing pending yet */

	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = filemap_write_and_wait(inode->i_mapping);
		if (ret)
			return ret;
	}

	for (; len > 0; start += ret, len -= ret) {
		ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
				  iomap_fiemap_actor);
		/* inode with no (attribute) mapping will give ENOENT */
		if (ret == -ENOENT || ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	/* Flush the extent still held back, marking it as the last one. */
	if (ctx.prev.type != IOMAP_HOLE) {
		ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
		if (ret < 0)
			return ret;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
2016-11-30 14:36:01 +11:00
/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 *
 * The constants are unsigned (matching the unsigned flags word they are
 * stored in) so that bit 31 is not produced by shifting a 1 into the sign
 * bit of an int.
 */
#define IOMAP_DIO_WRITE		(1U << 30)
#define IOMAP_DIO_DIRTY		(1U << 31)
/*
 * Per-request state for one iomap direct I/O, allocated in iomap_dio_rw()
 * and freed in iomap_dio_complete().
 */
struct iomap_dio {
/* the kiocb this I/O was issued for */
struct kiocb * iocb ;
/* optional completion callback, invoked from iomap_dio_complete() */
iomap_dio_end_io_t * end_io ;
/* inode size sampled at submission; used to detect short reads */
loff_t i_size ;
/* bytes accounted so far (submitted in bios or zero-filled for reads) */
loff_t size ;
/* starts at 1 for the submitter, plus one per bio in flight */
atomic_t ref ;
/* IOMAP_DIO_* flags */
unsigned flags ;
/* first error recorded via iomap_dio_set_error(), 0 if none */
int error ;
union {
/* used during submission and for synchronous completion: */
struct {
/* caller's iov_iter, advanced as extents are submitted */
struct iov_iter * iter ;
/* task to wake for a synchronous kiocb; cleared by the final bio completion */
struct task_struct * waiter ;
/* queue and cookie of the most recently submitted bio, for polling */
struct request_queue * last_queue ;
blk_qc_t cookie ;
} submit ;
/* used for aio completion: */
struct {
/* work item that runs iomap_dio_complete_work() */
struct work_struct work ;
} aio ;
} ;
} ;
static ssize_t iomap_dio_complete ( struct iomap_dio * dio )
{
struct kiocb * iocb = dio - > iocb ;
ssize_t ret ;
if ( dio - > end_io ) {
ret = dio - > end_io ( iocb ,
dio - > error ? dio - > error : dio - > size ,
dio - > flags ) ;
} else {
ret = dio - > error ;
}
if ( likely ( ! ret ) ) {
ret = dio - > size ;
/* check for short read */
if ( iocb - > ki_pos + ret > dio - > i_size & &
! ( dio - > flags & IOMAP_DIO_WRITE ) )
ret = dio - > i_size - iocb - > ki_pos ;
iocb - > ki_pos + = ret ;
}
inode_dio_end ( file_inode ( iocb - > ki_filp ) ) ;
kfree ( dio ) ;
return ret ;
}
static void iomap_dio_complete_work ( struct work_struct * work )
{
struct iomap_dio * dio = container_of ( work , struct iomap_dio , aio . work ) ;
struct kiocb * iocb = dio - > iocb ;
bool is_write = ( dio - > flags & IOMAP_DIO_WRITE ) ;
ssize_t ret ;
ret = iomap_dio_complete ( dio ) ;
if ( is_write & & ret > 0 )
ret = generic_write_sync ( iocb , ret ) ;
iocb - > ki_complete ( iocb , ret , 0 ) ;
}
/*
* Set an error in the dio if none is set yet . We have to use cmpxchg
* as the submission context and the completion context ( s ) can race to
* update the error .
*/
static inline void iomap_dio_set_error ( struct iomap_dio * dio , int ret )
{
cmpxchg ( & dio - > error , 0 , ret ) ;
}
static void iomap_dio_bio_end_io ( struct bio * bio )
{
struct iomap_dio * dio = bio - > bi_private ;
bool should_dirty = ( dio - > flags & IOMAP_DIO_DIRTY ) ;
if ( bio - > bi_error )
iomap_dio_set_error ( dio , bio - > bi_error ) ;
if ( atomic_dec_and_test ( & dio - > ref ) ) {
if ( is_sync_kiocb ( dio - > iocb ) ) {
struct task_struct * waiter = dio - > submit . waiter ;
WRITE_ONCE ( dio - > submit . waiter , NULL ) ;
wake_up_process ( waiter ) ;
} else if ( dio - > flags & IOMAP_DIO_WRITE ) {
struct inode * inode = file_inode ( dio - > iocb - > ki_filp ) ;
INIT_WORK ( & dio - > aio . work , iomap_dio_complete_work ) ;
queue_work ( inode - > i_sb - > s_dio_done_wq , & dio - > aio . work ) ;
} else {
iomap_dio_complete_work ( & dio - > aio . work ) ;
}
}
if ( should_dirty ) {
bio_check_pages_dirty ( bio ) ;
} else {
struct bio_vec * bvec ;
int i ;
bio_for_each_segment_all ( bvec , bio , i )
put_page ( bvec - > bv_page ) ;
bio_put ( bio ) ;
}
}
/*
 * Submit a single-page write of zeroes covering @len bytes at file position
 * @pos, using the shared zero page.  The bio takes its own reference on
 * @dio and completes through iomap_dio_bio_end_io().
 *
 * Returns the cookie from submit_bio().
 */
static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *zero = ZERO_PAGE(0);
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio->bi_bdev = iomap->bdev;
	bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	/* The completion handler drops a reference on every page in the bio. */
	get_page(zero);
	if (bio_add_page(bio, zero, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}
/*
 * iomap_apply() actor for direct I/O: build and submit bios for the part of
 * the request that falls into @iomap.  @data is the struct iomap_dio.
 *
 * Reads from holes and unwritten extents are satisfied by zero-filling the
 * iter.  Writes into unwritten or newly allocated blocks additionally zero
 * the parts of the first and last filesystem block that the write does not
 * cover.
 *
 * Returns @length on success, 0 if an error has already been recorded in
 * the dio, or a negative errno (-EINVAL for misaligned I/O, -EIO for
 * unexpected mapping types).
 */
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned fs_block_size = (1 << inode->i_blkbits);
	unsigned align = iov_iter_alignment(dio->submit.iter);
	/* The WRITE bit is never changed in here, so sample it once. */
	const bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	bool zero_edges = false;
	struct iov_iter iter;
	unsigned pad;
	int nr_pages;

	/* Position, length and buffers must be logical-block aligned. */
	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(is_write))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!is_write) {
			iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		zero_edges = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			zero_edges = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (zero_edges) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		struct bio *bio;
		unsigned int bio_bytes;
		int err;

		if (dio->error)
			return 0;

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio->bi_bdev = iomap->bdev;
		bio->bi_iter.bi_sector =
			iomap->blkno + ((pos - iomap->offset) >> 9);
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		err = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(err)) {
			bio_put(bio);
			return err;
		}
		bio_bytes = bio->bi_iter.bi_size;

		if (is_write) {
			bio_set_op_attrs(bio, REQ_OP_WRITE,
					 REQ_SYNC | REQ_IDLE);
			task_io_account_write(bio_bytes);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += bio_bytes;
		pos += bio_bytes;

		/* Pages still left in this extent decide whether we loop. */
		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (zero_edges) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}

	iov_iter_advance(dio->submit.iter, length);
	return length;
}
/*
 * Direct I/O read or write of @iter at iocb->ki_pos, driven by iomap.
 *
 * The caller must hold inode->i_rwsem.  @end_io, if not NULL, is invoked at
 * completion (see iomap_dio_complete()).
 *
 * Returns 0 for an empty request or a read starting at/after EOF, -ENOMEM
 * if the request state cannot be allocated, -EIOCBQUEUED for an async kiocb
 * whose bios are still in flight, and otherwise the result of
 * iomap_dio_complete().  Errors from flushing the page cache or from
 * setting up the completion workqueue are returned before any I/O is
 * issued.
 */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
		iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	/* The submitter's own reference; dropped after submission below. */
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		/* Nothing to read at or beyond EOF; ret is still 0. */
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out_free_dio;

		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * Async writes are completed from the superblock's dio workqueue.
	 * Make sure it exists before the first bio is submitted: a bio can
	 * complete at any point after submission, and its end_io handler
	 * queues work on s_dio_done_wq unconditionally.
	 */
	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
	    !inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			goto out_free_dio;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/* Drop the submitter's reference; wait if bios are still in flight. */
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			/* Cleared by the completion of the last bio. */
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_mq_poll(dio->submit.last_queue,
					 dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);