/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */

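/*
 * Illustrative sketch (not part of this file): a minimal user-space use of
 * the splice() system call, draining a file into a pipe and then from the
 * pipe to stdout. The file name and chunk size are made-up example values,
 * error handling is minimal, and a C library exposing the splice() wrapper
 * is assumed.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int pfd[2];
 *		int fd = open("input.txt", O_RDONLY);
 *		ssize_t n;
 *
 *		if (fd < 0 || pipe(pfd))
 *			return 1;
 *
 *		while ((n = splice(fd, NULL, pfd[1], NULL, 65536,
 *				   SPLICE_F_MOVE)) > 0) {
 *			while (n > 0) {
 *				ssize_t out = splice(pfd[0], NULL,
 *						     STDOUT_FILENO, NULL,
 *						     n, SPLICE_F_MOVE);
 *				if (out <= 0)
 *					return 1;
 *				n -= out;
 *			}
 *		}
 *		return 0;
 *	}
 */
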
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;	/* ops associated with output pipe */
};

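/*
 * A caller typically builds a splice_pipe_desc on the stack and hands it to
 * splice_to_pipe(), as __generic_file_splice_read() below does. A stripped
 * down sketch (illustrative only, mirroring that caller):
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.flags		= flags,
 *		.ops		= &page_cache_pipe_buf_ops,
 *	};
 *
 * pages[]/partial[] describe each chunk, and spd.nr_pages is filled in once
 * the pages have been looked up.
 */
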
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}

	while (page_nr < spd->nr_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Initiate read-ahead on this page range. However, don't call into
	 * read-ahead if this is a non-zero offset (we are likely doing small
	 * chunk splice and the page is already there) for a single page.
	 */
	if (!loff || nr_pages > 1)
		page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK)
				break;

			lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * if this is the last page, see if we need to shrink
			 * the length and stop
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * force quit after adding this page
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;

	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

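/*
 * Illustrative sketch (not part of this file): a filesystem that wants
 * splice support on page-cache backed files typically just points its
 * file_operations at these generic helpers. The struct name below is a
 * made-up example and the usual read/write/mmap entries are omitted:
 *
 *	const struct file_operations example_file_operations = {
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 *
 * generic_file_splice_write() is defined further down in this file.
 */
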
/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (!ret) {
		/*
		 * Return the number of bytes written and mark page as
		 * accessed, we are now done!
		 */
		ret = this_len;
		mark_page_accessed(page);
		balance_dirty_pages_ratelimited(mapping);
	} else if (ret == AOP_TRUNCATED_PAGE) {
		page_cache_release(page);
		goto find_page;
	}
out:
	page_cache_release(page);
	unlock_page(page);
out_ret:
	return ret;
}

/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
* Attempt to initiate a splice from a file to a pipe .
*/
2006-04-11 16:57:50 +04:00
static long do_splice_to ( struct file * in , loff_t * ppos ,
struct pipe_inode_info * pipe , size_t len ,
unsigned int flags )
2006-03-30 17:15:30 +04:00
{
2006-04-11 16:57:50 +04:00
loff_t isize , left ;
2006-03-30 17:15:30 +04:00
int ret ;
2006-04-11 15:56:09 +04:00
if ( unlikely ( ! in - > f_op | | ! in - > f_op - > splice_read ) )
2006-03-30 17:15:30 +04:00
return - EINVAL ;
2006-04-11 15:56:09 +04:00
if ( unlikely ( ! ( in - > f_mode & FMODE_READ ) ) )
2006-03-30 17:15:30 +04:00
return - EBADF ;
2006-04-11 16:57:50 +04:00
ret = rw_verify_area ( READ , in , ppos , len ) ;
2006-03-30 17:15:30 +04:00
if ( unlikely ( ret < 0 ) )
return ret ;
isize = i_size_read ( in - > f_mapping - > host ) ;
2006-04-11 16:57:50 +04:00
if ( unlikely ( * ppos > = isize ) )
2006-03-30 17:15:30 +04:00
return 0 ;
2006-04-11 16:57:50 +04:00
left = isize - * ppos ;
2006-04-11 15:56:09 +04:00
if ( unlikely ( left < len ) )
2006-03-30 17:15:30 +04:00
len = left ;
2006-04-11 16:57:50 +04:00
return in - > f_op - > splice_read ( in , ppos , pipe , len , flags ) ;
2006-03-30 17:15:30 +04:00
}
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

EXPORT_SYMBOL(do_splice_direct);

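/*
 * Illustrative sketch (not part of this file): how an in-kernel caller
 * might use do_splice_direct() to push a byte range from one already
 * opened struct file to another via the per-task internal pipe. The
 * helper name is hypothetical; a real caller must hold references to
 * both files, and note that this version of do_splice_direct() always
 * starts writing the output file at offset 0.
 *
 *	static long example_copy_range(struct file *in, struct file *out,
 *				       loff_t pos, size_t len)
 *	{
 *		loff_t off = pos;
 *
 *		return do_splice_direct(in, &off, out, len, 0);
 *	}
 */
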
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill the pages into a
 * pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

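/*
 * Illustrative sketch (not part of this file): minimal user-space use of
 * vmsplice(), feeding a buffer of user memory into a pipe and then
 * splicing the pipe contents to stdout. Assumes a C library that exposes
 * the vmsplice()/splice() wrappers; error handling is minimal.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/uio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		static char buf[] = "hello from vmsplice\n";
 *		struct iovec iov = {
 *			.iov_base = buf,
 *			.iov_len = strlen(buf),
 *		};
 *		int pfd[2];
 *
 *		if (pipe(pfd))
 *			return 1;
 *		if (vmsplice(pfd[1], &iov, 1, 0) < 0)
 *			return 1;
 *		splice(pfd[0], NULL, STDOUT_FILENO, NULL, iov.iov_len, 0);
 *		return 0;
 *	}
 */
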
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyways, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}

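/*
 * Illustrative sketch (not part of this file): minimal user-space use of
 * tee(), duplicating whatever arrives on stdin to stdout while also
 * splicing the same data into a file. Assumes stdin and stdout are pipes
 * (e.g. "producer | program | consumer"), a C library exposing the
 * tee()/splice() wrappers, and a made-up output file name; error handling
 * is minimal.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		int fd = open("tee.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);
 *
 *		if (fd < 0)
 *			return 1;
 *
 *		for (;;) {
 *			ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, 65536, 0);
 *
 *			if (n < 0)
 *				return 1;
 *			if (n == 0)
 *				break;
 *			while (n > 0) {
 *				ssize_t out = splice(STDIN_FILENO, NULL, fd,
 *						     NULL, n, SPLICE_F_MOVE);
 *				if (out <= 0)
 *					return 1;
 *				n -= out;
 *			}
 *		}
 *		return 0;
 *	}
 */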