2005-04-17 02:20:36 +04:00
/*
* linux / fs / read_write . c
*
* Copyright ( C ) 1991 , 1992 Linus Torvalds
*/
# include <linux/slab.h>
# include <linux/stat.h>
# include <linux/fcntl.h>
# include <linux/file.h>
# include <linux/uio.h>
# include <linux/smp_lock.h>
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 01:06:03 +04:00
# include <linux/fsnotify.h>
2005-04-17 02:20:36 +04:00
# include <linux/security.h>
# include <linux/module.h>
# include <linux/syscalls.h>
2006-01-05 03:20:40 +03:00
# include <linux/pagemap.h>
2007-06-04 11:59:47 +04:00
# include <linux/splice.h>
2006-10-01 10:28:47 +04:00
# include "read_write.h"
2005-04-17 02:20:36 +04:00
# include <asm/uaccess.h>
# include <asm/unistd.h>
2006-03-28 13:56:42 +04:00
const struct file_operations generic_ro_fops = {
2005-04-17 02:20:36 +04:00
. llseek = generic_file_llseek ,
2006-10-01 10:28:48 +04:00
. read = do_sync_read ,
. aio_read = generic_file_aio_read ,
2005-04-17 02:20:36 +04:00
. mmap = generic_file_readonly_mmap ,
2007-06-01 16:52:37 +04:00
. splice_read = generic_file_splice_read ,
2005-04-17 02:20:36 +04:00
} ;
EXPORT_SYMBOL ( generic_ro_fops ) ;
2010-10-02 01:20:22 +04:00
/*
 * Decide whether a negative file position (or a position whose end,
 * pos + count, went negative) is acceptable for @file.
 *
 * Returns -EOVERFLOW when @pos is negative and adding @count wraps,
 * 0 when the file is flagged FMODE_UNSIGNED_OFFSET, and -EINVAL otherwise.
 * An oversized @count is caught separately in rw_verify_area().
 */
static int
__negative_fpos_check(struct file *file, loff_t pos, size_t count)
{
	if (pos < 0 && pos + count < pos)
		return -EOVERFLOW;

	return (file->f_mode & FMODE_UNSIGNED_OFFSET) ? 0 : -EINVAL;
}
2008-08-11 17:37:17 +04:00
/**
* generic_file_llseek_unlocked - lockless generic llseek implementation
* @ file : file structure to seek on
* @ offset : file offset to seek to
* @ origin : type of seek
*
* Updates the file offset to the value specified by @ offset and @ origin .
* Locking must be provided by the caller .
*/
2008-06-27 13:05:24 +04:00
loff_t
generic_file_llseek_unlocked ( struct file * file , loff_t offset , int origin )
2005-04-17 02:20:36 +04:00
{
struct inode * inode = file - > f_mapping - > host ;
switch ( origin ) {
2008-08-11 17:37:17 +04:00
case SEEK_END :
offset + = inode - > i_size ;
break ;
case SEEK_CUR :
2008-11-11 04:08:08 +03:00
/*
* Here we special - case the lseek ( fd , 0 , SEEK_CUR )
* position - querying operation . Avoid rewriting the " same "
* f_pos value back to the file because a concurrent read ( ) ,
* write ( ) or lseek ( ) might have altered it
*/
if ( offset = = 0 )
return file - > f_pos ;
2008-08-11 17:37:17 +04:00
offset + = file - > f_pos ;
break ;
2005-04-17 02:20:36 +04:00
}
2008-08-11 17:37:17 +04:00
2010-10-02 01:20:22 +04:00
if ( offset < 0 & & __negative_fpos_check ( file , offset , 0 ) )
return - EINVAL ;
if ( offset > inode - > i_sb - > s_maxbytes )
2008-08-11 17:37:17 +04:00
return - EINVAL ;
/* Special lock needed here? */
if ( offset ! = file - > f_pos ) {
file - > f_pos = offset ;
file - > f_version = 0 ;
2005-04-17 02:20:36 +04:00
}
2008-08-11 17:37:17 +04:00
return offset ;
2005-04-17 02:20:36 +04:00
}
2008-06-27 13:05:24 +04:00
EXPORT_SYMBOL ( generic_file_llseek_unlocked ) ;
2005-04-17 02:20:36 +04:00
2008-08-11 17:37:17 +04:00
/**
* generic_file_llseek - generic llseek implementation for regular files
* @ file : file structure to seek on
* @ offset : file offset to seek to
* @ origin : type of seek
*
* This is a generic implemenation of - > llseek useable for all normal local
* filesystems . It just updates the file offset to the value specified by
* @ offset and @ origin under i_mutex .
*/
2008-06-27 13:05:24 +04:00
loff_t generic_file_llseek ( struct file * file , loff_t offset , int origin )
2005-04-17 02:20:36 +04:00
{
2008-08-11 17:37:17 +04:00
loff_t rval ;
2008-06-27 13:05:24 +04:00
mutex_lock ( & file - > f_dentry - > d_inode - > i_mutex ) ;
2008-08-11 17:37:17 +04:00
rval = generic_file_llseek_unlocked ( file , offset , origin ) ;
2008-06-27 13:05:24 +04:00
mutex_unlock ( & file - > f_dentry - > d_inode - > i_mutex ) ;
2008-08-11 17:37:17 +04:00
return rval ;
2005-04-17 02:20:36 +04:00
}
2008-06-27 13:05:24 +04:00
EXPORT_SYMBOL ( generic_file_llseek ) ;
2005-04-17 02:20:36 +04:00
2010-05-27 01:44:48 +04:00
/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:	file structure to seek on
 * @offset:	file offset to seek to (ignored)
 * @origin:	type of seek (ignored)
 *
 * An ->llseek implementation for the rare special case where userspace
 * expects the seek to succeed although the (device) file cannot actually
 * seek.  Use noop_llseek() in that case instead of falling back to the
 * default ->llseek.  The file position is left untouched and its current
 * value is returned.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int origin)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);
2005-04-17 02:20:36 +04:00
/*
 * llseek method for files that cannot seek: every request is refused
 * with -ESPIPE and no argument is examined.
 */
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
/*
 * llseek implementation that runs entirely under the inode's i_mutex.
 *
 * SEEK_END is relative to i_size_read() of the file's inode, SEEK_CUR is
 * relative to f_pos (with offset 0 treated as a pure position query), and
 * any other origin uses @offset as given.  Returns the new position, or
 * -EINVAL if the result is negative and __negative_fpos_check() refuses it.
 */
loff_t default_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t retval;

	mutex_lock(&file->f_dentry->d_inode->i_mutex);

	if (origin == SEEK_END) {
		offset += i_size_read(file->f_path.dentry->d_inode);
	} else if (origin == SEEK_CUR) {
		/* lseek(fd, 0, SEEK_CUR) only reports the position. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
	}

	if (offset < 0 && __negative_fpos_check(file, offset, 0)) {
		retval = -EINVAL;
		goto out;
	}

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	retval = offset;
out:
	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
/*
 * Dispatch a seek request to the file's ->llseek method.
 *
 * no_llseek() is used unless the file has FMODE_LSEEK set and its
 * file_operations provide an llseek method.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t (*fn)(struct file *, loff_t, int) = no_llseek;

	if ((file->f_mode & FMODE_LSEEK) && file->f_op && file->f_op->llseek)
		fn = file->f_op->llseek;

	return fn(file, offset, origin);
}
EXPORT_SYMBOL(vfs_llseek);
2009-01-14 16:14:21 +03:00
/*
 * lseek system call.
 *
 * Returns -EBADF for an invalid descriptor, -EINVAL for an origin above
 * SEEK_MAX, -EOVERFLOW when the resulting position does not fit in off_t,
 * and otherwise whatever vfs_llseek() returned.
 */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
{
	off_t retval = -EINVAL;
	struct file *file;
	int fput_needed;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin <= SEEK_MAX) {
		loff_t res = vfs_llseek(file, offset, origin);

		retval = res;
		/* LFS: should only happen on 32 bit platforms */
		if (res != (loff_t)retval)
			retval = -EOVERFLOW;
	}

	fput_light(file, fput_needed);
	return retval;
}
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek system call: the 64-bit offset arrives as two words and the
 * resulting position is written to the user pointer @result.
 *
 * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL for an
 * origin above SEEK_MAX, the (truncated to int) error from vfs_llseek(),
 * or -EFAULT when the result cannot be copied to user space.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, origin)
{
	int retval;
	struct file *file;
	loff_t offset;
	int fput_needed;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin > SEEK_MAX) {
		retval = -EINVAL;
		goto out_putf;
	}

	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
			origin);

	if (offset < 0)
		retval = (int)offset;
	else if (copy_to_user(result, &offset, sizeof(offset)))
		retval = -EFAULT;
	else
		retval = 0;
out_putf:
	fput_light(file, fput_needed);
	return retval;
}
#endif
2010-10-02 01:20:22 +04:00
2006-01-05 03:20:40 +03:00
/*
* rw_verify_area doesn ' t like huge counts . We limit
* them to something that fits in " int " so that others
* won ' t have to do range checks all the time .
*/
# define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
2005-04-17 02:20:36 +04:00
/*
 * Validate a read or write request of @count bytes at *@ppos on @file.
 *
 * Checks, in order: that @count is not negative when viewed as ssize_t,
 * that a negative start or end position is acceptable for this file,
 * mandatory locks (only when the inode has locks and mandatory locking
 * applies), and the security hook for MAY_READ / MAY_WRITE.
 *
 * Returns a negative errno on failure; otherwise the byte count the caller
 * should use, which is @count limited to MAX_RW_COUNT.
 */
int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	const int reading = (read_write == READ);
	loff_t pos;
	int retval;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	pos = *ppos;
	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) {
		retval = __negative_fpos_check(file, pos, count);
		if (retval)
			return retval;
	}

	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
		retval = locks_mandatory_area(
			reading ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
			inode, file, pos, count);
		if (retval < 0)
			return retval;
	}

	retval = security_file_permission(file,
				reading ? MAY_READ : MAY_WRITE);
	if (retval)
		return retval;

	if (count > MAX_RW_COUNT)
		return MAX_RW_COUNT;
	return count;
}
2005-06-23 11:10:27 +04:00
/*
 * Wait before retrying a synchronous kiocb.
 *
 * The task is marked TASK_UNINTERRUPTIBLE first; if the iocb has already
 * been kicked the flag is just cleared, otherwise we schedule().  The task
 * state is put back to TASK_RUNNING before returning.
 */
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	if (kiocbIsKicked(iocb))
		kiocbClearKicked(iocb);
	else
		schedule();
	__set_current_state(TASK_RUNNING);
}
2005-04-17 02:20:36 +04:00
/*
 * Synchronous read built on top of the file's ->aio_read method.
 *
 * A single-segment iovec and an on-stack sync kiocb are set up for @buf /
 * @len at *@ppos.  ->aio_read is re-issued for as long as it returns
 * -EIOCBRETRY (waiting in between), and a result of -EIOCBQUEUED is
 * replaced by what wait_on_sync_kiocb() returns.  *@ppos is updated from
 * the kiocb before the result is returned.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	while ((ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos))
			== -EIOCBRETRY)
		wait_on_retry_sync_kiocb(&kiocb);

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(do_sync_read);
/*
 * Read up to @count bytes from @file at *@pos into the user buffer @buf.
 *
 * Returns -EBADF if the file is not open for reading, -EINVAL if it has
 * neither a ->read nor an ->aio_read method, -EFAULT if @buf fails
 * access_ok(), or the error from rw_verify_area().  Otherwise the request
 * (limited to the count rw_verify_area() returned) goes to ->read, or to
 * do_sync_read() when ->read is absent, and that result is returned.
 * A positive result triggers fsnotify_access() and read accounting.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || !(file->f_op->read || file->f_op->aio_read))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	nread = rw_verify_area(READ, file, pos, count);
	if (nread < 0)
		return nread;

	count = nread;
	nread = file->f_op->read ? file->f_op->read(file, buf, count, pos)
				 : do_sync_read(file, buf, count, pos);
	if (nread > 0) {
		fsnotify_access(file);
		add_rchar(current, nread);
	}
	inc_syscr(current);

	return nread;
}
EXPORT_SYMBOL(vfs_read);
/*
 * Synchronous write built on top of the file's ->aio_write method.
 *
 * Mirrors do_sync_read(): a single-segment iovec and an on-stack sync
 * kiocb describe @buf / @len at *@ppos, ->aio_write is re-issued while it
 * returns -EIOCBRETRY, -EIOCBQUEUED is resolved through
 * wait_on_sync_kiocb(), and *@ppos is updated from the kiocb.
 */
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	while ((ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos))
			== -EIOCBRETRY)
		wait_on_retry_sync_kiocb(&kiocb);

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
EXPORT_SYMBOL(do_sync_write);
/*
 * Write up to @count bytes from the user buffer @buf to @file at *@pos.
 *
 * Returns -EBADF if the file is not open for writing, -EINVAL if it has
 * neither a ->write nor an ->aio_write method, -EFAULT if @buf fails
 * access_ok(), or the error from rw_verify_area().  Otherwise the request
 * (limited to the count rw_verify_area() returned) goes to ->write, or to
 * do_sync_write() when ->write is absent, and that result is returned.
 * A positive result triggers fsnotify_modify() and write accounting.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nwritten;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!file->f_op || !(file->f_op->write || file->f_op->aio_write))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	nwritten = rw_verify_area(WRITE, file, pos, count);
	if (nwritten < 0)
		return nwritten;

	count = nwritten;
	nwritten = file->f_op->write ? file->f_op->write(file, buf, count, pos)
				     : do_sync_write(file, buf, count, pos);
	if (nwritten > 0) {
		fsnotify_modify(file);
		add_wchar(current, nwritten);
	}
	inc_syscw(current);

	return nwritten;
}
EXPORT_SYMBOL(vfs_write);
static inline loff_t file_pos_read ( struct file * file )
{
return file - > f_pos ;
}
/* Store @pos as the new position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}
2009-01-14 16:14:22 +03:00
/*
 * read system call: read from the descriptor's current position and store
 * the position reported back by vfs_read().  Returns -EBADF when @fd does
 * not resolve to a file, otherwise the result of vfs_read().
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct file *file;
	ssize_t ret;
	int fput_needed;
	loff_t pos;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	pos = file_pos_read(file);
	ret = vfs_read(file, buf, count, &pos);
	file_pos_write(file, pos);
	fput_light(file, fput_needed);

	return ret;
}
2009-01-14 16:14:22 +03:00
/*
 * write system call: write at the descriptor's current position and store
 * the position reported back by vfs_write().  Returns -EBADF when @fd does
 * not resolve to a file, otherwise the result of vfs_write().
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct file *file;
	ssize_t ret;
	int fput_needed;
	loff_t pos;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	pos = file_pos_read(file);
	ret = vfs_write(file, buf, count, &pos);
	file_pos_write(file, pos);
	fput_light(file, fput_needed);

	return ret;
}
2009-01-14 16:14:02 +03:00
/*
 * pread64 system call: read at an explicit position without touching the
 * file's own f_pos.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file, -ESPIPE when the file lacks FMODE_PREAD, and otherwise the
 * result of vfs_read().
 */
SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
			size_t count, loff_t pos)
{
	struct file *file;
	ssize_t ret;
	int fput_needed;

	if (pos < 0)
		return -EINVAL;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (file->f_mode & FMODE_PREAD)
		ret = vfs_read(file, buf, count, &pos);
	else
		ret = -ESPIPE;
	fput_light(file, fput_needed);

	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/* Long-typed entry point that casts its arguments and calls SYSC_pread64(). */
asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
{
	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
			    (size_t) count, pos);
}
SYSCALL_ALIAS(sys_pread64, SyS_pread64);
#endif
2005-04-17 02:20:36 +04:00
2009-01-14 16:14:02 +03:00
/*
 * pwrite64 system call: write at an explicit position without touching the
 * file's own f_pos.
 *
 * Returns -EINVAL for a negative @pos, -EBADF when @fd does not resolve to
 * a file, -ESPIPE when the file lacks FMODE_PWRITE, and otherwise the
 * result of vfs_write().
 */
SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
			 size_t count, loff_t pos)
{
	struct file *file;
	ssize_t ret;
	int fput_needed;

	if (pos < 0)
		return -EINVAL;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (file->f_mode & FMODE_PWRITE)
		ret = vfs_write(file, buf, count, &pos);
	else
		ret = -ESPIPE;
	fput_light(file, fput_needed);

	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/* Long-typed entry point that casts its arguments and calls SYSC_pwrite64(). */
asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
{
	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
			     (size_t) count, pos);
}
SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
#endif
2005-04-17 02:20:36 +04:00
/*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
 *
 * @iov:	array of segments; the segment in which the limit is reached
 *		has its iov_len reduced, later segments are left untouched
 * @nr_segs:	number of entries in @iov
 * @to:		total number of bytes the vector should describe afterwards
 *
 * If the segments together hold fewer than @to bytes nothing is changed
 * and @nr_segs is returned.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		/*
		 * len never exceeds "to" here (it only grows while the sum
		 * stays below "to"), so "to - len" cannot underflow.  Comparing
		 * against the remaining room instead of "len + iov_len"
		 * avoids a size_t wrap when a segment length is huge.
		 */
		if (iov->iov_len >= to - len) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
2008-01-29 07:58:27 +03:00
EXPORT_SYMBOL ( iov_shorten ) ;
2005-04-17 02:20:36 +04:00
2006-10-01 10:28:47 +04:00
/*
 * Run a vectored aio method @fn synchronously.
 *
 * An on-stack sync kiocb is initialised for @filp with position *@ppos and
 * @len bytes outstanding.  @fn is re-issued for as long as it returns
 * -EIOCBRETRY (waiting in between); -EIOCBQUEUED is resolved through
 * wait_on_sync_kiocb().  *@ppos is updated from the kiocb.
 */
ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	kiocb.ki_left = len;
	kiocb.ki_nbytes = len;

	while ((ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos)) == -EIOCBRETRY)
		wait_on_retry_sync_kiocb(&kiocb);

	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}
/*
 * Do it by hand, with file-ops: call the plain read/write method @fn once
 * per segment.
 *
 * Stops at the first error or short transfer.  Returns the number of bytes
 * transferred so far; an error code is returned only when nothing had been
 * transferred before the failing segment.
 */
ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
{
	ssize_t total = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++) {
		size_t want = iov[seg].iov_len;
		ssize_t done = fn(filp, iov[seg].iov_base, want, ppos);

		if (done < 0) {
			if (!total)
				total = done;
			break;
		}
		total += done;
		if (done != want)
			break;
	}

	return total;
}
2005-04-17 02:20:36 +04:00
/* A write operation does a read from user space and vice versa */
# define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
2006-10-01 10:28:49 +04:00
/*
 * Copy an iovec array in from user space and validate it.
 *
 * @type:	  READ or WRITE; selects the access_ok() direction
 * @uvector:	  user-space array of @nr_segs segments
 * @nr_segs:	  number of segments, at most UIO_MAXIOV
 * @fast_segs:	  capacity of @fast_pointer
 * @fast_pointer: caller-provided array used when @nr_segs fits in it
 * @ret_pointer:  receives the array actually used; when it differs from
 *		  @fast_pointer it was kmalloc()ed here and the caller must
 *		  kfree() it, on failure paths as well
 *
 * Returns the total number of bytes described by the vector (0 for zero
 * segments), or -EINVAL / -ENOMEM / -EFAULT.  The total is limited to
 * MAX_RW_COUNT: the segment that would cross the limit is shortened in the
 * kernel copy, so the vector and the returned total agree.
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
			      unsigned long nr_segs, unsigned long fast_segs,
			      struct iovec *fast_pointer,
			      struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0) {
		ret = 0;
		goto out;
	}

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV) {
		ret = -EINVAL;
		goto out;
	}
	if (nr_segs > fast_segs) {
		iov = kmalloc(nr_segs * sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL) {
			ret = -ENOMEM;
			goto out;
		}
	}
	if (copy_from_user(iov, uvector, nr_segs * sizeof(*uvector))) {
		ret = -EFAULT;
		goto out;
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * The total is capped at MAX_RW_COUNT here, the same limit plain
	 * read/write get from rw_verify_area(), so the running sum can never
	 * overflow and no signed-wraparound test is needed.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* a length that is negative as ssize_t is always invalid */
		if (len < 0) {
			ret = -EINVAL;
			goto out;
		}
		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
			ret = -EFAULT;
			goto out;
		}
		/* ret <= MAX_RW_COUNT holds, so the subtraction is safe */
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
out:
	*ret_pointer = iov;
	return ret;
}
2005-04-17 02:20:36 +04:00
/*
 * Common implementation of vectored read and write.
 *
 * The user vector is copied in and checked by rw_copy_check_uvector(), the
 * total length is validated by rw_verify_area(), and the transfer is done
 * through the file's aio method when it has one, otherwise by looping over
 * the plain ->read / ->write method.
 *
 * An iovec array allocated by rw_copy_check_uvector() is freed here.
 * fsnotify_access() is called for a read that returns >= 0 and
 * fsnotify_modify() for a write that returns > 0.
 */
static ssize_t do_readv_writev(int type, struct file *file,
			       const struct iovec __user * uvector,
			       unsigned long nr_segs, loff_t *pos)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	const int reading = (type == READ);
	size_t tot_len;
	ssize_t ret;
	io_fn_t fn;
	iov_fn_t fnv;

	/* Nothing has been allocated or transferred yet: plain return. */
	if (!file->f_op)
		return -EINVAL;

	ret = rw_copy_check_uvector(type, uvector, nr_segs,
			ARRAY_SIZE(iovstack), iovstack, &iov);
	if (ret <= 0)
		goto out;

	tot_len = ret;
	ret = rw_verify_area(type, file, pos, tot_len);
	if (ret < 0)
		goto out;

	if (reading) {
		fn = file->f_op->read;
		fnv = file->f_op->aio_read;
	} else {
		fn = (io_fn_t)file->f_op->write;
		fnv = file->f_op->aio_write;
	}

	if (fnv)
		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
						pos, fnv);
	else
		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);

out:
	if (iov != iovstack)
		kfree(iov);

	if (reading) {
		if (ret >= 0)
			fsnotify_access(file);
	} else if (ret > 0) {
		fsnotify_modify(file);
	}
	return ret;
}
/*
 * Vectored read from @file at *@pos.
 *
 * Returns -EBADF if the file is not open for reading and -EINVAL if it has
 * neither an ->aio_read nor a ->read method; otherwise the result of
 * do_readv_writev().
 */
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
		  unsigned long vlen, loff_t *pos)
{
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!file->f_op || !(file->f_op->aio_read || file->f_op->read))
		return -EINVAL;

	return do_readv_writev(READ, file, vec, vlen, pos);
}
EXPORT_SYMBOL(vfs_readv);
/*
 * Vectored write to @file at *@pos.
 *
 * Returns -EBADF if the file is not open for writing and -EINVAL if it has
 * neither an ->aio_write nor a ->write method; otherwise the result of
 * do_readv_writev().
 */
ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
		   unsigned long vlen, loff_t *pos)
{
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!file->f_op || !(file->f_op->aio_write || file->f_op->write))
		return -EINVAL;

	return do_readv_writev(WRITE, file, vec, vlen, pos);
}
EXPORT_SYMBOL(vfs_writev);
2009-01-14 16:14:22 +03:00
/*
 * readv system call: vectored read at the descriptor's current position.
 *
 * Returns -EBADF when @fd does not resolve to a file, otherwise the result
 * of vfs_readv().  Bytes read are accounted when the result is positive;
 * the read-syscall counter is bumped on every call.
 */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	ssize_t ret = -EBADF;
	struct file *file;
	int fput_needed;

	file = fget_light(fd, &fput_needed);
	if (file) {
		loff_t pos = file_pos_read(file);

		ret = vfs_readv(file, vec, vlen, &pos);
		file_pos_write(file, pos);
		fput_light(file, fput_needed);

		if (ret > 0)
			add_rchar(current, ret);
	}
	inc_syscr(current);

	return ret;
}
2009-01-14 16:14:22 +03:00
/*
 * writev system call: vectored write at the descriptor's current position.
 *
 * Returns -EBADF when @fd does not resolve to a file, otherwise the result
 * of vfs_writev().  Bytes written are accounted when the result is
 * positive; the write-syscall counter is bumped on every call.
 */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	ssize_t ret = -EBADF;
	struct file *file;
	int fput_needed;

	file = fget_light(fd, &fput_needed);
	if (file) {
		loff_t pos = file_pos_read(file);

		ret = vfs_writev(file, vec, vlen, &pos);
		file_pos_write(file, pos);
		fput_light(file, fput_needed);

		if (ret > 0)
			add_wchar(current, ret);
	}
	inc_syscw(current);

	return ret;
}
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split them into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 19:03:22 +04:00
static inline loff_t pos_from_hilo ( unsigned long high , unsigned long low )
{
# define HALF_LONG_BITS (BITS_PER_LONG / 2)
return ( ( ( loff_t ) high < < HALF_LONG_BITS ) < < HALF_LONG_BITS ) | low ;
}
2009-04-03 03:59:23 +04:00
/*
 * preadv system call: vectored read at an explicit file offset.
 *
 * fd     - descriptor looked up with fget_light()
 * vec    - user-space iovec array handed to vfs_readv()
 * vlen   - number of entries in vec
 * pos_l  - low word of the offset
 * pos_h  - high word of the offset
 *
 * Returns the vfs_readv() result, or:
 *   -EINVAL  the offset built from pos_h/pos_l is negative
 *   -EBADF   fget_light() returned no file for fd
 *   -ESPIPE  the file does not have FMODE_PREAD set
 *
 * The lines between the code fragments below are blame annotations and
 * are kept verbatim.
 */
SYSCALL_DEFINE5 ( preadv , unsigned long , fd , const struct iovec __user * , vec ,
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split them into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 19:03:22 +04:00
unsigned long , vlen , unsigned long , pos_l , unsigned long , pos_h )
2009-04-03 03:59:23 +04:00
{
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split them into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 19:03:22 +04:00
loff_t pos = pos_from_hilo ( pos_h , pos_l ) ;
2009-04-03 03:59:23 +04:00
struct file * file ;
/* Result when fget_light() below yields no file. */
ssize_t ret = - EBADF ;
int fput_needed ;
/* Reject a negative offset before touching the descriptor; this path skips the accounting below. */
if ( pos < 0 )
return - EINVAL ;
file = fget_light ( fd , & fput_needed ) ;
if ( file ) {
/* Stays -ESPIPE unless the file allows positional reads. */
ret = - ESPIPE ;
if ( file - > f_mode & FMODE_PREAD )
ret = vfs_readv ( file , vec , vlen , & pos ) ;
fput_light ( file , fput_needed ) ;
}
/* Only a positive return value is added to the read-bytes counter. */
if ( ret > 0 )
add_rchar ( current , ret ) ;
/* The read-syscall counter is bumped on success and on failure alike. */
inc_syscr ( current ) ;
return ret ;
}
/*
 * pwritev system call: vectored write at an explicit file offset.
 *
 * fd     - descriptor looked up with fget_light()
 * vec    - user-space iovec array handed to vfs_writev()
 * vlen   - number of entries in vec
 * pos_l  - low word of the offset
 * pos_h  - high word of the offset
 *
 * Returns the vfs_writev() result, or:
 *   -EINVAL  the offset built from pos_h/pos_l is negative
 *   -EBADF   fget_light() returned no file for fd
 *   -ESPIPE  the file does not have FMODE_PWRITE set
 *
 * The lines between the code fragments below are blame annotations and
 * are kept verbatim.
 */
SYSCALL_DEFINE5 ( pwritev , unsigned long , fd , const struct iovec __user * , vec ,
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split them into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 19:03:22 +04:00
unsigned long , vlen , unsigned long , pos_l , unsigned long , pos_h )
2009-04-03 03:59:23 +04:00
{
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split them into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 19:03:22 +04:00
loff_t pos = pos_from_hilo ( pos_h , pos_l ) ;
2009-04-03 03:59:23 +04:00
struct file * file ;
/* Result when fget_light() below yields no file. */
ssize_t ret = - EBADF ;
int fput_needed ;
/* Reject a negative offset before touching the descriptor; this path skips the accounting below. */
if ( pos < 0 )
return - EINVAL ;
file = fget_light ( fd , & fput_needed ) ;
if ( file ) {
/* Stays -ESPIPE unless the file allows positional writes. */
ret = - ESPIPE ;
if ( file - > f_mode & FMODE_PWRITE )
ret = vfs_writev ( file , vec , vlen , & pos ) ;
fput_light ( file , fput_needed ) ;
}
/* Only a positive return value is added to the written-bytes counter. */
if ( ret > 0 )
add_wchar ( current , ret ) ;
/* The write-syscall counter is bumped on success and on failure alike. */
inc_syscw ( current ) ;
return ret ;
}
2005-04-17 02:20:36 +04:00
/*
 * do_sendfile - common worker behind the sendfile and sendfile64 syscalls.
 *
 * out_fd - descriptor to write to; must be open with FMODE_WRITE
 * in_fd  - descriptor to read from; must be open with FMODE_READ
 * ppos   - input offset to use and update; when NULL the input file's own
 *          f_pos is used, otherwise the input file must have FMODE_PREAD
 * count  - requested byte count; replaced by the value each
 *          rw_verify_area() call returns and clipped so that
 *          pos + count does not exceed max
 * max    - upper bound for the input offset; 0 selects the smaller
 *          s_maxbytes of the two files' superblocks
 *
 * Returns the do_splice_direct() result, or a negative errno:
 *   -EBADF      a descriptor is missing or lacks the needed access mode
 *   -ESPIPE     ppos was supplied but the input file lacks FMODE_PREAD
 *   -EOVERFLOW  the start offset is at or beyond max, or *ppos is beyond
 *               max after the transfer
 *   any negative value returned by rw_verify_area()
 *
 * Timestamp and commit-message lines between the code fragments are blame
 * annotations and are kept verbatim.
 */
static ssize_t do_sendfile ( int out_fd , int in_fd , loff_t * ppos ,
size_t count , loff_t max )
{
struct file * in_file , * out_file ;
struct inode * in_inode , * out_inode ;
loff_t pos ;
ssize_t retval ;
2007-06-01 16:52:37 +04:00
int fput_needed_in , fput_needed_out , fl ;
2005-04-17 02:20:36 +04:00
/*
* Get input file , and verify that it is ok . .
*/
retval = - EBADF ;
in_file = fget_light ( in_fd , & fput_needed_in ) ;
if ( ! in_file )
goto out ;
if ( ! ( in_file - > f_mode & FMODE_READ ) )
goto fput_in ;
retval = - ESPIPE ;
/* No explicit offset: operate on the input file's own position. */
if ( ! ppos )
ppos = & in_file - > f_pos ;
else
if ( ! ( in_file - > f_mode & FMODE_PREAD ) )
goto fput_in ;
retval = rw_verify_area ( READ , in_file , ppos , count ) ;
2006-01-05 03:20:40 +03:00
if ( retval < 0 )
2005-04-17 02:20:36 +04:00
goto fput_in ;
2006-01-05 03:20:40 +03:00
/* A non-negative rw_verify_area() result becomes the new count. */
count = retval ;
2005-04-17 02:20:36 +04:00
/*
* Get output file , and verify that it is ok . .
*/
retval = - EBADF ;
out_file = fget_light ( out_fd , & fput_needed_out ) ;
if ( ! out_file )
goto fput_in ;
if ( ! ( out_file - > f_mode & FMODE_WRITE ) )
goto fput_out ;
/*
 * NOTE(review): this -EINVAL is overwritten by the rw_verify_area()
 * result below before any jump to a cleanup label, so it is never
 * returned.
 */
retval = - EINVAL ;
2009-05-07 17:37:36 +04:00
in_inode = in_file - > f_path . dentry - > d_inode ;
2006-12-08 13:36:35 +03:00
out_inode = out_file - > f_path . dentry - > d_inode ;
2005-04-17 02:20:36 +04:00
retval = rw_verify_area ( WRITE , out_file , & out_file - > f_pos , count ) ;
2006-01-05 03:20:40 +03:00
if ( retval < 0 )
2005-04-17 02:20:36 +04:00
goto fput_out ;
2006-01-05 03:20:40 +03:00
count = retval ;
2005-04-17 02:20:36 +04:00
/* max == 0 means: use the smaller s_maxbytes of the two superblocks. */
if ( ! max )
max = min ( in_inode - > i_sb - > s_maxbytes , out_inode - > i_sb - > s_maxbytes ) ;
pos = * ppos ;
/*
 * If the request would run past max, fail with -EOVERFLOW when the
 * start offset is already at or beyond max; otherwise shorten count so
 * the transfer ends at max.
 */
if ( unlikely ( pos + count > max ) ) {
retval = - EOVERFLOW ;
if ( pos > = max )
goto fput_out ;
count = max - pos ;
}
2007-06-11 14:18:52 +04:00
fl = 0 ;
2007-06-01 16:52:37 +04:00
#if 0
2007-06-11 14:18:52 +04:00
/*
* We need to debate whether we can enable this or not . The
* man page documents EAGAIN return for the output at least ,
* and the application is arguably buggy if it doesn ' t expect
* EAGAIN on a non - blocking file descriptor .
*/
if ( in_file - > f_flags & O_NONBLOCK )
fl = SPLICE_F_NONBLOCK ;
2007-06-01 16:52:37 +04:00
# endif
2007-06-11 14:18:52 +04:00
/* fl is always 0 here because the block above is compiled out. */
retval = do_splice_direct ( in_file , ppos , out_file , count , fl ) ;
2005-04-17 02:20:36 +04:00
/* A positive result is added to both the read and the write byte counters. */
if ( retval > 0 ) {
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:46:45 +03:00
add_rchar ( current , retval ) ;
add_wchar ( current , retval ) ;
2005-04-17 02:20:36 +04:00
}
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 12:46:45 +03:00
/* Both syscall counters are bumped once the splice was attempted, whatever it returned. */
inc_syscr ( current ) ;
inc_syscw ( current ) ;
2005-04-17 02:20:36 +04:00
/* An offset beyond max after the transfer turns the result into -EOVERFLOW. */
if ( * ppos > max )
retval = - EOVERFLOW ;
/* Cleanup labels release the files in reverse order of acquisition. */
fput_out :
fput_light ( out_file , fput_needed_out ) ;
fput_in :
fput_light ( in_file , fput_needed_in ) ;
out :
return retval ;
}
2009-01-14 16:14:18 +03:00
/*
 * sendfile system call, off_t flavour.
 *
 * out_fd, in_fd - descriptors passed straight to do_sendfile()
 * offset        - optional user pointer to the input offset; NULL means
 *                 do_sendfile() uses the input file's own position
 * count         - requested byte count
 *
 * With a non-NULL offset the value is read from user space, the transfer
 * is bounded by MAX_NON_LFS, and the updated position is written back.
 * Returns the do_sendfile() result, or -EFAULT when the user offset cannot
 * be read or written.  A failing write-back returns -EFAULT even though
 * do_sendfile() has already run.
 */
SYSCALL_DEFINE4 ( sendfile , int , out_fd , int , in_fd , off_t __user * , offset , size_t , count )
2005-04-17 02:20:36 +04:00
{
loff_t pos ;
off_t off ;
ssize_t ret ;
if ( offset ) {
if ( unlikely ( get_user ( off , offset ) ) )
return - EFAULT ;
/* Widen the user-supplied off_t to the loff_t that do_sendfile() works on. */
pos = off ;
ret = do_sendfile ( out_fd , in_fd , & pos , count , MAX_NON_LFS ) ;
/* Report the updated position back to the caller. */
if ( unlikely ( put_user ( pos , offset ) ) )
return - EFAULT ;
return ret ;
}
/* No offset given: NULL position and max 0, so do_sendfile() picks its defaults. */
return do_sendfile ( out_fd , in_fd , NULL , count , 0 ) ;
}
2009-01-14 16:14:18 +03:00
/*
 * sendfile64 system call, loff_t flavour.
 *
 * out_fd, in_fd - descriptors passed straight to do_sendfile()
 * offset        - optional user pointer to a loff_t input offset; NULL
 *                 means do_sendfile() uses the input file's own position
 * count         - requested byte count
 *
 * Both paths pass max 0, so do_sendfile() derives the limit itself.
 * Returns the do_sendfile() result, or -EFAULT when the user offset cannot
 * be read or written.  A failing write-back returns -EFAULT even though
 * do_sendfile() has already run.
 */
SYSCALL_DEFINE4 ( sendfile64 , int , out_fd , int , in_fd , loff_t __user * , offset , size_t , count )
2005-04-17 02:20:36 +04:00
{
loff_t pos ;
ssize_t ret ;
if ( offset ) {
/* The offset is fetched with copy_from_user() over sizeof(loff_t) bytes. */
if ( unlikely ( copy_from_user ( & pos , offset , sizeof ( loff_t ) ) ) )
return - EFAULT ;
ret = do_sendfile ( out_fd , in_fd , & pos , count , 0 ) ;
/* Report the updated position back to the caller. */
if ( unlikely ( put_user ( pos , offset ) ) )
return - EFAULT ;
return ret ;
}
return do_sendfile ( out_fd , in_fd , NULL , count , 0 ) ;
}