2006-03-31 14:30:42 +04:00
/*
* High - level sync ( ) - related operations
*/
# include <linux/kernel.h>
# include <linux/file.h>
# include <linux/fs.h>
# include <linux/module.h>
2006-10-18 21:55:46 +04:00
# include <linux/sched.h>
2006-03-31 14:30:42 +04:00
# include <linux/writeback.h>
# include <linux/syscalls.h>
# include <linux/linkage.h>
# include <linux/pagemap.h>
2006-08-29 22:05:54 +04:00
# include <linux/quotaops.h>
# include <linux/buffer_head.h>
2009-04-27 18:43:48 +04:00
# include "internal.h"
2006-03-31 14:30:42 +04:00
/*
 * Mask of every flag bit that sync_file_range() accepts.  A request that
 * carries any bit outside this mask is rejected with -EINVAL.
 */
# define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
SYNC_FILE_RANGE_WAIT_AFTER )
2009-04-27 18:43:52 +04:00
/*
* Do the filesystem syncing work . For simple filesystems sync_inodes_sb ( sb , 0 )
* just dirties buffers with inodes so we have to submit IO for these buffers
* via __sync_blockdev ( ) . This also speeds up the wait = = 1 case since in that
* case write_inode ( ) functions do sync_dirty_buffer ( ) and thus effectively
* write one block at a time .
*/
2009-04-27 18:43:53 +04:00
static int __sync_filesystem ( struct super_block * sb , int wait )
2009-04-27 18:43:52 +04:00
{
2009-04-27 18:43:55 +04:00
/* Avoid doing twice syncing and cache pruning for quota sync */
if ( ! wait )
writeout_quota_sb ( sb , - 1 ) ;
else
sync_quota_sb ( sb , - 1 ) ;
2009-04-27 18:43:52 +04:00
sync_inodes_sb ( sb , wait ) ;
if ( sb - > s_op - > sync_fs )
sb - > s_op - > sync_fs ( sb , wait ) ;
return __sync_blockdev ( sb - > s_bdev , wait ) ;
}
/*
* Write out and wait upon all dirty data associated with this
* superblock . Filesystem data as well as the underlying block
* device . Takes the superblock lock .
*/
2009-04-27 18:43:53 +04:00
int sync_filesystem ( struct super_block * sb )
2009-04-27 18:43:52 +04:00
{
int ret ;
2009-05-05 17:41:25 +04:00
/*
* We need to be protected against the filesystem going from
* r / o to r / w or vice versa .
*/
WARN_ON ( ! rwsem_is_locked ( & sb - > s_umount ) ) ;
/*
* No point in syncing out anything if the filesystem is read - only .
*/
if ( sb - > s_flags & MS_RDONLY )
return 0 ;
2009-04-27 18:43:53 +04:00
ret = __sync_filesystem ( sb , 0 ) ;
2009-04-27 18:43:52 +04:00
if ( ret < 0 )
return ret ;
2009-04-27 18:43:53 +04:00
return __sync_filesystem ( sb , 1 ) ;
2009-04-27 18:43:52 +04:00
}
2009-04-27 18:43:53 +04:00
EXPORT_SYMBOL_GPL ( sync_filesystem ) ;
2009-04-27 18:43:52 +04:00
/*
* Sync all the data for all the filesystems ( called by sys_sync ( ) and
* emergency sync )
*
* This operation is careful to avoid the livelock which could easily happen
* if two or more filesystems are being continuously dirtied . s_need_sync
* is used only here . We set it against all filesystems and then clear it as
* we sync them . So redirtied filesystems are skipped .
*
* But if process A is currently running sync_filesystems and then process B
* calls sync_filesystems as well , process B will set all the s_need_sync
* flags again , which will cause process A to resync everything . Fix that with
* a local mutex .
*/
static void sync_filesystems ( int wait )
{
struct super_block * sb ;
static DEFINE_MUTEX ( mutex ) ;
mutex_lock ( & mutex ) ; /* Could be down_interruptible */
spin_lock ( & sb_lock ) ;
2009-05-05 17:41:25 +04:00
list_for_each_entry ( sb , & super_blocks , s_list )
2009-04-27 18:43:52 +04:00
sb - > s_need_sync = 1 ;
restart :
list_for_each_entry ( sb , & super_blocks , s_list ) {
if ( ! sb - > s_need_sync )
continue ;
sb - > s_need_sync = 0 ;
sb - > s_count + + ;
spin_unlock ( & sb_lock ) ;
2009-05-05 17:41:25 +04:00
2009-04-27 18:43:52 +04:00
down_read ( & sb - > s_umount ) ;
2009-05-05 17:41:25 +04:00
if ( ! ( sb - > s_flags & MS_RDONLY ) & & sb - > s_root )
2009-04-27 18:43:53 +04:00
__sync_filesystem ( sb , wait ) ;
2009-04-27 18:43:52 +04:00
up_read ( & sb - > s_umount ) ;
2009-05-05 17:41:25 +04:00
2009-04-27 18:43:52 +04:00
/* restart only when sb is no longer on the list */
spin_lock ( & sb_lock ) ;
if ( __put_super_and_need_restart ( sb ) )
goto restart ;
}
spin_unlock ( & sb_lock ) ;
mutex_unlock ( & mutex ) ;
}
2009-07-05 23:08:08 +04:00
/*
* sync everything . Start out by waking pdflush , because that writes back
* all queues in parallel .
*/
2009-04-27 18:43:51 +04:00
SYSCALL_DEFINE0 ( sync )
2006-08-29 22:05:54 +04:00
{
2009-07-05 23:08:08 +04:00
wakeup_pdflush ( 0 ) ;
2009-04-27 18:43:51 +04:00
sync_filesystems ( 0 ) ;
sync_filesystems ( 1 ) ;
2006-08-29 22:05:54 +04:00
if ( unlikely ( laptop_mode ) )
laptop_sync_completion ( ) ;
return 0 ;
}
2009-03-17 11:38:40 +03:00
/*
 * Work function queued by emergency_sync().
 *
 * @work: the work item that was scheduled; it is freed with kfree() once
 *        the sync passes have been issued, so it must not be touched by the
 *        submitter afterwards.
 */
static void do_sync_work(struct work_struct *work)
{
	/*
	 * Sync twice to reduce the possibility we skipped some inodes / pages
	 * because they were temporarily locked
	 */
	sync_filesystems(0);
	sync_filesystems(0);
	/* Message ends at the newline: no padding before or after the text */
	printk("Emergency Sync complete\n");
	kfree(work);
}
2006-08-29 22:05:54 +04:00
void emergency_sync ( void )
{
2009-03-17 11:38:40 +03:00
struct work_struct * work ;
work = kmalloc ( sizeof ( * work ) , GFP_ATOMIC ) ;
if ( work ) {
INIT_WORK ( work , do_sync_work ) ;
schedule_work ( work ) ;
}
2006-08-29 22:05:54 +04:00
}
/*
* Generic function to fsync a file .
*
* filp may be NULL if called via the msync of a vma .
*/
int file_fsync ( struct file * filp , struct dentry * dentry , int datasync )
{
struct inode * inode = dentry - > d_inode ;
struct super_block * sb ;
int ret , err ;
/* sync the inode to buffers */
ret = write_inode_now ( inode , 0 ) ;
/* sync the superblock to buffers */
sb = inode - > i_sb ;
2008-04-29 11:59:42 +04:00
if ( sb - > s_dirt & & sb - > s_op - > write_super )
2006-08-29 22:05:54 +04:00
sb - > s_op - > write_super ( sb ) ;
/* .. finally sync the buffers to disk */
err = sync_blockdev ( sb - > s_bdev ) ;
if ( ! ret )
ret = err ;
return ret ;
}
2008-12-22 23:11:15 +03:00
/**
* vfs_fsync - perform a fsync or fdatasync on a file
* @ file : file to sync
* @ dentry : dentry of @ file
* @ data : only perform a fdatasync operation
*
* Write back data and metadata for @ file to disk . If @ datasync is
* set only metadata needed to access modified file data is written .
*
* In case this function is called from nfsd @ file may be % NULL and
* only @ dentry is set . This can only happen when the filesystem
* implements the export_operations API .
*/
int vfs_fsync ( struct file * file , struct dentry * dentry , int datasync )
2006-08-29 22:05:54 +04:00
{
2008-12-22 23:11:15 +03:00
const struct file_operations * fop ;
struct address_space * mapping ;
int err , ret ;
/*
* Get mapping and operations from the file in case we have
* as file , or get the default values for them in case we
* don ' t have a struct file available . Damn nfsd . .
*/
if ( file ) {
mapping = file - > f_mapping ;
fop = file - > f_op ;
} else {
mapping = dentry - > d_inode - > i_mapping ;
fop = dentry - > d_inode - > i_fop ;
}
2006-08-29 22:05:54 +04:00
2008-12-22 23:11:15 +03:00
if ( ! fop | | ! fop - > fsync ) {
2006-08-29 22:05:54 +04:00
ret = - EINVAL ;
goto out ;
}
ret = filemap_fdatawrite ( mapping ) ;
/*
* We need to protect against concurrent writers , which could cause
* livelocks in fsync_buffers_list ( ) .
*/
mutex_lock ( & mapping - > host - > i_mutex ) ;
2008-12-22 23:11:15 +03:00
err = fop - > fsync ( file , dentry , datasync ) ;
2006-08-29 22:05:54 +04:00
if ( ! ret )
ret = err ;
mutex_unlock ( & mapping - > host - > i_mutex ) ;
err = filemap_fdatawait ( mapping ) ;
if ( ! ret )
ret = err ;
out :
return ret ;
}
2008-12-22 23:11:15 +03:00
EXPORT_SYMBOL ( vfs_fsync ) ;
2006-08-29 22:05:54 +04:00
2008-12-22 23:11:15 +03:00
static int do_fsync ( unsigned int fd , int datasync )
2006-08-29 22:05:54 +04:00
{
struct file * file ;
int ret = - EBADF ;
file = fget ( fd ) ;
if ( file ) {
2008-12-22 23:11:15 +03:00
ret = vfs_fsync ( file , file - > f_path . dentry , datasync ) ;
2006-08-29 22:05:54 +04:00
fput ( file ) ;
}
return ret ;
}
2009-01-14 16:14:11 +03:00
/* fsync system call: do_fsync() with datasync == 0. */
SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
	return do_fsync(fd, 0);
}
2009-01-14 16:14:11 +03:00
/* fdatasync system call: do_fsync() with datasync == 1. */
SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
	return do_fsync(fd, 1);
}
2006-03-31 14:30:42 +04:00
/*
* sys_sync_file_range ( ) permits finely controlled syncing over a segment of
* a file in the range offset . . ( offset + nbytes - 1 ) inclusive . If nbytes is
* zero then sys_sync_file_range ( ) will operate from offset out to EOF .
*
* The flag bits are :
*
* SYNC_FILE_RANGE_WAIT_BEFORE : wait upon writeout of all pages in the range
* before performing the write .
*
* SYNC_FILE_RANGE_WRITE : initiate writeout of all those dirty pages in the
2008-07-24 08:27:36 +04:00
* range which are not presently under writeback . Note that this may block for
* significant periods due to exhaustion of disk request structures .
2006-03-31 14:30:42 +04:00
*
* SYNC_FILE_RANGE_WAIT_AFTER : wait upon writeout of all pages in the range
* after performing the write .
*
* Useful combinations of the flag bits are :
*
* SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE : ensures that all pages
* in the range which were dirty on entry to sys_sync_file_range ( ) are placed
* under writeout . This is a start - write - for - data - integrity operation .
*
* SYNC_FILE_RANGE_WRITE : start writeout of all dirty pages in the range which
* are not presently under writeout . This is an asynchronous flush - to - disk
* operation . Not suitable for data integrity operations .
*
* SYNC_FILE_RANGE_WAIT_BEFORE ( or SYNC_FILE_RANGE_WAIT_AFTER ) : wait for
* completion of writeout of all pages in the range . This will be used after an
* earlier SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE operation to wait
* for that operation to complete and to return the result .
*
* SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER :
* a traditional sync ( ) operation . This is a write - for - data - integrity operation
* which will ensure that all pages in the range which were dirty on entry to
* sys_sync_file_range ( ) are committed to disk .
*
*
* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
* I / O errors or ENOSPC conditions and will return those to the caller , after
* clearing the EIO and ENOSPC flags in the address_space .
*
* It should be noted that none of these operations write out the file ' s
* metadata . So unless the application is strictly performing overwrites of
* already - instantiated disk blocks , there are no guarantees here that the data
* will be available after a crash .
*/
2009-01-14 16:14:02 +03:00
SYSCALL_DEFINE ( sync_file_range ) ( int fd , loff_t offset , loff_t nbytes ,
unsigned int flags )
2006-03-31 14:30:42 +04:00
{
int ret ;
struct file * file ;
loff_t endbyte ; /* inclusive */
int fput_needed ;
umode_t i_mode ;
ret = - EINVAL ;
if ( flags & ~ VALID_FLAGS )
goto out ;
endbyte = offset + nbytes ;
if ( ( s64 ) offset < 0 )
goto out ;
if ( ( s64 ) endbyte < 0 )
goto out ;
if ( endbyte < offset )
goto out ;
if ( sizeof ( pgoff_t ) = = 4 ) {
if ( offset > = ( 0x100000000ULL < < PAGE_CACHE_SHIFT ) ) {
/*
* The range starts outside a 32 bit machine ' s
* pagecache addressing capabilities . Let it " succeed "
*/
ret = 0 ;
goto out ;
}
if ( endbyte > = ( 0x100000000ULL < < PAGE_CACHE_SHIFT ) ) {
/*
* Out to EOF
*/
nbytes = 0 ;
}
}
if ( nbytes = = 0 )
[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request. Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".
To make all this sane, the patch changes range of writeback_control.
So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.
And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.
This patch does,
- Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
-1 is usually ok for range_end (type is long long). But, if someone did,
range_end += val; range_end is "val - 1"
u64val = range_end >> bits; u64val is "~(0ULL)"
or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
things, and uses LLONG_MAX for range_end.
- All callers of ->writepages() sets range_start/end or range_cyclic.
- Fix updates of ->writeback_index. It seems already bit strange.
If it starts at 0 and ended by check of nr_to_write, this last
index may reduce chance to scan end of file. So, this updates
->writeback_index only if range_cyclic is true or whole-file is
scanned.
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:26 +04:00
endbyte = LLONG_MAX ;
2006-03-31 14:30:42 +04:00
else
endbyte - - ; /* inclusive */
ret = - EBADF ;
file = fget_light ( fd , & fput_needed ) ;
if ( ! file )
goto out ;
2006-12-08 13:36:35 +03:00
i_mode = file - > f_path . dentry - > d_inode - > i_mode ;
2006-03-31 14:30:42 +04:00
ret = - ESPIPE ;
if ( ! S_ISREG ( i_mode ) & & ! S_ISBLK ( i_mode ) & & ! S_ISDIR ( i_mode ) & &
! S_ISLNK ( i_mode ) )
goto out_put ;
2007-05-08 11:27:10 +04:00
ret = do_sync_mapping_range ( file - > f_mapping , offset , endbyte , flags ) ;
2006-03-31 14:30:42 +04:00
out_put :
fput_light ( file , fput_needed ) ;
out :
return ret ;
}
2009-01-14 16:14:02 +03:00
# ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Wrapper built only when CONFIG_HAVE_SYSCALL_WRAPPERS is defined.  It takes
 * fd and flags as long, narrows them to int and unsigned int respectively,
 * and forwards everything to SYSC_sync_file_range().
 */
asmlinkage long SyS_sync_file_range ( long fd , loff_t offset , loff_t nbytes ,
long flags )
{
return SYSC_sync_file_range ( ( int ) fd , offset , nbytes ,
( unsigned int ) flags ) ;
}
/* NOTE(review): presumably exposes this wrapper under the sys_sync_file_range name - confirm against SYSCALL_ALIAS */
SYSCALL_ALIAS ( sys_sync_file_range , SyS_sync_file_range ) ;
# endif
2006-03-31 14:30:42 +04:00
Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM
Not all the world is an i386. Many architectures need 64-bit arguments to be
aligned in suitable pairs of registers, and the original
sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an
argument register for padding after the first integer. Since we don't
normally have more than 6 arguments for system calls, that left no room for
the final argument on some architectures.
Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which
all fits nicely. In fact, ARM already had that, but called it
sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement
the needed compatibility routine. And stop the missing syscall check from
bitching about the absence of sys_sync_file_range() if we've implemented
sys_sync_file_range2() instead.
Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64.
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-28 01:10:09 +04:00
/* It would be nice if people remember that not all the world's an i386
when they introduce new system calls */
2009-01-14 16:14:02 +03:00
SYSCALL_DEFINE ( sync_file_range2 ) ( int fd , unsigned int flags ,
loff_t offset , loff_t nbytes )
Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM
Not all the world is an i386. Many architectures need 64-bit arguments to be
aligned in suitable pairs of registers, and the original
sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an
argument register for padding after the first integer. Since we don't
normally have more than 6 arguments for system calls, that left no room for
the final argument on some architectures.
Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which
all fits nicely. In fact, ARM already had that, but called it
sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement
the needed compatibility routine. And stop the missing syscall check from
bitching about the absence of sys_sync_file_range() if we've implemented
sys_sync_file_range2() instead.
Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64.
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-28 01:10:09 +04:00
{
return sys_sync_file_range ( fd , offset , nbytes , flags ) ;
}
2009-01-14 16:14:02 +03:00
# ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
/*
 * Wrapper built only when CONFIG_HAVE_SYSCALL_WRAPPERS is defined.  It takes
 * fd and flags as long, narrows them to int and unsigned int respectively,
 * and forwards everything to SYSC_sync_file_range2().
 */
asmlinkage long SyS_sync_file_range2 ( long fd , long flags ,
loff_t offset , loff_t nbytes )
{
return SYSC_sync_file_range2 ( ( int ) fd , ( unsigned int ) flags ,
offset , nbytes ) ;
}
/* NOTE(review): presumably exposes this wrapper under the sys_sync_file_range2 name - confirm against SYSCALL_ALIAS */
SYSCALL_ALIAS ( sys_sync_file_range2 , SyS_sync_file_range2 ) ;
# endif
Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM
Not all the world is an i386. Many architectures need 64-bit arguments to be
aligned in suitable pairs of registers, and the original
sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an
argument register for padding after the first integer. Since we don't
normally have more than 6 arguments for system calls, that left no room for
the final argument on some architectures.
Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which
all fits nicely. In fact, ARM already had that, but called it
sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement
the needed compatibility routine. And stop the missing syscall check from
bitching about the absence of sys_sync_file_range() if we've implemented
sys_sync_file_range2() instead.
Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64.
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-28 01:10:09 +04:00
2006-03-31 14:30:42 +04:00
/*
* ` endbyte ' is inclusive
*/
2007-03-01 22:01:55 +03:00
int do_sync_mapping_range ( struct address_space * mapping , loff_t offset ,
loff_t endbyte , unsigned int flags )
2006-03-31 14:30:42 +04:00
{
int ret ;
if ( ! mapping ) {
ret = - EINVAL ;
goto out ;
}
ret = 0 ;
if ( flags & SYNC_FILE_RANGE_WAIT_BEFORE ) {
ret = wait_on_page_writeback_range ( mapping ,
offset > > PAGE_CACHE_SHIFT ,
endbyte > > PAGE_CACHE_SHIFT ) ;
if ( ret < 0 )
goto out ;
}
if ( flags & SYNC_FILE_RANGE_WRITE ) {
ret = __filemap_fdatawrite_range ( mapping , offset , endbyte ,
2009-01-07 01:39:12 +03:00
WB_SYNC_ALL ) ;
2006-03-31 14:30:42 +04:00
if ( ret < 0 )
goto out ;
}
if ( flags & SYNC_FILE_RANGE_WAIT_AFTER ) {
ret = wait_on_page_writeback_range ( mapping ,
offset > > PAGE_CACHE_SHIFT ,
endbyte > > PAGE_CACHE_SHIFT ) ;
}
out :
return ret ;
}
2007-03-01 22:01:55 +03:00
EXPORT_SYMBOL_GPL ( do_sync_mapping_range ) ;