2007-06-12 17:07:21 +04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public
* License along with this program ; if not , write to the
* Free Software Foundation , Inc . , 59 Temple Place - Suite 330 ,
* Boston , MA 02111 - 1307 , USA .
*/
2007-06-12 14:35:45 +04:00
# include <linux/fs.h>
# include <linux/pagemap.h>
# include <linux/highmem.h>
# include <linux/time.h>
# include <linux/init.h>
# include <linux/string.h>
# include <linux/backing-dev.h>
# include <linux/mpage.h>
2011-01-14 15:07:43 +03:00
# include <linux/falloc.h>
2007-06-12 14:35:45 +04:00
# include <linux/swap.h>
# include <linux/writeback.h>
# include <linux/statfs.h>
# include <linux/compat.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2013-01-29 10:04:50 +04:00
# include <linux/btrfs.h>
2015-02-22 19:58:50 +03:00
# include <linux/uio.h>
2007-06-12 14:35:45 +04:00
# include "ctree.h"
# include "disk-io.h"
# include "transaction.h"
# include "btrfs_inode.h"
# include "print-tree.h"
2008-09-06 00:13:11 +04:00
# include "tree-log.h"
# include "locking.h"
2012-08-29 22:27:18 +04:00
# include "volumes.h"
2014-05-14 04:30:47 +04:00
# include "qgroup.h"
2007-06-12 14:35:45 +04:00
2012-11-26 13:24:43 +04:00
/* slab cache used below to allocate and free struct inode_defrag records */
static struct kmem_cache * btrfs_inode_defrag_cachep ;
2011-05-24 23:35:30 +04:00
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
* inodes need defragging passes
*/
struct inode_defrag {
/* links this record into fs_info->defrag_inodes */
struct rb_node rb_node ;
/* inode number of the inode to defrag ( filled from btrfs_ino ( ) ) */
u64 ino ;
/*
* transid recorded when the defrag was queued ; it is handed to
* btrfs_defrag_file ( ) so that extents newer than this are searched for
*/
u64 transid ;
/* objectid of the root that owns the inode ( root_key . objectid ) */
u64 root ;
/* file offset at which the next defrag pass starts */
u64 last_offset ;
/* set to 1 once a pass has wrapped around to offset zero */
int cycled ;
} ;
2012-05-24 14:58:27 +04:00
static int __compare_inode_defrag ( struct inode_defrag * defrag1 ,
struct inode_defrag * defrag2 )
{
if ( defrag1 - > root > defrag2 - > root )
return 1 ;
else if ( defrag1 - > root < defrag2 - > root )
return - 1 ;
else if ( defrag1 - > ino > defrag2 - > ino )
return 1 ;
else if ( defrag1 - > ino < defrag2 - > ino )
return - 1 ;
else
return 0 ;
}
2011-05-24 23:35:30 +04:00
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
* If you ' re inserting a record for an older transid than an
* existing record , the transid already in the tree is lowered
*
* If an existing record is found the defrag item you
* pass in is freed
*/
2012-11-26 13:25:38 +04:00
static int __btrfs_add_inode_defrag ( struct inode * inode ,
2011-05-24 23:35:30 +04:00
struct inode_defrag * defrag )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct inode_defrag * entry ;
struct rb_node * * p ;
struct rb_node * parent = NULL ;
2012-05-24 14:58:27 +04:00
int ret ;
2011-05-24 23:35:30 +04:00
p = & root - > fs_info - > defrag_inodes . rb_node ;
while ( * p ) {
parent = * p ;
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-05-24 14:58:27 +04:00
ret = __compare_inode_defrag ( defrag , entry ) ;
if ( ret < 0 )
2011-05-24 23:35:30 +04:00
p = & parent - > rb_left ;
2012-05-24 14:58:27 +04:00
else if ( ret > 0 )
2011-05-24 23:35:30 +04:00
p = & parent - > rb_right ;
else {
/* if we're reinserting an entry for
* an old defrag run , make sure to
* lower the transid of our existing record
*/
if ( defrag - > transid < entry - > transid )
entry - > transid = defrag - > transid ;
if ( defrag - > last_offset > entry - > last_offset )
entry - > last_offset = defrag - > last_offset ;
2012-11-26 13:25:38 +04:00
return - EEXIST ;
2011-05-24 23:35:30 +04:00
}
}
2012-05-23 22:13:11 +04:00
set_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ;
2011-05-24 23:35:30 +04:00
rb_link_node ( & defrag - > rb_node , parent , p ) ;
rb_insert_color ( & defrag - > rb_node , & root - > fs_info - > defrag_inodes ) ;
2012-11-26 13:25:38 +04:00
return 0 ;
}
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
static inline int __need_auto_defrag ( struct btrfs_root * root )
{
if ( ! btrfs_test_opt ( root , AUTO_DEFRAG ) )
return 0 ;
if ( btrfs_fs_closing ( root - > fs_info ) )
return 0 ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
return 1 ;
2011-05-24 23:35:30 +04:00
}
/*
* insert a defrag record for this inode if auto defrag is
* enabled
*/
int btrfs_add_inode_defrag ( struct btrfs_trans_handle * trans ,
struct inode * inode )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct inode_defrag * defrag ;
u64 transid ;
2012-11-26 13:25:38 +04:00
int ret ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
if ( ! __need_auto_defrag ( root ) )
2011-05-24 23:35:30 +04:00
return 0 ;
2012-05-23 22:13:11 +04:00
if ( test_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) )
2011-05-24 23:35:30 +04:00
return 0 ;
if ( trans )
transid = trans - > transid ;
else
transid = BTRFS_I ( inode ) - > root - > last_trans ;
2012-11-26 13:24:43 +04:00
defrag = kmem_cache_zalloc ( btrfs_inode_defrag_cachep , GFP_NOFS ) ;
2011-05-24 23:35:30 +04:00
if ( ! defrag )
return - ENOMEM ;
2011-05-31 21:08:14 +04:00
defrag - > ino = btrfs_ino ( inode ) ;
2011-05-24 23:35:30 +04:00
defrag - > transid = transid ;
defrag - > root = root - > root_key . objectid ;
spin_lock ( & root - > fs_info - > defrag_inodes_lock ) ;
2012-11-26 13:25:38 +04:00
if ( ! test_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ) {
/*
* If we set IN_DEFRAG flag and evict the inode from memory ,
* and then re - read this inode , this new inode doesn ' t have
* IN_DEFRAG flag . At the case , we may find the existed defrag .
*/
ret = __btrfs_add_inode_defrag ( inode , defrag ) ;
if ( ret )
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
} else {
2012-11-26 13:24:43 +04:00
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
2012-11-26 13:25:38 +04:00
}
2011-05-24 23:35:30 +04:00
spin_unlock ( & root - > fs_info - > defrag_inodes_lock ) ;
2011-07-18 16:19:35 +04:00
return 0 ;
2011-05-24 23:35:30 +04:00
}
/*
2012-11-26 13:25:38 +04:00
* Requeue the defrag object . If there is a defrag object that points to
* the same inode in the tree , we will merge them together ( by
* __btrfs_add_inode_defrag ( ) ) and free the one that we want to requeue .
2011-05-24 23:35:30 +04:00
*/
2013-04-26 00:41:01 +04:00
static void btrfs_requeue_inode_defrag ( struct inode * inode ,
struct inode_defrag * defrag )
2012-11-26 13:25:38 +04:00
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
int ret ;
if ( ! __need_auto_defrag ( root ) )
goto out ;
/*
* Here we don ' t check the IN_DEFRAG flag , because we need merge
* them together .
*/
spin_lock ( & root - > fs_info - > defrag_inodes_lock ) ;
ret = __btrfs_add_inode_defrag ( inode , defrag ) ;
spin_unlock ( & root - > fs_info - > defrag_inodes_lock ) ;
if ( ret )
goto out ;
return ;
out :
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
}
2011-05-24 23:35:30 +04:00
/*
2012-11-26 13:26:20 +04:00
* pick the defragable inode that we want , if it doesn ' t exist , we will get
* the next one .
2011-05-24 23:35:30 +04:00
*/
2012-11-26 13:26:20 +04:00
static struct inode_defrag *
btrfs_pick_defrag_inode ( struct btrfs_fs_info * fs_info , u64 root , u64 ino )
2011-05-24 23:35:30 +04:00
{
struct inode_defrag * entry = NULL ;
2012-05-24 14:58:27 +04:00
struct inode_defrag tmp ;
2011-05-24 23:35:30 +04:00
struct rb_node * p ;
struct rb_node * parent = NULL ;
2012-05-24 14:58:27 +04:00
int ret ;
tmp . ino = ino ;
tmp . root = root ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
spin_lock ( & fs_info - > defrag_inodes_lock ) ;
p = fs_info - > defrag_inodes . rb_node ;
2011-05-24 23:35:30 +04:00
while ( p ) {
parent = p ;
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-05-24 14:58:27 +04:00
ret = __compare_inode_defrag ( & tmp , entry ) ;
if ( ret < 0 )
2011-05-24 23:35:30 +04:00
p = parent - > rb_left ;
2012-05-24 14:58:27 +04:00
else if ( ret > 0 )
2011-05-24 23:35:30 +04:00
p = parent - > rb_right ;
else
2012-11-26 13:26:20 +04:00
goto out ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
if ( parent & & __compare_inode_defrag ( & tmp , entry ) > 0 ) {
parent = rb_next ( parent ) ;
if ( parent )
2011-05-24 23:35:30 +04:00
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-11-26 13:26:20 +04:00
else
entry = NULL ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
out :
if ( entry )
rb_erase ( parent , & fs_info - > defrag_inodes ) ;
spin_unlock ( & fs_info - > defrag_inodes_lock ) ;
return entry ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
void btrfs_cleanup_defrag_inodes ( struct btrfs_fs_info * fs_info )
2011-05-24 23:35:30 +04:00
{
struct inode_defrag * defrag ;
2012-11-26 13:26:20 +04:00
struct rb_node * node ;
spin_lock ( & fs_info - > defrag_inodes_lock ) ;
node = rb_first ( & fs_info - > defrag_inodes ) ;
while ( node ) {
rb_erase ( node , & fs_info - > defrag_inodes ) ;
defrag = rb_entry ( node , struct inode_defrag , rb_node ) ;
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
2015-01-08 17:20:54 +03:00
cond_resched_lock ( & fs_info - > defrag_inodes_lock ) ;
2012-11-26 13:26:20 +04:00
node = rb_first ( & fs_info - > defrag_inodes ) ;
}
spin_unlock ( & fs_info - > defrag_inodes_lock ) ;
}
/*
* batch limit handed to btrfs_defrag_file ( ) for one auto defrag pass ; a
* pass that returns exactly this value gets the inode queued again
*/
# define BTRFS_DEFRAG_BATCH 1024
static int __btrfs_run_defrag_inode ( struct btrfs_fs_info * fs_info ,
struct inode_defrag * defrag )
{
2011-05-24 23:35:30 +04:00
struct btrfs_root * inode_root ;
struct inode * inode ;
struct btrfs_key key ;
struct btrfs_ioctl_defrag_range_args range ;
int num_defrag ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
int index ;
int ret ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
/* get the inode */
key . objectid = defrag - > root ;
2014-06-04 20:41:45 +04:00
key . type = BTRFS_ROOT_ITEM_KEY ;
2012-11-26 13:26:20 +04:00
key . offset = ( u64 ) - 1 ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
index = srcu_read_lock ( & fs_info - > subvol_srcu ) ;
2012-11-26 13:26:20 +04:00
inode_root = btrfs_read_fs_root_no_name ( fs_info , & key ) ;
if ( IS_ERR ( inode_root ) ) {
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
ret = PTR_ERR ( inode_root ) ;
goto cleanup ;
}
2012-11-26 13:26:20 +04:00
key . objectid = defrag - > ino ;
2014-06-04 20:41:45 +04:00
key . type = BTRFS_INODE_ITEM_KEY ;
2012-11-26 13:26:20 +04:00
key . offset = 0 ;
inode = btrfs_iget ( fs_info - > sb , & key , inode_root , NULL ) ;
if ( IS_ERR ( inode ) ) {
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
ret = PTR_ERR ( inode ) ;
goto cleanup ;
2012-11-26 13:26:20 +04:00
}
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
srcu_read_unlock ( & fs_info - > subvol_srcu , index ) ;
2012-11-26 13:26:20 +04:00
/* do a chunk of defrag */
clear_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ;
2011-05-24 23:35:30 +04:00
memset ( & range , 0 , sizeof ( range ) ) ;
range . len = ( u64 ) - 1 ;
2012-11-26 13:26:20 +04:00
range . start = defrag - > last_offset ;
2012-11-26 13:27:29 +04:00
sb_start_write ( fs_info - > sb ) ;
2012-11-26 13:26:20 +04:00
num_defrag = btrfs_defrag_file ( inode , NULL , & range , defrag - > transid ,
BTRFS_DEFRAG_BATCH ) ;
2012-11-26 13:27:29 +04:00
sb_end_write ( fs_info - > sb ) ;
2012-11-26 13:26:20 +04:00
/*
* if we filled the whole defrag batch , there
* must be more work to do . Queue this defrag
* again
*/
if ( num_defrag = = BTRFS_DEFRAG_BATCH ) {
defrag - > last_offset = range . start ;
btrfs_requeue_inode_defrag ( inode , defrag ) ;
} else if ( defrag - > last_offset & & ! defrag - > cycled ) {
/*
* we didn ' t fill our defrag batch , but
* we didn ' t start at zero . Make sure we loop
* around to the start of the file .
*/
defrag - > last_offset = 0 ;
defrag - > cycled = 1 ;
btrfs_requeue_inode_defrag ( inode , defrag ) ;
} else {
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
}
iput ( inode ) ;
return 0 ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
cleanup :
srcu_read_unlock ( & fs_info - > subvol_srcu , index ) ;
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
return ret ;
2012-11-26 13:26:20 +04:00
}
/*
* run through the list of inodes in the FS that need
* defragging
*/
int btrfs_run_defrag_inodes ( struct btrfs_fs_info * fs_info )
{
struct inode_defrag * defrag ;
u64 first_ino = 0 ;
u64 root_objectid = 0 ;
2011-05-24 23:35:30 +04:00
atomic_inc ( & fs_info - > defrag_running ) ;
2013-10-31 09:03:04 +04:00
while ( 1 ) {
2013-02-21 10:32:52 +04:00
/* Pause the auto defragger. */
if ( test_bit ( BTRFS_FS_STATE_REMOUNTING ,
& fs_info - > fs_state ) )
break ;
2012-11-26 13:26:20 +04:00
if ( ! __need_auto_defrag ( fs_info - > tree_root ) )
break ;
2011-05-24 23:35:30 +04:00
/* find an inode to defrag */
2012-11-26 13:26:20 +04:00
defrag = btrfs_pick_defrag_inode ( fs_info , root_objectid ,
first_ino ) ;
2011-05-24 23:35:30 +04:00
if ( ! defrag ) {
2012-11-26 13:26:20 +04:00
if ( root_objectid | | first_ino ) {
2012-05-24 14:58:27 +04:00
root_objectid = 0 ;
2011-05-24 23:35:30 +04:00
first_ino = 0 ;
continue ;
} else {
break ;
}
}
first_ino = defrag - > ino + 1 ;
2012-05-24 14:58:27 +04:00
root_objectid = defrag - > root ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
__btrfs_run_defrag_inode ( fs_info , defrag ) ;
2011-05-24 23:35:30 +04:00
}
atomic_dec ( & fs_info - > defrag_running ) ;
/*
* during unmount , we use the transaction_wait queue to
* wait for the defragger to stop
*/
wake_up ( & fs_info - > transaction_wait ) ;
return 0 ;
}
2007-06-12 14:35:45 +04:00
2008-09-29 23:18:18 +04:00
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code .
*/
2009-01-06 05:25:51 +03:00
static noinline int btrfs_copy_from_user ( loff_t pos , int num_pages ,
2011-01-25 22:57:24 +03:00
size_t write_bytes ,
2008-09-06 00:09:51 +04:00
struct page * * prepared_pages ,
2010-05-23 19:07:21 +04:00
struct iov_iter * i )
2007-06-12 14:35:45 +04:00
{
2010-12-09 12:30:14 +03:00
size_t copied = 0 ;
2011-01-25 22:57:24 +03:00
size_t total_copied = 0 ;
2010-05-23 19:07:21 +04:00
int pg = 0 ;
2007-06-12 14:35:45 +04:00
int offset = pos & ( PAGE_CACHE_SIZE - 1 ) ;
2010-05-23 19:07:21 +04:00
while ( write_bytes > 0 ) {
2007-06-12 14:35:45 +04:00
size_t count = min_t ( size_t ,
PAGE_CACHE_SIZE - offset , write_bytes ) ;
2010-05-23 19:07:21 +04:00
struct page * page = prepared_pages [ pg ] ;
2010-12-09 12:30:14 +03:00
/*
* Copy data from userspace to the current page
*/
copied = iov_iter_copy_from_user_atomic ( page , i , offset , count ) ;
2010-05-23 19:07:21 +04:00
2007-06-12 14:35:45 +04:00
/* Flush processor's dcache for this page */
flush_dcache_page ( page ) ;
2011-03-07 19:10:24 +03:00
/*
* if we get a partial write , we can end up with
* partially up to date pages . These add
* a lot of complexity , so make sure they don ' t
* happen by forcing this copy to be retried .
*
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
if ( ! PageUptodate ( page ) & & copied < count )
copied = 0 ;
2010-05-23 19:07:21 +04:00
iov_iter_advance ( i , copied ) ;
write_bytes - = copied ;
2010-12-09 12:30:14 +03:00
total_copied + = copied ;
2007-06-12 14:35:45 +04:00
2014-04-03 22:29:04 +04:00
/* Return to btrfs_file_write_iter to fault page */
2011-01-25 20:42:37 +03:00
if ( unlikely ( copied = = 0 ) )
2010-12-09 12:30:14 +03:00
break ;
2010-05-23 19:07:21 +04:00
2014-09-30 03:33:33 +04:00
if ( copied < PAGE_CACHE_SIZE - offset ) {
2010-05-23 19:07:21 +04:00
offset + = copied ;
} else {
pg + + ;
offset = 0 ;
}
2007-06-12 14:35:45 +04:00
}
2010-12-09 12:30:14 +03:00
return total_copied ;
2007-06-12 14:35:45 +04:00
}
2008-09-29 23:18:18 +04:00
/*
* unlocks pages after btrfs_file_write is done with them
*/
2013-04-26 00:41:01 +04:00
static void btrfs_drop_pages ( struct page * * pages , size_t num_pages )
2007-06-12 14:35:45 +04:00
{
size_t i ;
for ( i = 0 ; i < num_pages ; i + + ) {
2008-09-29 23:18:18 +04:00
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
2014-06-05 03:10:31 +04:00
* clear it here . There should be no need to mark the pages
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page ( )
2008-09-29 23:18:18 +04:00
*/
2008-07-21 18:29:44 +04:00
ClearPageChecked ( pages [ i ] ) ;
2007-06-12 14:35:45 +04:00
unlock_page ( pages [ i ] ) ;
page_cache_release ( pages [ i ] ) ;
}
}
2008-09-29 23:18:18 +04:00
/*
* after copy_from_user , pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
* any next extents ( if required ) .
*
* this also makes the decision about creating an inline extent vs
* doing real data extents , marking pages dirty and delalloc as required .
*/
2011-04-06 21:05:22 +04:00
int btrfs_dirty_pages ( struct btrfs_root * root , struct inode * inode ,
2013-04-26 00:41:01 +04:00
struct page * * pages , size_t num_pages ,
loff_t pos , size_t write_bytes ,
struct extent_state * * cached )
2007-06-12 14:35:45 +04:00
{
int err = 0 ;
2007-08-28 00:49:44 +04:00
int i ;
2007-10-16 00:15:53 +04:00
u64 num_bytes ;
2007-08-28 00:49:44 +04:00
u64 start_pos ;
u64 end_of_last_block ;
u64 end_pos = pos + write_bytes ;
loff_t isize = i_size_read ( inode ) ;
2007-06-12 14:35:45 +04:00
2007-10-16 00:14:19 +04:00
start_pos = pos & ~ ( ( u64 ) root - > sectorsize - 1 ) ;
2013-02-26 12:10:22 +04:00
num_bytes = ALIGN ( write_bytes + pos - start_pos , root - > sectorsize ) ;
2007-06-12 14:35:45 +04:00
2007-10-16 00:15:53 +04:00
end_of_last_block = start_pos + num_bytes - 1 ;
2010-02-03 22:33:23 +03:00
err = btrfs_set_extent_delalloc ( inode , start_pos , end_of_last_block ,
2011-04-06 21:05:22 +04:00
cached ) ;
2011-01-25 22:57:24 +03:00
if ( err )
return err ;
2009-09-12 00:12:44 +04:00
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
for ( i = 0 ; i < num_pages ; i + + ) {
struct page * p = pages [ i ] ;
SetPageUptodate ( p ) ;
ClearPageChecked ( p ) ;
set_page_dirty ( p ) ;
2007-08-28 00:49:44 +04:00
}
2011-01-25 20:42:37 +03:00
/*
* we ' ve only changed i_size in ram , and we haven ' t updated
* the disk i_size . There is no need to log the inode
* at this time .
*/
if ( end_pos > isize )
2007-08-28 00:49:44 +04:00
i_size_write ( inode , end_pos ) ;
2010-05-16 18:48:46 +04:00
return 0 ;
2007-06-12 14:35:45 +04:00
}
2008-09-29 23:18:18 +04:00
/*
* this drops all the extents in the cache that intersect the range
* [ start , end ] . Existing extents are split as required .
*/
2012-08-31 04:06:49 +04:00
void btrfs_drop_extent_cache ( struct inode * inode , u64 start , u64 end ,
int skip_pinned )
2007-08-28 00:49:44 +04:00
{
struct extent_map * em ;
2008-04-17 19:29:12 +04:00
struct extent_map * split = NULL ;
struct extent_map * split2 = NULL ;
2007-08-28 00:49:44 +04:00
struct extent_map_tree * em_tree = & BTRFS_I ( inode ) - > extent_tree ;
2008-02-15 18:40:50 +03:00
u64 len = end - start + 1 ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
u64 gen ;
2008-04-17 19:29:12 +04:00
int ret ;
int testend = 1 ;
2008-09-26 18:05:38 +04:00
unsigned long flags ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
int compressed = 0 ;
2013-04-06 00:51:15 +04:00
bool modified ;
2007-08-28 00:49:44 +04:00
2008-07-17 20:53:50 +04:00
WARN_ON ( end < start ) ;
2008-04-17 19:29:12 +04:00
if ( end = = ( u64 ) - 1 ) {
2008-02-15 18:40:50 +03:00
len = ( u64 ) - 1 ;
2008-04-17 19:29:12 +04:00
testend = 0 ;
}
2009-01-06 05:25:51 +03:00
while ( 1 ) {
2012-08-31 04:06:49 +04:00
int no_splits = 0 ;
2013-04-06 00:51:15 +04:00
modified = false ;
2008-04-17 19:29:12 +04:00
if ( ! split )
2011-04-21 02:48:27 +04:00
split = alloc_extent_map ( ) ;
2008-04-17 19:29:12 +04:00
if ( ! split2 )
2011-04-21 02:48:27 +04:00
split2 = alloc_extent_map ( ) ;
2012-08-31 04:06:49 +04:00
if ( ! split | | ! split2 )
no_splits = 1 ;
2008-04-17 19:29:12 +04:00
2009-09-03 00:24:52 +04:00
write_lock ( & em_tree - > lock ) ;
2008-02-15 18:40:50 +03:00
em = lookup_extent_mapping ( em_tree , start , len ) ;
2008-01-25 00:13:08 +03:00
if ( ! em ) {
2009-09-03 00:24:52 +04:00
write_unlock ( & em_tree - > lock ) ;
2007-08-28 00:49:44 +04:00
break ;
2008-01-25 00:13:08 +03:00
}
2008-09-26 18:05:38 +04:00
flags = em - > flags ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
gen = em - > generation ;
2008-09-26 18:05:38 +04:00
if ( skip_pinned & & test_bit ( EXTENT_FLAG_PINNED , & em - > flags ) ) {
2009-11-12 12:36:44 +03:00
if ( testend & & em - > start + em - > len > = start + len ) {
2008-09-26 18:05:38 +04:00
free_extent_map ( em ) ;
2009-09-11 20:27:37 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-09-26 18:05:38 +04:00
break ;
}
2009-11-12 12:36:44 +03:00
start = em - > start + em - > len ;
if ( testend )
2008-09-26 18:05:38 +04:00
len = start + len - ( em - > start + em - > len ) ;
free_extent_map ( em ) ;
2009-09-11 20:27:37 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-09-26 18:05:38 +04:00
continue ;
}
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
compressed = test_bit ( EXTENT_FLAG_COMPRESSED , & em - > flags ) ;
2008-07-31 23:42:54 +04:00
clear_bit ( EXTENT_FLAG_PINNED , & em - > flags ) ;
2013-03-15 18:46:39 +04:00
clear_bit ( EXTENT_FLAG_LOGGING , & flags ) ;
2013-04-06 00:51:15 +04:00
modified = ! list_empty ( & em - > list ) ;
2012-08-31 04:06:49 +04:00
if ( no_splits )
goto next ;
2008-04-17 19:29:12 +04:00
2013-07-11 18:34:59 +04:00
if ( em - > start < start ) {
2008-04-17 19:29:12 +04:00
split - > start = em - > start ;
split - > len = start - em - > start ;
2013-07-11 18:34:59 +04:00
if ( em - > block_start < EXTENT_MAP_LAST_BYTE ) {
split - > orig_start = em - > orig_start ;
split - > block_start = em - > block_start ;
if ( compressed )
split - > block_len = em - > block_len ;
else
split - > block_len = split - > len ;
split - > orig_block_len = max ( split - > block_len ,
em - > orig_block_len ) ;
split - > ram_bytes = em - > ram_bytes ;
} else {
split - > orig_start = split - > start ;
split - > block_len = 0 ;
split - > block_start = em - > block_start ;
split - > orig_block_len = 0 ;
split - > ram_bytes = split - > len ;
}
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
split - > generation = gen ;
2008-04-17 19:29:12 +04:00
split - > bdev = em - > bdev ;
2008-09-26 18:05:38 +04:00
split - > flags = flags ;
2010-12-17 09:21:50 +03:00
split - > compress_type = em - > compress_type ;
2014-02-25 18:15:13 +04:00
replace_extent_mapping ( em_tree , em , split , modified ) ;
2008-04-17 19:29:12 +04:00
free_extent_map ( split ) ;
split = split2 ;
split2 = NULL ;
}
2013-07-11 18:34:59 +04:00
if ( testend & & em - > start + em - > len > start + len ) {
2008-04-17 19:29:12 +04:00
u64 diff = start + len - em - > start ;
split - > start = start + len ;
split - > len = em - > start + em - > len - ( start + len ) ;
split - > bdev = em - > bdev ;
2008-09-26 18:05:38 +04:00
split - > flags = flags ;
2010-12-17 09:21:50 +03:00
split - > compress_type = em - > compress_type ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
split - > generation = gen ;
2013-07-11 18:34:59 +04:00
if ( em - > block_start < EXTENT_MAP_LAST_BYTE ) {
split - > orig_block_len = max ( em - > block_len ,
2012-12-03 19:31:19 +04:00
em - > orig_block_len ) ;
2008-04-17 19:29:12 +04:00
2013-07-11 18:34:59 +04:00
split - > ram_bytes = em - > ram_bytes ;
if ( compressed ) {
split - > block_len = em - > block_len ;
split - > block_start = em - > block_start ;
split - > orig_start = em - > orig_start ;
} else {
split - > block_len = split - > len ;
split - > block_start = em - > block_start
+ diff ;
split - > orig_start = em - > orig_start ;
}
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
} else {
2013-07-11 18:34:59 +04:00
split - > ram_bytes = split - > len ;
split - > orig_start = split - > start ;
split - > block_len = 0 ;
split - > block_start = em - > block_start ;
split - > orig_block_len = 0 ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
}
2008-04-17 19:29:12 +04:00
2014-02-25 18:15:13 +04:00
if ( extent_map_in_tree ( em ) ) {
replace_extent_mapping ( em_tree , em , split ,
modified ) ;
} else {
ret = add_extent_mapping ( em_tree , split ,
modified ) ;
ASSERT ( ret = = 0 ) ; /* Logic error */
}
2008-04-17 19:29:12 +04:00
free_extent_map ( split ) ;
split = NULL ;
}
2012-08-31 04:06:49 +04:00
next :
2014-02-25 18:15:13 +04:00
if ( extent_map_in_tree ( em ) )
remove_extent_mapping ( em_tree , em ) ;
2009-09-03 00:24:52 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-01-25 00:13:08 +03:00
2007-08-28 00:49:44 +04:00
/* once for us */
free_extent_map ( em ) ;
/* once for the tree*/
free_extent_map ( em ) ;
}
2008-04-17 19:29:12 +04:00
if ( split )
free_extent_map ( split ) ;
if ( split2 )
free_extent_map ( split2 ) ;
2007-08-28 00:49:44 +04:00
}
2007-06-12 14:35:45 +04:00
/*
* this is very complex , but the basic idea is to drop all extents
* in the range start - end . hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file .
*
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split . Anything entirely inside the range
* is deleted from the tree .
*/
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
int __btrfs_drop_extents ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root , struct inode * inode ,
struct btrfs_path * path , u64 start , u64 end ,
2014-01-07 15:42:27 +04:00
u64 * drop_end , int drop_cache ,
int replace_extent ,
u32 extent_item_size ,
int * key_inserted )
2007-06-12 14:35:45 +04:00
{
2007-10-16 00:14:19 +04:00
struct extent_buffer * leaf ;
2009-11-12 12:34:08 +03:00
struct btrfs_file_extent_item * fi ;
2007-11-30 18:09:33 +03:00
struct btrfs_key key ;
2009-11-12 12:34:08 +03:00
struct btrfs_key new_key ;
2011-04-20 06:31:50 +04:00
u64 ino = btrfs_ino ( inode ) ;
2009-11-12 12:34:08 +03:00
u64 search_start = start ;
u64 disk_bytenr = 0 ;
u64 num_bytes = 0 ;
u64 extent_offset = 0 ;
u64 extent_end = 0 ;
int del_nr = 0 ;
int del_slot = 0 ;
int extent_type ;
2007-06-28 23:57:36 +04:00
int recow ;
2007-11-30 18:09:33 +03:00
int ret ;
2012-04-27 22:31:29 +04:00
int modify_tree = - 1 ;
2014-04-02 15:51:05 +04:00
int update_refs ;
2012-09-14 22:51:22 +04:00
int found = 0 ;
2014-01-07 15:42:27 +04:00
int leafs_visited = 0 ;
2007-06-12 14:35:45 +04:00
2009-09-11 20:27:37 +04:00
if ( drop_cache )
btrfs_drop_extent_cache ( inode , start , end - 1 , 0 ) ;
2007-08-28 00:49:44 +04:00
2014-02-10 03:45:12 +04:00
if ( start > = BTRFS_I ( inode ) - > disk_i_size & & ! replace_extent )
2012-04-27 22:31:29 +04:00
modify_tree = 0 ;
2014-04-02 15:51:05 +04:00
update_refs = ( test_bit ( BTRFS_ROOT_REF_COWS , & root - > state ) | |
root = = root - > fs_info - > tree_root ) ;
2009-01-06 05:25:51 +03:00
while ( 1 ) {
2007-06-28 23:57:36 +04:00
recow = 0 ;
2011-04-20 06:31:50 +04:00
ret = btrfs_lookup_file_extent ( trans , root , path , ino ,
2012-04-27 22:31:29 +04:00
search_start , modify_tree ) ;
2007-06-12 14:35:45 +04:00
if ( ret < 0 )
2009-11-12 12:34:08 +03:00
break ;
if ( ret > 0 & & path - > slots [ 0 ] > 0 & & search_start = = start ) {
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] - 1 ) ;
2011-04-20 06:31:50 +04:00
if ( key . objectid = = ino & &
2009-11-12 12:34:08 +03:00
key . type = = BTRFS_EXTENT_DATA_KEY )
path - > slots [ 0 ] - - ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
ret = 0 ;
2014-01-07 15:42:27 +04:00
leafs_visited + + ;
2007-06-18 17:57:58 +04:00
next_slot :
2007-10-16 00:14:19 +04:00
leaf = path - > nodes [ 0 ] ;
2009-11-12 12:34:08 +03:00
if ( path - > slots [ 0 ] > = btrfs_header_nritems ( leaf ) ) {
BUG_ON ( del_nr > 0 ) ;
ret = btrfs_next_leaf ( root , path ) ;
if ( ret < 0 )
break ;
if ( ret > 0 ) {
ret = 0 ;
break ;
2007-06-18 17:57:58 +04:00
}
2014-01-07 15:42:27 +04:00
leafs_visited + + ;
2009-11-12 12:34:08 +03:00
leaf = path - > nodes [ 0 ] ;
recow = 1 ;
}
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
2011-04-20 06:31:50 +04:00
if ( key . objectid > ino | |
2009-11-12 12:34:08 +03:00
key . type > BTRFS_EXTENT_DATA_KEY | | key . offset > = end )
break ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
extent_type = btrfs_file_extent_type ( leaf , fi ) ;
if ( extent_type = = BTRFS_FILE_EXTENT_REG | |
extent_type = = BTRFS_FILE_EXTENT_PREALLOC ) {
disk_bytenr = btrfs_file_extent_disk_bytenr ( leaf , fi ) ;
num_bytes = btrfs_file_extent_disk_num_bytes ( leaf , fi ) ;
extent_offset = btrfs_file_extent_offset ( leaf , fi ) ;
extent_end = key . offset +
btrfs_file_extent_num_bytes ( leaf , fi ) ;
} else if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
extent_end = key . offset +
2014-01-04 09:07:00 +04:00
btrfs_file_extent_inline_len ( leaf ,
path - > slots [ 0 ] , fi ) ;
2007-06-18 17:57:58 +04:00
} else {
2009-11-12 12:34:08 +03:00
WARN_ON ( 1 ) ;
2007-06-18 17:57:58 +04:00
extent_end = search_start ;
2007-06-12 14:35:45 +04:00
}
Btrfs: fix leaf corruption caused by ENOSPC while hole punching
While running a stress test with multiple threads writing to the same btrfs
file system, I ended up with a situation where a leaf was corrupted in that
it had 2 file extent item keys that had the same exact key. I was able to
detect this quickly thanks to the following patch which triggers an assertion
as soon as a leaf is marked dirty if there are duplicated keys or out of order
keys:
Btrfs: check if items are ordered when a leaf is marked dirty
(https://patchwork.kernel.org/patch/3955431/)
Basically while running the test, I got the following in dmesg:
[28877.415877] WARNING: CPU: 2 PID: 10706 at fs/btrfs/file.c:553 btrfs_drop_extent_cache+0x435/0x440 [btrfs]()
(...)
[28877.415917] Call Trace:
[28877.415922] [<ffffffff816f1189>] dump_stack+0x4e/0x68
[28877.415926] [<ffffffff8104a32c>] warn_slowpath_common+0x8c/0xc0
[28877.415929] [<ffffffff8104a37a>] warn_slowpath_null+0x1a/0x20
[28877.415944] [<ffffffffa03775a5>] btrfs_drop_extent_cache+0x435/0x440 [btrfs]
[28877.415949] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[28877.415962] [<ffffffffa03777d9>] fill_holes+0x229/0x3e0 [btrfs]
[28877.415972] [<ffffffffa0345865>] ? block_rsv_add_bytes+0x55/0x80 [btrfs]
[28877.415984] [<ffffffffa03792cb>] btrfs_fallocate+0xb6b/0xc20 [btrfs]
(...)
[29854.132560] BTRFS critical (device sdc): corrupt leaf, bad key order: block=955232256,root=1, slot=24
[29854.132565] BTRFS info (device sdc): leaf 955232256 total ptrs 40 free space 778
(...)
[29854.132637] item 23 key (3486 108 667648) itemoff 2694 itemsize 53
[29854.132638] extent data disk bytenr 14574411776 nr 286720
[29854.132639] extent data offset 0 nr 286720 ram 286720
[29854.132640] item 24 key (3486 108 954368) itemoff 2641 itemsize 53
[29854.132641] extent data disk bytenr 0 nr 0
[29854.132643] extent data offset 0 nr 0 ram 0
[29854.132644] item 25 key (3486 108 954368) itemoff 2588 itemsize 53
[29854.132645] extent data disk bytenr 8699670528 nr 77824
[29854.132646] extent data offset 0 nr 77824 ram 77824
[29854.132647] item 26 key (3486 108 1146880) itemoff 2535 itemsize 53
[29854.132648] extent data disk bytenr 8699670528 nr 77824
[29854.132649] extent data offset 0 nr 77824 ram 77824
(...)
[29854.132707] kernel BUG at fs/btrfs/ctree.h:3901!
(...)
[29854.132771] Call Trace:
[29854.132779] [<ffffffffa0342b5c>] setup_items_for_insert+0x2dc/0x400 [btrfs]
[29854.132791] [<ffffffffa0378537>] __btrfs_drop_extents+0xba7/0xdd0 [btrfs]
[29854.132794] [<ffffffff8109c0d6>] ? trace_hardirqs_on_caller+0x16/0x1d0
[29854.132797] [<ffffffff8109c29d>] ? trace_hardirqs_on+0xd/0x10
[29854.132800] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[29854.132810] [<ffffffffa036783b>] insert_reserved_file_extent.constprop.66+0xab/0x310 [btrfs]
[29854.132820] [<ffffffffa036a6c6>] __btrfs_prealloc_file_range+0x116/0x340 [btrfs]
[29854.132830] [<ffffffffa0374d53>] btrfs_prealloc_file_range+0x23/0x30 [btrfs]
(...)
So this is caused by getting an -ENOSPC error while punching a file hole, more
specifically, we get -ENOSPC error from __btrfs_drop_extents in the while loop
of file.c:btrfs_punch_hole() when it's unable to modify the btree to delete one
or more file extent items due to lack of enough free space. When this happens,
in btrfs_punch_hole(), we attempt to reclaim free space by switching our transaction
block reservation object to root->fs_info->trans_block_rsv, end our transaction and
start a new transaction basically - and, we keep increasing our current offset
(cur_offset) as long as it's smaller than the end of the target range (lockend) -
this makes us leave the loop with cur_offset == drop_end which in turn makes us
call fill_holes() for inserting a file extent item that represents a 0 bytes range
hole (and this insertion succeeds, as in the meanwhile more space became available).
This 0 bytes file hole extent item is a problem because any subsequent caller of
__btrfs_drop_extents (regular file writes, or fallocate calls for e.g.), with a
start file offset that is equal to the offset of the hole, will not remove this
extent item due to the following conditional in the while loop of
__btrfs_drop_extents:
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
}
This later makes the call to setup_items_for_insert() (at the very end of
__btrfs_drop_extents), insert a new file extent item with the same offset as
the 0 bytes file hole extent item that follows it. Needless to say, this
causes chaos, either when reading the leaf from disk (btree_readpage_end_io_hook),
where we perform leaf sanity checks or in subsequent operations that manipulate
file extent items, as in the fallocate call as shown by the dmesg trace above.
Without my other patch to perform the leaf sanity checks once a leaf is marked
as dirty (if the integrity checker is enabled), it would have been much harder
to debug this issue.
This change might fix a few similar issues reported by users in the mailing
list regarding assertion failures in btrfs_set_item_key_safe calls performed
by __btrfs_drop_extents, such as the following report:
http://comments.gmane.org/gmane.comp.file-systems.btrfs/32938
Asking fill_holes() to create a 0 bytes wide file hole item also produced the
first warning in the trace above, as we passed a range to btrfs_drop_extent_cache
that has an end smaller (by -1) than its start.
On 3.14 kernels this issue manifests itself through leaf corruption, as we get
duplicated file extent item keys in a leaf when calling setup_items_for_insert(),
but on older kernels, setup_items_for_insert() isn't called by __btrfs_drop_extents(),
instead we have callers of __btrfs_drop_extents(), namely the functions
inode.c:insert_inline_extent() and inode.c:insert_reserved_file_extent(), calling
btrfs_insert_empty_item() to insert the new file extent item, which would fail with
error -EEXIST, instead of inserting a duplicated key - which is still a serious
issue as it would make all similar file extent item replace operations keep
failing if they target the same file range.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-04-29 16:18:40 +04:00
/*
* Don ' t skip extent items representing 0 byte lengths . They
* used to be created ( bug ) if while punching holes we hit
* - ENOSPC condition . So if we find one here , just ensure we
* delete it , otherwise we would insert a new file extent item
* with the same key ( offset ) as that 0 bytes length file
* extent item in the call to setup_items_for_insert ( ) later
* in this function .
*/
if ( extent_end = = key . offset & & extent_end > = search_start )
goto delete_extent_item ;
2009-11-12 12:34:08 +03:00
if ( extent_end < = search_start ) {
path - > slots [ 0 ] + + ;
2007-06-18 17:57:58 +04:00
goto next_slot ;
2007-06-12 14:35:45 +04:00
}
2012-09-14 22:51:22 +04:00
found = 1 ;
2009-11-12 12:34:08 +03:00
search_start = max ( key . offset , start ) ;
2012-04-27 22:31:29 +04:00
if ( recow | | ! modify_tree ) {
modify_tree = - 1 ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2007-06-12 14:35:45 +04:00
}
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
/*
* | - range to drop - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start > key . offset & & end < extent_end ) {
BUG_ON ( del_nr > 0 ) ;
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2009-11-12 12:34:08 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
new_key . offset = start ;
ret = btrfs_duplicate_item ( trans , root , path ,
& new_key ) ;
if ( ret = = - EAGAIN ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
if ( ret < 0 )
break ;
leaf = path - > nodes [ 0 ] ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
struct btrfs_file_extent_item ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
extent_offset + = start - key . offset ;
btrfs_set_file_extent_offset ( leaf , fi , extent_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - start ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
if ( update_refs & & disk_bytenr > 0 ) {
2008-11-07 06:02:51 +03:00
ret = btrfs_inc_extent_ref ( trans , root ,
2009-11-12 12:34:08 +03:00
disk_bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
new_key . objectid ,
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 09:52:54 +03:00
start - extent_offset ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-11-07 06:02:51 +03:00
}
2009-11-12 12:34:08 +03:00
key . offset = start ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
/*
* | - - - - range to drop - - - - - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start < = key . offset & & end < extent_end ) {
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
new_key . offset = end ;
2014-11-12 07:43:09 +03:00
btrfs_set_item_key_safe ( root - > fs_info , path , & new_key ) ;
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
extent_offset + = end - key . offset ;
btrfs_set_file_extent_offset ( leaf , fi , extent_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - end ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2012-08-29 20:24:27 +04:00
if ( update_refs & & disk_bytenr > 0 )
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode , end - key . offset ) ;
break ;
2007-06-12 14:35:45 +04:00
}
2008-11-07 06:02:51 +03:00
2009-11-12 12:34:08 +03:00
search_start = extent_end ;
/*
* | - - - - range to drop - - - - - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start > key . offset & & end > = extent_end ) {
BUG_ON ( del_nr > 0 ) ;
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2007-06-18 17:57:58 +04:00
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2012-08-29 20:24:27 +04:00
if ( update_refs & & disk_bytenr > 0 )
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode , extent_end - start ) ;
if ( end = = extent_end )
break ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
2009-11-12 12:34:08 +03:00
path - > slots [ 0 ] + + ;
goto next_slot ;
2008-09-23 21:14:14 +04:00
}
2009-11-12 12:34:08 +03:00
/*
* | - - - - range to drop - - - - - |
* | - - - - - - extent - - - - - - |
*/
if ( start < = key . offset & & end > = extent_end ) {
Btrfs: fix leaf corruption caused by ENOSPC while hole punching
While running a stress test with multiple threads writing to the same btrfs
file system, I ended up with a situation where a leaf was corrupted in that
it had 2 file extent item keys that had the same exact key. I was able to
detect this quickly thanks to the following patch which triggers an assertion
as soon as a leaf is marked dirty if there are duplicated keys or out of order
keys:
Btrfs: check if items are ordered when a leaf is marked dirty
(https://patchwork.kernel.org/patch/3955431/)
Basically while running the test, I got the following in dmesg:
[28877.415877] WARNING: CPU: 2 PID: 10706 at fs/btrfs/file.c:553 btrfs_drop_extent_cache+0x435/0x440 [btrfs]()
(...)
[28877.415917] Call Trace:
[28877.415922] [<ffffffff816f1189>] dump_stack+0x4e/0x68
[28877.415926] [<ffffffff8104a32c>] warn_slowpath_common+0x8c/0xc0
[28877.415929] [<ffffffff8104a37a>] warn_slowpath_null+0x1a/0x20
[28877.415944] [<ffffffffa03775a5>] btrfs_drop_extent_cache+0x435/0x440 [btrfs]
[28877.415949] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[28877.415962] [<ffffffffa03777d9>] fill_holes+0x229/0x3e0 [btrfs]
[28877.415972] [<ffffffffa0345865>] ? block_rsv_add_bytes+0x55/0x80 [btrfs]
[28877.415984] [<ffffffffa03792cb>] btrfs_fallocate+0xb6b/0xc20 [btrfs]
(...)
[29854.132560] BTRFS critical (device sdc): corrupt leaf, bad key order: block=955232256,root=1, slot=24
[29854.132565] BTRFS info (device sdc): leaf 955232256 total ptrs 40 free space 778
(...)
[29854.132637] item 23 key (3486 108 667648) itemoff 2694 itemsize 53
[29854.132638] extent data disk bytenr 14574411776 nr 286720
[29854.132639] extent data offset 0 nr 286720 ram 286720
[29854.132640] item 24 key (3486 108 954368) itemoff 2641 itemsize 53
[29854.132641] extent data disk bytenr 0 nr 0
[29854.132643] extent data offset 0 nr 0 ram 0
[29854.132644] item 25 key (3486 108 954368) itemoff 2588 itemsize 53
[29854.132645] extent data disk bytenr 8699670528 nr 77824
[29854.132646] extent data offset 0 nr 77824 ram 77824
[29854.132647] item 26 key (3486 108 1146880) itemoff 2535 itemsize 53
[29854.132648] extent data disk bytenr 8699670528 nr 77824
[29854.132649] extent data offset 0 nr 77824 ram 77824
(...)
[29854.132707] kernel BUG at fs/btrfs/ctree.h:3901!
(...)
[29854.132771] Call Trace:
[29854.132779] [<ffffffffa0342b5c>] setup_items_for_insert+0x2dc/0x400 [btrfs]
[29854.132791] [<ffffffffa0378537>] __btrfs_drop_extents+0xba7/0xdd0 [btrfs]
[29854.132794] [<ffffffff8109c0d6>] ? trace_hardirqs_on_caller+0x16/0x1d0
[29854.132797] [<ffffffff8109c29d>] ? trace_hardirqs_on+0xd/0x10
[29854.132800] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[29854.132810] [<ffffffffa036783b>] insert_reserved_file_extent.constprop.66+0xab/0x310 [btrfs]
[29854.132820] [<ffffffffa036a6c6>] __btrfs_prealloc_file_range+0x116/0x340 [btrfs]
[29854.132830] [<ffffffffa0374d53>] btrfs_prealloc_file_range+0x23/0x30 [btrfs]
(...)
So this is caused by getting an -ENOSPC error while punching a file hole, more
specifically, we get -ENOSPC error from __btrfs_drop_extents in the while loop
of file.c:btrfs_punch_hole() when it's unable to modify the btree to delete one
or more file extent items due to lack of enough free space. When this happens,
in btrfs_punch_hole(), we attempt to reclaim free space by switching our transaction
block reservation object to root->fs_info->trans_block_rsv, end our transaction and
start a new transaction basically - and, we keep increasing our current offset
(cur_offset) as long as it's smaller than the end of the target range (lockend) -
this makes us leave the loop with cur_offset == drop_end which in turn makes us
call fill_holes() for inserting a file extent item that represents a 0 bytes range
hole (and this insertion succeeds, as in the meanwhile more space became available).
This 0 bytes file hole extent item is a problem because any subsequent caller of
__btrfs_drop_extents (regular file writes, or fallocate calls for e.g.), with a
start file offset that is equal to the offset of the hole, will not remove this
extent item due to the following conditional in the while loop of
__btrfs_drop_extents:
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
}
This later makes the call to setup_items_for_insert() (at the very end of
__btrfs_drop_extents), insert a new file extent item with the same offset as
the 0 bytes file hole extent item that follows it. Needless to say, this
causes chaos, either when reading the leaf from disk (btree_readpage_end_io_hook),
where we perform leaf sanity checks or in subsequent operations that manipulate
file extent items, as in the fallocate call as shown by the dmesg trace above.
Without my other patch to perform the leaf sanity checks once a leaf is marked
as dirty (if the integrity checker is enabled), it would have been much harder
to debug this issue.
This change might fix a few similar issues reported by users in the mailing
list regarding assertion failures in btrfs_set_item_key_safe calls performed
by __btrfs_drop_extents, such as the following report:
http://comments.gmane.org/gmane.comp.file-systems.btrfs/32938
Asking fill_holes() to create a 0 bytes wide file hole item also produced the
first warning in the trace above, as we passed a range to btrfs_drop_extent_cache
that has an end smaller (by -1) than its start.
On 3.14 kernels this issue manifests itself through leaf corruption, as we get
duplicated file extent item keys in a leaf when calling setup_items_for_insert(),
but on older kernels, setup_items_for_insert() isn't called by __btrfs_drop_extents(),
instead we have callers of __btrfs_drop_extents(), namely the functions
inode.c:insert_inline_extent() and inode.c:insert_reserved_file_extent(), calling
btrfs_insert_empty_item() to insert the new file extent item, which would fail with
error -EEXIST, instead of inserting a duplicated key - which is still a serious
issue as it would make all similar file extent item replace operations keep
failing if they target the same file range.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-04-29 16:18:40 +04:00
delete_extent_item :
2009-11-12 12:34:08 +03:00
if ( del_nr = = 0 ) {
del_slot = path - > slots [ 0 ] ;
del_nr = 1 ;
} else {
BUG_ON ( del_slot + del_nr ! = path - > slots [ 0 ] ) ;
del_nr + + ;
}
2008-09-23 21:14:14 +04:00
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
if ( update_refs & &
extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2008-10-09 19:46:29 +04:00
inode_sub_bytes ( inode ,
2009-11-12 12:34:08 +03:00
extent_end - key . offset ) ;
extent_end = ALIGN ( extent_end ,
root - > sectorsize ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
} else if ( update_refs & & disk_bytenr > 0 ) {
2008-09-23 21:14:14 +04:00
ret = btrfs_free_extent ( trans , root ,
2009-11-12 12:34:08 +03:00
disk_bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
key . objectid , key . offset -
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes a deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 09:52:54 +03:00
extent_offset ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode ,
extent_end - key . offset ) ;
2008-09-23 21:14:14 +04:00
}
2009-11-12 12:34:08 +03:00
if ( end = = extent_end )
break ;
if ( path - > slots [ 0 ] + 1 < btrfs_header_nritems ( leaf ) ) {
path - > slots [ 0 ] + + ;
goto next_slot ;
}
ret = btrfs_del_items ( trans , root , path , del_slot ,
del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret ) {
btrfs_abort_transaction ( trans , root , ret ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
break ;
2012-03-12 19:03:00 +04:00
}
2009-11-12 12:34:08 +03:00
del_nr = 0 ;
del_slot = 0 ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
BUG_ON ( 1 ) ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
2012-03-12 19:03:00 +04:00
if ( ! ret & & del_nr > 0 ) {
2014-01-07 15:42:27 +04:00
/*
* Set path - > slots [ 0 ] to first slot , so that after the delete
* if items are move off from our leaf to its immediate left or
* right neighbor leafs , we end up with a correct and adjusted
2014-02-10 03:45:12 +04:00
* path - > slots [ 0 ] for our insertion ( if replace_extent ! = 0 ) .
2014-01-07 15:42:27 +04:00
*/
path - > slots [ 0 ] = del_slot ;
2009-11-12 12:34:08 +03:00
ret = btrfs_del_items ( trans , root , path , del_slot , del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret )
btrfs_abort_transaction ( trans , root , ret ) ;
2014-02-10 03:45:12 +04:00
}
2014-01-07 15:42:27 +04:00
2014-02-10 03:45:12 +04:00
leaf = path - > nodes [ 0 ] ;
/*
* If btrfs_del_items ( ) was called , it might have deleted a leaf , in
* which case it unlocked our path , so check path - > locks [ 0 ] matches a
* write lock .
*/
if ( ! ret & & replace_extent & & leafs_visited = = 1 & &
( path - > locks [ 0 ] = = BTRFS_WRITE_LOCK_BLOCKING | |
path - > locks [ 0 ] = = BTRFS_WRITE_LOCK ) & &
btrfs_leaf_free_space ( root , leaf ) > =
sizeof ( struct btrfs_item ) + extent_item_size ) {
key . objectid = ino ;
key . type = BTRFS_EXTENT_DATA_KEY ;
key . offset = start ;
if ( ! del_nr & & path - > slots [ 0 ] < btrfs_header_nritems ( leaf ) ) {
struct btrfs_key slot_key ;
btrfs_item_key_to_cpu ( leaf , & slot_key , path - > slots [ 0 ] ) ;
if ( btrfs_comp_cpu_keys ( & key , & slot_key ) > 0 )
path - > slots [ 0 ] + + ;
2014-01-07 15:42:27 +04:00
}
2014-02-10 03:45:12 +04:00
setup_items_for_insert ( root , path , & key ,
& extent_item_size ,
extent_item_size ,
sizeof ( struct btrfs_item ) +
extent_item_size , 1 ) ;
* key_inserted = 1 ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
2014-01-07 15:42:27 +04:00
if ( ! replace_extent | | ! ( * key_inserted ) )
btrfs_release_path ( path ) ;
2012-08-29 22:27:18 +04:00
if ( drop_end )
2012-09-14 22:51:22 +04:00
* drop_end = found ? min ( end , extent_end ) : end ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
return ret ;
}
/*
 * Convenience wrapper around __btrfs_drop_extents ( ) for callers that do
 * not have a btrfs_path of their own .
 *
 * A temporary path is allocated , passed to __btrfs_drop_extents ( ) together
 * with trans , root , inode , start , end and drop_cache , and freed again before
 * returning . The remaining arguments are passed as NULL , 0 , 0 and NULL ;
 * presumably these mean " do not report drop_end " and " do not replace the
 * extent / do not report key_inserted " - TODO confirm against the
 * __btrfs_drop_extents ( ) prototype .
 *
 * Returns - ENOMEM if the path cannot be allocated , otherwise whatever
 * __btrfs_drop_extents ( ) returned .
 */
int btrfs_drop_extents ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root , struct inode * inode , u64 start ,
2012-08-29 20:24:27 +04:00
u64 end , int drop_cache )
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
{
struct btrfs_path * path ;
int ret ;
path = btrfs_alloc_path ( ) ;
if ( ! path )
return - ENOMEM ;
2012-08-29 22:27:18 +04:00
ret = __btrfs_drop_extents ( trans , root , inode , path , start , end , NULL ,
2014-01-07 15:42:27 +04:00
drop_cache , 0 , 0 , NULL ) ;
2009-11-12 12:34:08 +03:00
btrfs_free_path ( path ) ;
2007-06-12 14:35:45 +04:00
return ret ;
}
2008-10-30 21:25:28 +03:00
/*
 * Check whether the item at slot in leaf is a file extent that can be
 * merged with a neighbouring range .
 *
 * The item qualifies only if all of the following hold :
 * - slot is a valid item index in leaf ;
 * - its key belongs to objectid and has type BTRFS_EXTENT_DATA_KEY ;
 * - it is a BTRFS_FILE_EXTENT_REG extent whose disk bytenr equals bytenr
 * and whose extent offset equals key . offset - orig_offset ;
 * - it has no compression , encryption or other encoding set ;
 * - a non - zero * start equals the item key offset , and a non - zero * end
 * equals the item end ( key offset + num_bytes ) . A zero value in
 * * start or * end leaves that side unconstrained .
 *
 * Returns 1 and stores the item range in * start and * end on a match ,
 * otherwise returns 0 and leaves * start and * end untouched .
 */
static int extent_mergeable ( struct extent_buffer * leaf , int slot ,
2010-01-15 11:43:09 +03:00
u64 objectid , u64 bytenr , u64 orig_offset ,
u64 * start , u64 * end )
2008-10-30 21:25:28 +03:00
{
struct btrfs_file_extent_item * fi ;
struct btrfs_key key ;
u64 extent_end ;
/* callers pass slots [ 0 ] - 1 or slots [ 0 ] + 1 , which may fall outside the leaf */
if ( slot < 0 | | slot > = btrfs_header_nritems ( leaf ) )
return 0 ;
btrfs_item_key_to_cpu ( leaf , & key , slot ) ;
if ( key . objectid ! = objectid | | key . type ! = BTRFS_EXTENT_DATA_KEY )
return 0 ;
fi = btrfs_item_ptr ( leaf , slot , struct btrfs_file_extent_item ) ;
/* must be a plain regular extent backed by the same disk extent at the same origin */
if ( btrfs_file_extent_type ( leaf , fi ) ! = BTRFS_FILE_EXTENT_REG | |
btrfs_file_extent_disk_bytenr ( leaf , fi ) ! = bytenr | |
2010-01-15 11:43:09 +03:00
btrfs_file_extent_offset ( leaf , fi ) ! = key . offset - orig_offset | |
2008-10-30 21:25:28 +03:00
btrfs_file_extent_compression ( leaf , fi ) | |
btrfs_file_extent_encryption ( leaf , fi ) | |
btrfs_file_extent_other_encoding ( leaf , fi ) )
return 0 ;
extent_end = key . offset + btrfs_file_extent_num_bytes ( leaf , fi ) ;
/* zero in * start or * end means the caller does not constrain that boundary */
if ( ( * start & & * start ! = key . offset ) | | ( * end & & * end ! = extent_end ) )
return 0 ;
/* report the range actually covered by the matching item */
* start = key . offset ;
* end = extent_end ;
return 1 ;
}
/*
* Mark extent in the range start - end as written .
*
* This changes extent type from ' pre - allocated ' to ' regular ' . If only
* part of extent is marked as written , the extent will be split into
* two or three .
*/
int btrfs_mark_extent_written ( struct btrfs_trans_handle * trans ,
struct inode * inode , u64 start , u64 end )
{
2009-11-12 12:34:08 +03:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-10-30 21:25:28 +03:00
struct extent_buffer * leaf ;
struct btrfs_path * path ;
struct btrfs_file_extent_item * fi ;
struct btrfs_key key ;
2009-11-12 12:34:08 +03:00
struct btrfs_key new_key ;
2008-10-30 21:25:28 +03:00
u64 bytenr ;
u64 num_bytes ;
u64 extent_end ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
u64 orig_offset ;
2008-10-30 21:25:28 +03:00
u64 other_start ;
u64 other_end ;
2009-11-12 12:34:08 +03:00
u64 split ;
int del_nr = 0 ;
int del_slot = 0 ;
2010-01-15 11:43:09 +03:00
int recow ;
2008-10-30 21:25:28 +03:00
int ret ;
2011-04-20 06:31:50 +04:00
u64 ino = btrfs_ino ( inode ) ;
2008-10-30 21:25:28 +03:00
path = btrfs_alloc_path ( ) ;
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 21:38:47 +04:00
if ( ! path )
return - ENOMEM ;
2008-10-30 21:25:28 +03:00
again :
2010-01-15 11:43:09 +03:00
recow = 0 ;
2009-11-12 12:34:08 +03:00
split = start ;
2011-04-20 06:31:50 +04:00
key . objectid = ino ;
2008-10-30 21:25:28 +03:00
key . type = BTRFS_EXTENT_DATA_KEY ;
2009-11-12 12:34:08 +03:00
key . offset = split ;
2008-10-30 21:25:28 +03:00
ret = btrfs_search_slot ( trans , root , & key , path , - 1 , 1 ) ;
2011-03-16 20:59:32 +03:00
if ( ret < 0 )
goto out ;
2008-10-30 21:25:28 +03:00
if ( ret > 0 & & path - > slots [ 0 ] > 0 )
path - > slots [ 0 ] - - ;
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
2011-04-20 06:31:50 +04:00
BUG_ON ( key . objectid ! = ino | | key . type ! = BTRFS_EXTENT_DATA_KEY ) ;
2008-10-30 21:25:28 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2009-11-12 12:34:08 +03:00
BUG_ON ( btrfs_file_extent_type ( leaf , fi ) ! =
BTRFS_FILE_EXTENT_PREALLOC ) ;
2008-10-30 21:25:28 +03:00
extent_end = key . offset + btrfs_file_extent_num_bytes ( leaf , fi ) ;
BUG_ON ( key . offset > start | | extent_end < end ) ;
bytenr = btrfs_file_extent_disk_bytenr ( leaf , fi ) ;
num_bytes = btrfs_file_extent_disk_num_bytes ( leaf , fi ) ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
orig_offset = key . offset - btrfs_file_extent_offset ( leaf , fi ) ;
2010-01-15 11:43:09 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
if ( start = = key . offset & & end < extent_end ) {
other_start = 0 ;
other_end = start ;
if ( extent_mergeable ( leaf , path - > slots [ 0 ] - 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
new_key . offset = end ;
2014-11-12 07:43:09 +03:00
btrfs_set_item_key_safe ( root - > fs_info , path , & new_key ) ;
2010-01-15 11:43:09 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - end ) ;
btrfs_set_file_extent_offset ( leaf , fi ,
end - orig_offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
end - other_start ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
goto out ;
}
}
if ( start > key . offset & & end = = extent_end ) {
other_start = end ;
other_end = 0 ;
if ( extent_mergeable ( leaf , path - > slots [ 0 ] + 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
path - > slots [ 0 ] + + ;
new_key . offset = start ;
2014-11-12 07:43:09 +03:00
btrfs_set_item_key_safe ( root - > fs_info , path , & new_key ) ;
2010-01-15 11:43:09 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
other_end - start ) ;
btrfs_set_file_extent_offset ( leaf , fi ,
start - orig_offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
goto out ;
}
}
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
while ( start > key . offset | | end < extent_end ) {
if ( key . offset = = start )
split = end ;
new_key . offset = split ;
ret = btrfs_duplicate_item ( trans , root , path , & new_key ) ;
if ( ret = = - EAGAIN ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
goto again ;
2008-10-30 21:25:28 +03:00
}
2012-03-12 19:03:00 +04:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , root , ret ) ;
goto out ;
}
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
leaf = path - > nodes [ 0 ] ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
2008-10-30 21:25:28 +03:00
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2008-10-30 21:25:28 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
2009-11-12 12:34:08 +03:00
split - key . offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_offset ( leaf , fi , split - orig_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - split ) ;
2008-10-30 21:25:28 +03:00
btrfs_mark_buffer_dirty ( leaf ) ;
2009-11-12 12:34:08 +03:00
ret = btrfs_inc_extent_ref ( trans , root , bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn causes the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 09:52:54 +03:00
ino , orig_offset ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
if ( split = = start ) {
key . offset = start ;
} else {
BUG_ON ( start ! = key . offset ) ;
2008-10-30 21:25:28 +03:00
path - > slots [ 0 ] - - ;
2009-11-12 12:34:08 +03:00
extent_end = end ;
2008-10-30 21:25:28 +03:00
}
2010-01-15 11:43:09 +03:00
recow = 1 ;
2008-10-30 21:25:28 +03:00
}
2009-11-12 12:34:08 +03:00
other_start = end ;
other_end = 0 ;
2010-01-15 11:43:09 +03:00
if ( extent_mergeable ( leaf , path - > slots [ 0 ] + 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
if ( recow ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2010-01-15 11:43:09 +03:00
goto again ;
}
2009-11-12 12:34:08 +03:00
extent_end = other_end ;
del_slot = path - > slots [ 0 ] + 1 ;
del_nr + + ;
ret = btrfs_free_extent ( trans , root , bytenr , num_bytes ,
0 , root - > root_key . objectid ,
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 09:52:54 +03:00
ino , orig_offset ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-10-30 21:25:28 +03:00
}
2009-11-12 12:34:08 +03:00
other_start = 0 ;
other_end = start ;
2010-01-15 11:43:09 +03:00
if ( extent_mergeable ( leaf , path - > slots [ 0 ] - 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
if ( recow ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2010-01-15 11:43:09 +03:00
goto again ;
}
2009-11-12 12:34:08 +03:00
key . offset = other_start ;
del_slot = path - > slots [ 0 ] ;
del_nr + + ;
ret = btrfs_free_extent ( trans , root , bytenr , num_bytes ,
0 , root - > root_key . objectid ,
Btrfs: fix regression running delayed references when using qgroups
In the kernel 4.2 merge window we had a big changes to the implementation
of delayed references and qgroups which made the no_quota field of delayed
references not used anymore. More specifically the no_quota field is not
used anymore as of:
commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.")
Leaving the no_quota field actually prevents delayed references from
getting merged, which in turn cause the following BUG_ON(), at
fs/btrfs/extent-tree.c, to be hit when qgroups are enabled:
static int run_delayed_tree_ref(...)
{
(...)
BUG_ON(node->ref_mod != 1);
(...)
}
This happens on a scenario like the following:
1) Ref1 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
2) Ref2 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with Ref1 because Ref1->no_quota != Ref2->no_quota.
3) Ref3 bytenr X, action = BTRFS_ADD_DELAYED_REF, no_quota = 1, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref2 is incompatible
due to Ref2->no_quota != Ref3->no_quota.
4) Ref4 bytenr X, action = BTRFS_DROP_DELAYED_REF, no_quota = 0, added.
It's not merged with the reference at the tail of the list of refs
for bytenr X because the reference at the tail, Ref3 is incompatible
due to Ref3->no_quota != Ref4->no_quota.
5) We run delayed references, trigger merging of delayed references,
through __btrfs_run_delayed_refs() -> btrfs_merge_delayed_refs().
6) Ref1 and Ref3 are merged as Ref1->no_quota = Ref3->no_quota and
all other conditions are satisfied too. So Ref1 gets a ref_mod
value of 2.
7) Ref2 and Ref4 are merged as Ref2->no_quota = Ref4->no_quota and
all other conditions are satisfied too. So Ref2 gets a ref_mod
value of 2.
8) Ref1 and Ref2 aren't merged, because they have different values
for their no_quota field.
9) Delayed reference Ref1 is picked for running (select_delayed_ref()
always prefers references with an action == BTRFS_ADD_DELAYED_REF).
So run_delayed_tree_ref() is called for Ref1 which triggers the
BUG_ON because Ref1->ref_mod != 1 (equals 2).
So fix this by removing the no_quota field, as it's not used anymore as
of commit 0ed4792af0e8 ("btrfs: qgroup: Switch to new extent-oriented
qgroup mechanism.").
The use of no_quota was also buggy in at least two places:
1) At delayed-refs.c:btrfs_add_delayed_tree_ref() - we were setting
no_quota to 0 instead of 1 when the following condition was true:
is_fstree(ref_root) || !fs_info->quota_enabled
2) At extent-tree.c:__btrfs_inc_extent_ref() - we were attempting to
reset a node's no_quota when the condition "!is_fstree(root_objectid)
|| !root->fs_info->quota_enabled" was true but we did it only in
an unused local stack variable, that is, we never reset the no_quota
value in the node itself.
This fixes the remainder of problems several people have been having when
running delayed references, mostly while a balance is running in parallel,
on a 4.2+ kernel.
Very special thanks to Stéphane Lesimple for helping debugging this issue
and testing this fix on his multi terabyte filesystem (which took more
than one day to balance alone, plus fsck, etc).
Also, this fixes deadlock issue when using the clone ioctl with qgroups
enabled, as reported by Elias Probst in the mailing list. The deadlock
happens because after calling btrfs_insert_empty_item we have our path
holding a write lock on a leaf of the fs/subvol tree and then before
releasing the path we called check_ref() which did backref walking, when
qgroups are enabled, and tried to read lock the same leaf. The trace for
this case is the following:
INFO: task systemd-nspawn:6095 blocked for more than 120 seconds.
(...)
Call Trace:
[<ffffffff86999201>] schedule+0x74/0x83
[<ffffffff863ef64c>] btrfs_tree_read_lock+0xc0/0xea
[<ffffffff86137ed7>] ? wait_woken+0x74/0x74
[<ffffffff8639f0a7>] btrfs_search_old_slot+0x51a/0x810
[<ffffffff863a129b>] btrfs_next_old_leaf+0xdf/0x3ce
[<ffffffff86413a00>] ? ulist_add_merge+0x1b/0x127
[<ffffffff86411688>] __resolve_indirect_refs+0x62a/0x667
[<ffffffff863ef546>] ? btrfs_clear_lock_blocking_rw+0x78/0xbe
[<ffffffff864122d3>] find_parent_nodes+0xaf3/0xfc6
[<ffffffff86412838>] __btrfs_find_all_roots+0x92/0xf0
[<ffffffff864128f2>] btrfs_find_all_roots+0x45/0x65
[<ffffffff8639a75b>] ? btrfs_get_tree_mod_seq+0x2b/0x88
[<ffffffff863e852e>] check_ref+0x64/0xc4
[<ffffffff863e9e01>] btrfs_clone+0x66e/0xb5d
[<ffffffff863ea77f>] btrfs_ioctl_clone+0x48f/0x5bb
[<ffffffff86048a68>] ? native_sched_clock+0x28/0x77
[<ffffffff863ed9b0>] btrfs_ioctl+0xabc/0x25cb
(...)
The problem goes away by eliminating check_ref(), which no longer is
needed as its purpose was to get a value for the no_quota field of
a delayed reference (this patch removes the no_quota field as mentioned
earlier).
Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
Reported-by: Elias Probst <mail@eliasprobst.eu>
Reported-by: Peter Becker <floyd.net@gmail.com>
Reported-by: Malte Schröder <malte@tnxip.de>
Reported-by: Derek Dongray <derek@valedon.co.uk>
Reported-by: Erkki Seppala <flux-btrfs@inside.org>
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
2015-10-23 09:52:54 +03:00
ino , orig_offset ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2009-11-12 12:34:08 +03:00
}
if ( del_nr = = 0 ) {
2010-02-11 10:43:00 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_type ( leaf , fi ,
BTRFS_FILE_EXTENT_REG ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2009-11-12 12:34:08 +03:00
btrfs_mark_buffer_dirty ( leaf ) ;
2010-01-15 11:43:09 +03:00
} else {
2010-02-11 10:43:00 +03:00
fi = btrfs_item_ptr ( leaf , del_slot - 1 ,
struct btrfs_file_extent_item ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_type ( leaf , fi ,
BTRFS_FILE_EXTENT_REG ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - key . offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2009-11-12 12:34:08 +03:00
2010-01-15 11:43:09 +03:00
ret = btrfs_del_items ( trans , root , path , del_slot , del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , root , ret ) ;
goto out ;
}
2010-01-15 11:43:09 +03:00
}
2009-11-12 12:34:08 +03:00
out :
2008-10-30 21:25:28 +03:00
btrfs_free_path ( path ) ;
return 0 ;
}
2011-02-28 17:52:08 +03:00
/*
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
2011-09-30 23:23:54 +04:00
static int prepare_uptodate_page ( struct page * page , u64 pos ,
bool force_uptodate )
2011-02-28 17:52:08 +03:00
{
int ret = 0 ;
2011-09-30 23:23:54 +04:00
if ( ( ( pos & ( PAGE_CACHE_SIZE - 1 ) ) | | force_uptodate ) & &
! PageUptodate ( page ) ) {
2011-02-28 17:52:08 +03:00
ret = btrfs_readpage ( NULL , page ) ;
if ( ret )
return ret ;
lock_page ( page ) ;
if ( ! PageUptodate ( page ) ) {
unlock_page ( page ) ;
return - EIO ;
}
}
return 0 ;
}
2007-06-12 14:35:45 +04:00
/*
2013-12-10 15:25:04 +04:00
* this just gets pages into the page cache and locks them down .
2007-06-12 14:35:45 +04:00
*/
2013-12-10 15:25:03 +04:00
static noinline int prepare_pages ( struct inode * inode , struct page * * pages ,
size_t num_pages , loff_t pos ,
size_t write_bytes , bool force_uptodate )
2007-06-12 14:35:45 +04:00
{
int i ;
unsigned long index = pos > > PAGE_CACHE_SHIFT ;
2011-09-21 23:05:58 +04:00
gfp_t mask = btrfs_alloc_write_mask ( inode - > i_mapping ) ;
2013-12-13 23:39:34 +04:00
int err = 0 ;
2013-12-10 15:25:04 +04:00
int faili ;
2007-06-18 17:57:58 +04:00
2007-06-12 14:35:45 +04:00
for ( i = 0 ; i < num_pages ; i + + ) {
2011-07-11 18:47:06 +04:00
pages [ i ] = find_or_create_page ( inode - > i_mapping , index + i ,
2012-01-11 03:07:55 +04:00
mask | __GFP_WRITE ) ;
2007-06-12 14:35:45 +04:00
if ( ! pages [ i ] ) {
2011-02-28 17:52:08 +03:00
faili = i - 1 ;
err = - ENOMEM ;
goto fail ;
}
if ( i = = 0 )
2011-09-30 23:23:54 +04:00
err = prepare_uptodate_page ( pages [ i ] , pos ,
force_uptodate ) ;
2011-02-28 17:52:08 +03:00
if ( i = = num_pages - 1 )
err = prepare_uptodate_page ( pages [ i ] ,
2011-09-30 23:23:54 +04:00
pos + write_bytes , false ) ;
2011-02-28 17:52:08 +03:00
if ( err ) {
page_cache_release ( pages [ i ] ) ;
faili = i - 1 ;
goto fail ;
2007-06-12 14:35:45 +04:00
}
2007-06-28 23:57:36 +04:00
wait_on_page_writeback ( pages [ i ] ) ;
2007-06-12 14:35:45 +04:00
}
2013-12-10 15:25:04 +04:00
return 0 ;
fail :
while ( faili > = 0 ) {
unlock_page ( pages [ faili ] ) ;
page_cache_release ( pages [ faili ] ) ;
faili - - ;
}
return err ;
}
/*
 * Lock the extent range covering the prepared pages, if needed, and wait
 * for any overlapping data=ordered extent to finish before the pages may be
 * modified.
 *
 * The range is only locked when it starts below the current i_size; a write
 * that starts at or beyond i_size skips the locking entirely.
 *
 * Return value:
 *   1       - the extent is locked; *lockstart and *lockend hold the range
 *   0       - the extent is not locked, and everything is OK
 *   -EAGAIN - an overlapping ordered extent was found; all pages have been
 *             unlocked and released and the caller must prepare them again
 *   the other < 0 number - something went wrong
 */
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
				size_t num_pages, loff_t pos,
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
	u64 range_start;
	u64 range_end;
	int locked = 0;
	int i;

	/* Page-aligned, inclusive byte range spanned by the pages. */
	range_start = pos & ~((u64)PAGE_CACHE_SIZE - 1);
	range_end = range_start + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;

	if (range_start >= inode->i_size)
		goto setup_pages;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, range_start, range_end,
			 0, cached_state);

	ordered = btrfs_lookup_ordered_range(inode, range_start,
					     range_end - range_start + 1);
	if (ordered) {
		if (ordered->file_offset + ordered->len > range_start &&
		    ordered->file_offset <= range_end) {
			/*
			 * Back out completely: drop the extent lock and
			 * every page, then kick the ordered extent
			 * (presumably the final argument asks to wait for
			 * it -- confirm) so the caller can retry.
			 */
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     range_start, range_end,
					     cached_state, GFP_NOFS);
			i = 0;
			while (i < num_pages) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
				i++;
			}
			btrfs_start_ordered_extent(inode, ordered, 1);
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
		}
		btrfs_put_ordered_extent(ordered);
	}

	/* Drop stale dirty/delalloc state left on the range. */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, range_start, range_end,
			 EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, cached_state, GFP_NOFS);
	*lockstart = range_start;
	*lockend = range_end;
	locked = 1;

setup_pages:
	i = 0;
	while (i < num_pages) {
		struct page *page = pages[i];

		if (clear_page_dirty_for_io(page))
			account_page_redirty(page);
		set_page_extent_mapped(page);
		WARN_ON(!PageLocked(page));
		i++;
	}

	return locked;
}
2013-06-22 00:37:03 +04:00
/*
 * Check whether a write of *write_bytes bytes at @pos can be done without
 * COW.
 *
 * The sector-aligned range covering the write is locked, after waiting for
 * every ordered extent in that range, and can_nocow_extent() is asked about
 * it.
 *
 * Return value:
 *   > 0     - can_nocow_extent() returned a positive value; *write_bytes
 *             has been clamped to the length it reported.  The
 *             "no snapshoting" write state taken here is NOT ended on this
 *             path (presumably the caller ends it once the write is done
 *             -- confirm against the caller).
 *   0       - can_nocow_extent() returned zero or an error; the
 *             "no snapshoting" write state has been ended.
 *   -ENOSPC - btrfs_start_write_no_snapshoting() refused the write.
 */
static noinline int check_can_nocow(struct inode *inode, loff_t pos,
				    size_t *write_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

	if (!btrfs_start_write_no_snapshoting(root))
		return -ENOSPC;

	/*
	 * The write need not be sector aligned, so widen it to whole
	 * sectors on both ends before locking.
	 */
	lockstart = round_down(pos, root->sectorsize);
	lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
	num_bytes = lockend - lockstart + 1;

	/* Take the range lock, retrying until no ordered extent overlaps. */
	for (;;) {
		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     num_bytes);
		if (!ordered)
			break;

		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
	if (ret > 0) {
		/*
		 * num_bytes is measured from lockstart; convert it to a
		 * length measured from pos before clamping the write size.
		 */
		*write_bytes = min_t(size_t, *write_bytes,
				     num_bytes - pos + lockstart);
	} else {
		ret = 0;
		btrfs_end_write_no_snapshoting(root);
	}

	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);

	return ret;
}
2011-01-25 22:57:24 +03:00
/*
 * Copy the data described by @i into the page cache of @file, starting at
 * file offset @pos and working in batches of at most nrptrs pages.
 *
 * Each pass of the loop:
 *   1. faults in the user buffer before any page is locked;
 *   2. reserves space: metadata only when check_can_nocow() returns > 0,
 *      data plus metadata otherwise;
 *   3. prepares the pages and, if required, locks the extent range;
 *   4. copies the user data with btrfs_copy_from_user();
 *   5. gives back the part of the reservation a short copy did not use;
 *   6. hands the copied pages to btrfs_dirty_pages().
 *
 * @file: file being written to
 * @i:    iterator describing the source data; consumed as data is copied
 * @pos:  file offset at which the write starts
 *
 * Returns the number of bytes copied when at least one byte was copied.
 * Otherwise returns the last status of the loop (0 or a negative errno), or
 * -ENOMEM when the page pointer array cannot be allocated.
 */
static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	struct extent_state *cached_state = NULL;
	/* Reservation still owned by this function; released after the loop. */
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
	bool only_release_metadata = false;
	bool force_page_uptodate = false;
	bool need_unlock;

	/*
	 * Size the page pointer array: enough for the whole request, capped at
	 * one page worth of pointers and at the task's remaining dirty budget,
	 * but never fewer than 8 entries.
	 */
	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
						PAGE_CACHE_SIZE);
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Start every pass with the flag cleared.  It is only reset
		 * further down when copied > 0, so a pass that copied nothing
		 * on the nocow path would otherwise leave it set.  If this
		 * pass then takes the data reservation path, the stale value
		 * would make the release paths give back metadata only and
		 * call btrfs_end_write_no_snapshoting() even though this pass
		 * never went through the nocow path.
		 */
		only_release_metadata = false;

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;

		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					     BTRFS_INODE_PREALLOC)) {
			ret = check_can_nocow(inode, pos, &write_bytes);
			if (ret < 0)
				break;
			if (ret > 0) {
				/*
				 * For nodata cow case, no need to reserve
				 * data space.
				 */
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = DIV_ROUND_UP(write_bytes + offset,
							 PAGE_CACHE_SIZE);
				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
				goto reserve_metadata;
			}
		}

		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
		if (ret < 0)
			break;

reserve_metadata:
		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
		if (ret) {
			/* Undo whichever of the two paths above was taken. */
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode, pos,
							       write_bytes);
			else
				btrfs_end_write_no_snapshoting(root);
			break;
		}

		release_bytes = reserve_bytes;
		need_unlock = false;
again:
		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;

		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
						      pos, &lockstart, &lockend,
						      &cached_state);
		if (ret < 0) {
			/* -EAGAIN: redo the page preparation and try again. */
			if (ret == -EAGAIN)
				goto again;
			break;
		} else if (ret > 0) {
			/* An extent range was locked; unlock it after dirtying. */
			need_unlock = true;
			ret = 0;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = DIV_ROUND_UP(copied + offset,
						   PAGE_CACHE_SIZE);
		}

		/*
		 * If we had a short copy we need to release the excess delaloc
		 * bytes we reserved.  We need to increment outstanding_extents
		 * because btrfs_delalloc_release_space will decrement it, but
		 * we still have an outstanding extent for the chunk we actually
		 * managed to copy.
		 */
		if (num_pages > dirty_pages) {
			release_bytes = (num_pages - dirty_pages) <<
				PAGE_CACHE_SHIFT;
			if (copied > 0) {
				spin_lock(&BTRFS_I(inode)->lock);
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
			/*
			 * NOTE(review): the excess belongs to the tail of the
			 * reserved range, yet the release below starts at pos.
			 * Confirm whether the start offset should instead be
			 * past the pages that were actually dirtied.
			 */
			if (only_release_metadata)
				btrfs_delalloc_release_metadata(inode,
								release_bytes);
			else
				btrfs_delalloc_release_space(inode, pos,
							     release_bytes);
		}

		/* From here on only the dirtied part is still reserved. */
		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;

		if (copied > 0)
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
		if (need_unlock)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state,
					     GFP_NOFS);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		/* Nothing is left for the post-loop cleanup to release. */
		release_bytes = 0;
		if (only_release_metadata)
			btrfs_end_write_no_snapshoting(root);

		if (only_release_metadata && copied > 0) {
			/* Tag the range written on the nocow path. */
			lockstart = round_down(pos, root->sectorsize);
			lockend = lockstart +
				(dirty_pages << PAGE_CACHE_SHIFT) - 1;

			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				       lockend, EXTENT_NORESERVE, NULL,
				       NULL, GFP_NOFS);
			only_release_metadata = false;
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);
		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	/*
	 * The loop was left with a reservation still held: give it back using
	 * the same path (metadata only, or data plus metadata) it was taken on.
	 */
	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_end_write_no_snapshoting(root);
			btrfs_delalloc_release_metadata(inode, release_bytes);
		} else {
			btrfs_delalloc_release_space(inode, pos, release_bytes);
		}
	}

	return num_written ? num_written : ret;
}
static ssize_t __btrfs_direct_write ( struct kiocb * iocb ,
2014-03-04 07:09:39 +04:00
struct iov_iter * from ,
2014-03-22 14:51:37 +04:00
loff_t pos )
2011-01-25 22:57:24 +03:00
{
struct file * file = iocb - > ki_filp ;
2014-10-10 12:43:11 +04:00
struct inode * inode = file_inode ( file ) ;
2011-01-25 22:57:24 +03:00
ssize_t written ;
ssize_t written_buffered ;
loff_t endbyte ;
int err ;
2014-03-22 14:51:37 +04:00
written = generic_file_direct_write ( iocb , from , pos ) ;
2011-01-25 22:57:24 +03:00
2014-03-22 14:51:37 +04:00
if ( written < 0 | | ! iov_iter_count ( from ) )
2011-01-25 22:57:24 +03:00
return written ;
pos + = written ;
2014-03-04 07:09:39 +04:00
written_buffered = __btrfs_buffered_write ( file , from , pos ) ;
2011-01-25 22:57:24 +03:00
if ( written_buffered < 0 ) {
err = written_buffered ;
goto out ;
2007-06-12 14:35:45 +04:00
}
2014-10-10 00:18:55 +04:00
/*
* Ensure all data is persisted . We want the next direct IO read to be
* able to read what was just written .
*/
2011-01-25 22:57:24 +03:00
endbyte = pos + written_buffered - 1 ;
2014-10-10 12:43:11 +04:00
err = btrfs_fdatawrite_range ( inode , pos , endbyte ) ;
2014-10-10 00:18:55 +04:00
if ( err )
goto out ;
2014-10-10 12:43:11 +04:00
err = filemap_fdatawait_range ( inode - > i_mapping , pos , endbyte ) ;
2011-01-25 22:57:24 +03:00
if ( err )
goto out ;
written + = written_buffered ;
2014-02-12 04:31:06 +04:00
iocb - > ki_pos = pos + written_buffered ;
2011-01-25 22:57:24 +03:00
invalidate_mapping_pages ( file - > f_mapping , pos > > PAGE_CACHE_SHIFT ,
endbyte > > PAGE_CACHE_SHIFT ) ;
2007-06-12 14:35:45 +04:00
out :
2011-01-25 22:57:24 +03:00
return written ? written : err ;
}
2008-01-03 21:46:11 +03:00
2012-11-09 19:53:21 +04:00
static void update_time_for_write ( struct inode * inode )
{
struct timespec now ;
if ( IS_NOCMTIME ( inode ) )
return ;
now = current_fs_time ( inode - > i_sb ) ;
if ( ! timespec_equal ( & inode - > i_mtime , & now ) )
inode - > i_mtime = now ;
if ( ! timespec_equal ( & inode - > i_ctime , & now ) )
inode - > i_ctime = now ;
if ( IS_I_VERSION ( inode ) )
inode_inc_iversion ( inode ) ;
}
2014-04-03 22:29:04 +04:00
static ssize_t btrfs_file_write_iter ( struct kiocb * iocb ,
struct iov_iter * from )
2011-01-25 22:57:24 +03:00
{
struct file * file = iocb - > ki_filp ;
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2011-01-25 22:57:24 +03:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2011-09-11 18:52:24 +04:00
u64 start_pos ;
2014-03-27 06:51:58 +04:00
u64 end_pos ;
2011-01-25 22:57:24 +03:00
ssize_t num_written = 0 ;
2012-11-16 22:56:32 +04:00
bool sync = ( file - > f_flags & O_DSYNC ) | | IS_SYNC ( file - > f_mapping - > host ) ;
2015-04-09 19:55:47 +03:00
ssize_t err ;
loff_t pos ;
size_t count ;
2011-01-25 22:57:24 +03:00
mutex_lock ( & inode - > i_mutex ) ;
2015-04-09 19:55:47 +03:00
err = generic_write_checks ( iocb , from ) ;
if ( err < = 0 ) {
2011-01-25 22:57:24 +03:00
mutex_unlock ( & inode - > i_mutex ) ;
2015-04-09 19:55:47 +03:00
return err ;
2011-01-25 22:57:24 +03:00
}
2015-04-09 19:55:47 +03:00
current - > backing_dev_info = inode_to_bdi ( inode ) ;
2015-05-21 17:05:53 +03:00
err = file_remove_privs ( file ) ;
2011-01-25 22:57:24 +03:00
if ( err ) {
mutex_unlock ( & inode - > i_mutex ) ;
goto out ;
}
/*
* If BTRFS flips readonly due to some impossible error
* ( fs_info - > fs_state now has BTRFS_SUPER_FLAG_ERROR ) ,
* although we have opened a file as writable , we have
* to stop this write operation to ensure FS consistency .
*/
2013-01-29 14:14:48 +04:00
if ( test_bit ( BTRFS_FS_STATE_ERROR , & root - > fs_info - > fs_state ) ) {
2011-01-25 22:57:24 +03:00
mutex_unlock ( & inode - > i_mutex ) ;
err = - EROFS ;
goto out ;
}
2012-11-09 19:53:21 +04:00
/*
* We reserve space for updating the inode when we reserve space for the
* extent we are going to write , so we will enospc out there . We don ' t
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write .
*/
update_time_for_write ( inode ) ;
2011-01-25 22:57:24 +03:00
2015-04-09 19:55:47 +03:00
pos = iocb - > ki_pos ;
count = iov_iter_count ( from ) ;
2011-09-11 18:52:24 +04:00
start_pos = round_down ( pos , root - > sectorsize ) ;
if ( start_pos > i_size_read ( inode ) ) {
2014-03-27 06:51:58 +04:00
/* Expand hole size to cover write data, preventing empty gap */
2014-04-15 06:41:00 +04:00
end_pos = round_up ( pos + count , root - > sectorsize ) ;
2014-03-27 06:51:58 +04:00
err = btrfs_cont_expand ( inode , i_size_read ( inode ) , end_pos ) ;
2011-09-11 18:52:24 +04:00
if ( err ) {
mutex_unlock ( & inode - > i_mutex ) ;
goto out ;
}
}
2012-11-16 22:56:32 +04:00
if ( sync )
atomic_inc ( & BTRFS_I ( inode ) - > sync_writers ) ;
2015-04-09 20:52:01 +03:00
if ( iocb - > ki_flags & IOCB_DIRECT ) {
2014-04-03 22:29:04 +04:00
num_written = __btrfs_direct_write ( iocb , from , pos ) ;
2011-01-25 22:57:24 +03:00
} else {
2014-04-03 22:29:04 +04:00
num_written = __btrfs_buffered_write ( file , from , pos ) ;
2011-01-25 22:57:24 +03:00
if ( num_written > 0 )
2014-02-12 04:31:06 +04:00
iocb - > ki_pos = pos + num_written ;
2011-01-25 22:57:24 +03:00
}
mutex_unlock ( & inode - > i_mutex ) ;
2007-10-29 21:36:41 +03:00
2009-03-31 21:27:11 +04:00
/*
2012-11-09 19:53:21 +04:00
* We also have to set last_sub_trans to the current log transid ,
* otherwise subsequent syncs to a file that ' s been synced in this
* transaction will appear to have already occurred .
2009-03-31 21:27:11 +04:00
*/
Btrfs: fix metadata inconsistencies after directory fsync
We can get into inconsistency between inodes and directory entries
after fsyncing a directory. The issue is that while a directory gets
the new dentries persisted in the fsync log and replayed at mount time,
the link count of the inode that directory entries point to doesn't
get updated, staying with an incorrect link count (smaller than the
correct value). This later leads to stale file handle errors when
accessing (including attempt to delete) some of the links if all the
other ones are removed, which also implies impossibility to delete the
parent directories, since the dentries can not be removed.
Another issue is that (unlike ext3/4, xfs, f2fs, reiserfs, nilfs2),
when fsyncing a directory, new files aren't logged (their metadata and
dentries) nor any child directories. So this patch fixes this issue too,
since it has the same resolution as the incorrect inode link count issue
mentioned before.
This is very easy to reproduce, and the following excerpt from my test
case for xfstests shows how:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our main test file and directory.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" $SCRATCH_MNT/foo | _filter_xfs_io
mkdir $SCRATCH_MNT/mydir
# Make sure all metadata and data are durably persisted.
sync
# Add a hard link to 'foo' inside our test directory and fsync only the
# directory. The btrfs fsync implementation had a bug that caused the new
# directory entry to be visible after the fsync log replay but, the inode
# of our file remained with a link count of 1.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_2
# Add a few more links and new files.
# This is just to verify nothing breaks or gives incorrect results after the
# fsync log is replayed.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_3
$XFS_IO_PROG -f -c "pwrite -S 0xff 0 64K" $SCRATCH_MNT/hello | _filter_xfs_io
ln $SCRATCH_MNT/hello $SCRATCH_MNT/mydir/hello_2
# Add some subdirectories and new files and links to them. This is to verify
# that after fsyncing our top level directory 'mydir', all the subdirectories
# and their files/links are registered in the fsync log and exist after the
# fsync log is replayed.
mkdir -p $SCRATCH_MNT/mydir/x/y/z
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/foo_y_link
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/z/foo_z_link
touch $SCRATCH_MNT/mydir/x/y/z/qwerty
# Now fsync only our top directory.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/mydir
# And fsync now our new file named 'hello', just to verify later that it has
# the expected content and that the previous fsync on the directory 'mydir' had
# no bad influence on this fsync.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/hello
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Verify the content of our file 'foo' remains the same as before, 8192 bytes,
# all with the value 0xaa.
echo "File 'foo' content after log replay:"
od -t x1 $SCRATCH_MNT/foo
# Remove the first name of our inode. Because of the directory fsync bug, the
# inode's link count was 1 instead of 5, so removing the 'foo' name ended up
# deleting the inode and the other names became stale directory entries (still
# visible to applications). Attempting to remove or access the remaining
# dentries pointing to that inode resulted in stale file handle errors and
# made it impossible to remove the parent directories since it was impossible
# for them to become empty.
echo "file 'foo' link count after log replay: $(stat -c %h $SCRATCH_MNT/foo)"
rm -f $SCRATCH_MNT/foo
# Now verify that all files, links and directories created before fsyncing our
# directory exist after the fsync log was replayed.
[ -f $SCRATCH_MNT/mydir/foo_2 ] || echo "Link mydir/foo_2 is missing"
[ -f $SCRATCH_MNT/mydir/foo_3 ] || echo "Link mydir/foo_3 is missing"
[ -f $SCRATCH_MNT/hello ] || echo "File hello is missing"
[ -f $SCRATCH_MNT/mydir/hello_2 ] || echo "Link mydir/hello_2 is missing"
[ -f $SCRATCH_MNT/mydir/x/y/foo_y_link ] || \
echo "Link mydir/x/y/foo_y_link is missing"
[ -f $SCRATCH_MNT/mydir/x/y/z/foo_z_link ] || \
echo "Link mydir/x/y/z/foo_z_link is missing"
[ -f $SCRATCH_MNT/mydir/x/y/z/qwerty ] || \
echo "File mydir/x/y/z/qwerty is missing"
# We expect our file here to have a size of 64Kb and all the bytes having the
# value 0xff.
echo "file 'hello' content after log replay:"
od -t x1 $SCRATCH_MNT/hello
# Now remove all files/links, under our test directory 'mydir', and verify we
# can remove all the directories.
rm -f $SCRATCH_MNT/mydir/x/y/z/*
rmdir $SCRATCH_MNT/mydir/x/y/z
rm -f $SCRATCH_MNT/mydir/x/y/*
rmdir $SCRATCH_MNT/mydir/x/y
rmdir $SCRATCH_MNT/mydir/x
rm -f $SCRATCH_MNT/mydir/*
rmdir $SCRATCH_MNT/mydir
# An fsck, run by the fstests framework everytime a test finishes, also detected
# the inconsistency and printed the following error message:
#
# root 5 inode 257 errors 2001, no inode item, link count wrong
# unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
# unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref
status=0
exit
The expected golden output for the test is:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 65536/65536 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File 'foo' content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000
file 'foo' link count after log replay: 5
file 'hello' content after log replay:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0200000
Which is the output after this patch and when running the test against
ext3/4, xfs, f2fs, reiserfs or nilfs2. Without this patch, the test's
output is:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 65536/65536 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File 'foo' content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000
file 'foo' link count after log replay: 1
Link mydir/foo_2 is missing
Link mydir/foo_3 is missing
Link mydir/x/y/foo_y_link is missing
Link mydir/x/y/z/foo_z_link is missing
File mydir/x/y/z/qwerty is missing
file 'hello' content after log replay:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0200000
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y/z': No such file or directory
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y': No such file or directory
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x': No such file or directory
rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_2': Stale file handle
rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_3': Stale file handle
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir': Directory not empty
Fsck, without this fix, also complains about the wrong link count:
root 5 inode 257 errors 2001, no inode item, link count wrong
unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref
So fix this by logging the inodes that the dentries point to when
fsyncing a directory.
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-03-20 20:19:46 +03:00
spin_lock ( & BTRFS_I ( inode ) - > lock ) ;
2012-11-09 19:53:21 +04:00
BTRFS_I ( inode ) - > last_sub_trans = root - > log_transid ;
Btrfs: fix metadata inconsistencies after directory fsync
We can get into inconsistency between inodes and directory entries
after fsyncing a directory. The issue is that while a directory gets
the new dentries persisted in the fsync log and replayed at mount time,
the link count of the inode that directory entries point to doesn't
get updated, staying with an incorrect link count (smaller than the
correct value). This later leads to stale file handle errors when
accessing (including attempt to delete) some of the links if all the
other ones are removed, which also implies impossibility to delete the
parent directories, since the dentries can not be removed.
Another issue is that (unlike ext3/4, xfs, f2fs, reiserfs, nilfs2),
when fsyncing a directory, new files aren't logged (their metadata and
dentries) nor any child directories. So this patch fixes this issue too,
since it has the same resolution as the incorrect inode link count issue
mentioned before.
This is very easy to reproduce, and the following excerpt from my test
case for xfstests shows how:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our main test file and directory.
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" $SCRATCH_MNT/foo | _filter_xfs_io
mkdir $SCRATCH_MNT/mydir
# Make sure all metadata and data are durably persisted.
sync
# Add a hard link to 'foo' inside our test directory and fsync only the
# directory. The btrfs fsync implementation had a bug that caused the new
# directory entry to be visible after the fsync log replay but, the inode
# of our file remained with a link count of 1.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_2
# Add a few more links and new files.
# This is just to verify nothing breaks or gives incorrect results after the
# fsync log is replayed.
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/foo_3
$XFS_IO_PROG -f -c "pwrite -S 0xff 0 64K" $SCRATCH_MNT/hello | _filter_xfs_io
ln $SCRATCH_MNT/hello $SCRATCH_MNT/mydir/hello_2
# Add some subdirectories and new files and links to them. This is to verify
# that after fsyncing our top level directory 'mydir', all the subdirectories
# and their files/links are registered in the fsync log and exist after the
# fsync log is replayed.
mkdir -p $SCRATCH_MNT/mydir/x/y/z
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/foo_y_link
ln $SCRATCH_MNT/foo $SCRATCH_MNT/mydir/x/y/z/foo_z_link
touch $SCRATCH_MNT/mydir/x/y/z/qwerty
# Now fsync only our top directory.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/mydir
# And fsync now our new file named 'hello', just to verify later that it has
# the expected content and that the previous fsync on the directory 'mydir' had
# no bad influence on this fsync.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/hello
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Verify the content of our file 'foo' remains the same as before, 8192 bytes,
# all with the value 0xaa.
echo "File 'foo' content after log replay:"
od -t x1 $SCRATCH_MNT/foo
# Remove the first name of our inode. Because of the directory fsync bug, the
# inode's link count was 1 instead of 5, so removing the 'foo' name ended up
# deleting the inode and the other names became stale directory entries (still
# visible to applications). Attempting to remove or access the remaining
# dentries pointing to that inode resulted in stale file handle errors and
# made it impossible to remove the parent directories since it was impossible
# for them to become empty.
echo "file 'foo' link count after log replay: $(stat -c %h $SCRATCH_MNT/foo)"
rm -f $SCRATCH_MNT/foo
# Now verify that all files, links and directories created before fsyncing our
# directory exist after the fsync log was replayed.
[ -f $SCRATCH_MNT/mydir/foo_2 ] || echo "Link mydir/foo_2 is missing"
[ -f $SCRATCH_MNT/mydir/foo_3 ] || echo "Link mydir/foo_3 is missing"
[ -f $SCRATCH_MNT/hello ] || echo "File hello is missing"
[ -f $SCRATCH_MNT/mydir/hello_2 ] || echo "Link mydir/hello_2 is missing"
[ -f $SCRATCH_MNT/mydir/x/y/foo_y_link ] || \
echo "Link mydir/x/y/foo_y_link is missing"
[ -f $SCRATCH_MNT/mydir/x/y/z/foo_z_link ] || \
echo "Link mydir/x/y/z/foo_z_link is missing"
[ -f $SCRATCH_MNT/mydir/x/y/z/qwerty ] || \
echo "File mydir/x/y/z/qwerty is missing"
# We expect our file here to have a size of 64Kb and all the bytes having the
# value 0xff.
echo "file 'hello' content after log replay:"
od -t x1 $SCRATCH_MNT/hello
# Now remove all files/links, under our test directory 'mydir', and verify we
# can remove all the directories.
rm -f $SCRATCH_MNT/mydir/x/y/z/*
rmdir $SCRATCH_MNT/mydir/x/y/z
rm -f $SCRATCH_MNT/mydir/x/y/*
rmdir $SCRATCH_MNT/mydir/x/y
rmdir $SCRATCH_MNT/mydir/x
rm -f $SCRATCH_MNT/mydir/*
rmdir $SCRATCH_MNT/mydir
# An fsck, run by the fstests framework every time a test finishes, also detected
# the inconsistency and printed the following error message:
#
# root 5 inode 257 errors 2001, no inode item, link count wrong
# unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
# unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref
status=0
exit
The expected golden output for the test is:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 65536/65536 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File 'foo' content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000
file 'foo' link count after log replay: 5
file 'hello' content after log replay:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0200000
Which is the output after this patch and when running the test against
ext3/4, xfs, f2fs, reiserfs or nilfs2. Without this patch, the test's
output is:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 65536/65536 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File 'foo' content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000
file 'foo' link count after log replay: 1
Link mydir/foo_2 is missing
Link mydir/foo_3 is missing
Link mydir/x/y/foo_y_link is missing
Link mydir/x/y/z/foo_z_link is missing
File mydir/x/y/z/qwerty is missing
file 'hello' content after log replay:
0000000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
*
0200000
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y/z': No such file or directory
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x/y': No such file or directory
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/x': No such file or directory
rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_2': Stale file handle
rm: cannot remove '/home/fdmanana/btrfs-tests/scratch_1/mydir/foo_3': Stale file handle
rmdir: failed to remove '/home/fdmanana/btrfs-tests/scratch_1/mydir': Directory not empty
Fsck, without this fix, also complains about the wrong link count:
root 5 inode 257 errors 2001, no inode item, link count wrong
unresolved ref dir 258 index 2 namelen 5 name foo_2 filetype 1 errors 4, no inode ref
unresolved ref dir 258 index 3 namelen 5 name foo_3 filetype 1 errors 4, no inode ref
So fix this by logging the inodes that the dentries point to when
fsyncing a directory.
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-03-20 20:19:46 +03:00
spin_unlock ( & BTRFS_I ( inode ) - > lock ) ;
2013-09-04 17:04:40 +04:00
if ( num_written > 0 ) {
2011-01-25 22:57:24 +03:00
err = generic_write_sync ( file , pos , num_written ) ;
2014-04-04 01:47:17 +04:00
if ( err < 0 )
2007-10-29 21:36:41 +03:00
num_written = err ;
}
2013-01-28 16:34:55 +04:00
2012-11-16 22:56:32 +04:00
if ( sync )
atomic_dec ( & BTRFS_I ( inode ) - > sync_writers ) ;
2013-01-28 16:34:55 +04:00
out :
2007-06-12 14:35:45 +04:00
current - > backing_dev_info = NULL ;
return num_written ? num_written : err ;
}
2009-01-06 05:25:51 +03:00
int btrfs_release_file ( struct inode * inode , struct file * filp )
2008-05-27 18:55:43 +04:00
{
2008-06-10 18:07:39 +04:00
if ( filp - > private_data )
btrfs_ioctl_trans_end ( filp ) ;
2014-08-20 18:15:33 +04:00
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non - zero size to a zero size . This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place .
*/
if ( test_and_clear_bit ( BTRFS_INODE_ORDERED_DATA_CLOSE ,
& BTRFS_I ( inode ) - > runtime_flags ) )
filemap_flush ( inode - > i_mapping ) ;
2008-05-27 18:55:43 +04:00
return 0 ;
}
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).
This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:
"Right before acquiring the inode's mutex, we might have new
writes dirtying pages, which won't immediately start the
respective ordered operations - that is done through the
fill_delalloc callbacks invoked from the writepage and
writepages address space operations. So make sure we start
all ordered operations before starting to log our inode. Not
doing this means that while logging the inode, writeback
could start and invoke writepage/writepages, which would call
the fill_delalloc callbacks (cow_file_range,
submit_compressed_extents). These callbacks add first an
extent map to the modified list of extents and then create
the respective ordered operation, which means in
tree-log.c:btrfs_log_inode() we might capture all existing
ordered operations (with btrfs_get_logged_extents()) before
the fill_delalloc callback adds its ordered operation, and by
the time we visit the modified list of extent maps (with
btrfs_log_changed_extents()), we see and process the extent
map they created. We then use the extent map to construct a
file extent item for logging without waiting for the
respective ordered operation to finish - this file extent
item points to a disk location that might not have yet been
written to, containing random data - so after a crash a log
replay will make our inode have file extent items that point
to disk locations containing invalid data, as we returned
success to userspace without waiting for the respective
ordered operation to finish, because it wasn't captured by
btrfs_get_logged_extents()."
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-02 14:09:58 +04:00
/*
 * Call btrfs_fdatawrite_range() on [@start, @end] of @inode while the
 * inode's sync_writers counter is held elevated.
 *
 * The counter is incremented before the call and decremented afterwards,
 * regardless of the outcome.
 * NOTE(review): sync_writers presumably tells the write path that a
 * synchronous writer is active - confirm against its readers.
 *
 * Returns whatever btrfs_fdatawrite_range() returned.
 */
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
	atomic_t *writers = &BTRFS_I(inode)->sync_writers;
	int err;

	atomic_inc(writers);
	err = btrfs_fdatawrite_range(inode, start, end);
	atomic_dec(writers);

	return err;
}
2008-09-29 23:18:18 +04:00
/*
* fsync call for both files and directories . This logs the inode into
* the tree log instead of forcing full commits whenever possible .
*
* It needs to call filemap_fdatawait so that all ordered extent updates
* in the metadata btree are up to date for copying to the log .
*
* It drops the inode mutex before doing the tree log commit . This is an
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk .
*/
2011-07-17 04:44:56 +04:00
int btrfs_sync_file ( struct file * file , loff_t start , loff_t end , int datasync )
2007-06-12 14:35:45 +04:00
{
2010-05-26 19:53:25 +04:00
struct dentry * dentry = file - > f_path . dentry ;
2015-03-18 01:25:59 +03:00
struct inode * inode = d_inode ( dentry ) ;
2007-06-12 14:35:45 +04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_trans_handle * trans ;
2014-02-20 14:08:58 +04:00
struct btrfs_log_ctx ctx ;
int ret = 0 ;
2012-10-12 23:27:49 +04:00
bool full_sync = 0 ;
2015-03-31 16:16:52 +03:00
const u64 len = end - start + 1 ;
2007-06-12 14:35:45 +04:00
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordered_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are critical resources and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when an inode is created, and when an inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
information in btrfs, and these are helpful for tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 14:18:59 +03:00
trace_btrfs_sync_file ( file , datasync ) ;
2009-10-13 21:21:08 +04:00
2012-09-13 14:53:47 +04:00
/*
* We write the dirty pages in the range and wait until they complete
* out of the - > i_mutex . If so , we can flush the dirty pages by
2012-10-12 23:27:49 +04:00
* multi - task , and make the performance up . See
* btrfs_wait_ordered_range for an explanation of the ASYNC check .
2012-09-13 14:53:47 +04:00
*/
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).
This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:
"Right before acquiring the inode's mutex, we might have new
writes dirtying pages, which won't immediately start the
respective ordered operations - that is done through the
fill_delalloc callbacks invoked from the writepage and
writepages address space operations. So make sure we start
all ordered operations before starting to log our inode. Not
doing this means that while logging the inode, writeback
could start and invoke writepage/writepages, which would call
the fill_delalloc callbacks (cow_file_range,
submit_compressed_extents). These callbacks add first an
extent map to the modified list of extents and then create
the respective ordered operation, which means in
tree-log.c:btrfs_log_inode() we might capture all existing
ordered operations (with btrfs_get_logged_extents()) before
the fill_delalloc callback adds its ordered operation, and by
the time we visit the modified list of extent maps (with
btrfs_log_changed_extents()), we see and process the extent
map they created. We then use the extent map to construct a
file extent item for logging without waiting for the
respective ordered operation to finish - this file extent
item points to a disk location that might not have yet been
written to, containing random data - so after a crash a log
replay will make our inode have file extent items that point
to disk locations containing invalid data, as we returned
success to userspace without waiting for the respective
ordered operation to finish, because it wasn't captured by
btrfs_get_logged_extents()."
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-02 14:09:58 +04:00
ret = start_ordered_ops ( inode , start , end ) ;
2012-09-13 14:53:47 +04:00
if ( ret )
return ret ;
2011-07-17 04:44:56 +04:00
mutex_lock ( & inode - > i_mutex ) ;
2012-09-06 14:04:27 +04:00
atomic_inc ( & root - > log_batch ) ;
2012-10-12 23:27:49 +04:00
full_sync = test_bit ( BTRFS_INODE_NEEDS_FULL_SYNC ,
& BTRFS_I ( inode ) - > runtime_flags ) ;
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).
This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:
"Right before acquiring the inode's mutex, we might have new
writes dirtying pages, which won't immediately start the
respective ordered operations - that is done through the
fill_delalloc callbacks invoked from the writepage and
writepages address space operations. So make sure we start
all ordered operations before starting to log our inode. Not
doing this means that while logging the inode, writeback
could start and invoke writepage/writepages, which would call
the fill_delalloc callbacks (cow_file_range,
submit_compressed_extents). These callbacks add first an
extent map to the modified list of extents and then create
the respective ordered operation, which means in
tree-log.c:btrfs_log_inode() we might capture all existing
ordered operations (with btrfs_get_logged_extents()) before
the fill_delalloc callback adds its ordered operation, and by
the time we visit the modified list of extent maps (with
btrfs_log_changed_extents()), we see and process the extent
map they created. We then use the extent map to construct a
file extent item for logging without waiting for the
respective ordered operation to finish - this file extent
item points to a disk location that might not have yet been
written to, containing random data - so after a crash a log
replay will make our inode have file extent items that point
to disk locations containing invalid data, as we returned
success to userspace without waiting for the respective
ordered operation to finish, because it wasn't captured by
btrfs_get_logged_extents()."
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-02 14:09:58 +04:00
/*
* We might have had more pages made dirty after calling
* start_ordered_ops and before acquiring the inode ' s i_mutex .
*/
2013-10-26 00:13:35 +04:00
if ( full_sync ) {
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).
This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:
"Right before acquiring the inode's mutex, we might have new
writes dirtying pages, which won't immediately start the
respective ordered operations - that is done through the
fill_delalloc callbacks invoked from the writepage and
writepages address space operations. So make sure we start
all ordered operations before starting to log our inode. Not
doing this means that while logging the inode, writeback
could start and invoke writepage/writepages, which would call
the fill_delalloc callbacks (cow_file_range,
submit_compressed_extents). These callbacks add first an
extent map to the modified list of extents and then create
the respective ordered operation, which means in
tree-log.c:btrfs_log_inode() we might capture all existing
ordered operations (with btrfs_get_logged_extents()) before
the fill_delalloc callback adds its ordered operation, and by
the time we visit the modified list of extent maps (with
btrfs_log_changed_extents()), we see and process the extent
map they created. We then use the extent map to construct a
file extent item for logging without waiting for the
respective ordered operation to finish - this file extent
item points to a disk location that might not have yet been
written to, containing random data - so after a crash a log
replay will make our inode have file extent items that point
to disk locations containing invalid data, as we returned
success to userspace without waiting for the respective
ordered operation to finish, because it wasn't captured by
btrfs_get_logged_extents()."
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-02 14:09:58 +04:00
/*
* For a full sync , we need to make sure any ordered operations
* start and finish before we start logging the inode , so that
* all extents are persisted and the respective file extent
* items are in the fs / subvol btree .
*/
2015-03-31 16:16:52 +03:00
ret = btrfs_wait_ordered_range ( inode , start , len ) ;
Btrfs: fix fsync race leading to invalid data after log replay
When the fsync callback (btrfs_sync_file) starts, it first waits for
the writeback of any dirty pages to start and finish without holding
the inode's mutex (to reduce contention). After this it acquires the
inode's mutex and repeats that process via btrfs_wait_ordered_range
only if we're doing a full sync (BTRFS_INODE_NEEDS_FULL_SYNC flag
is set on the inode).
This is not safe for a non full sync - we need to start and wait for
writeback to finish for any pages that might have been made dirty
before acquiring the inode's mutex and after that first step mentioned
before. Why this is needed is explained by the following comment added
to btrfs_sync_file:
"Right before acquiring the inode's mutex, we might have new
writes dirtying pages, which won't immediately start the
respective ordered operations - that is done through the
fill_delalloc callbacks invoked from the writepage and
writepages address space operations. So make sure we start
all ordered operations before starting to log our inode. Not
doing this means that while logging the inode, writeback
could start and invoke writepage/writepages, which would call
the fill_delalloc callbacks (cow_file_range,
submit_compressed_extents). These callbacks add first an
extent map to the modified list of extents and then create
the respective ordered operation, which means in
tree-log.c:btrfs_log_inode() we might capture all existing
ordered operations (with btrfs_get_logged_extents()) before
the fill_delalloc callback adds its ordered operation, and by
the time we visit the modified list of extent maps (with
btrfs_log_changed_extents()), we see and process the extent
map they created. We then use the extent map to construct a
file extent item for logging without waiting for the
respective ordered operation to finish - this file extent
item points to a disk location that might not have yet been
written to, containing random data - so after a crash a log
replay will make our inode have file extent items that point
to disk locations containing invalid data, as we returned
success to userspace without waiting for the respective
ordered operation to finish, because it wasn't captured by
btrfs_get_logged_extents()."
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-02 14:09:58 +04:00
} else {
/*
* Start any new ordered operations before starting to log the
* inode . We will wait for them to finish in btrfs_sync_log ( ) .
*
* Right before acquiring the inode ' s mutex , we might have new
* writes dirtying pages , which won ' t immediately start the
* respective ordered operations - that is done through the
* fill_delalloc callbacks invoked from the writepage and
* writepages address space operations . So make sure we start
* all ordered operations before starting to log our inode . Not
* doing this means that while logging the inode , writeback
* could start and invoke writepage / writepages , which would call
* the fill_delalloc callbacks ( cow_file_range ,
* submit_compressed_extents ) . These callbacks add first an
* extent map to the modified list of extents and then create
* the respective ordered operation , which means in
* tree - log . c : btrfs_log_inode ( ) we might capture all existing
* ordered operations ( with btrfs_get_logged_extents ( ) ) before
* the fill_delalloc callback adds its ordered operation , and by
* the time we visit the modified list of extent maps ( with
* btrfs_log_changed_extents ( ) ) , we see and process the extent
* map they created . We then use the extent map to construct a
* file extent item for logging without waiting for the
* respective ordered operation to finish - this file extent
* item points to a disk location that might not have yet been
* written to , containing random data - so after a crash a log
* replay will make our inode have file extent items that point
* to disk locations containing invalid data , as we returned
* success to userspace without waiting for the respective
* ordered operation to finish , because it wasn ' t captured by
* btrfs_get_logged_extents ( ) .
*/
ret = start_ordered_ops ( inode , start , end ) ;
}
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
goto out ;
2013-10-26 00:13:35 +04:00
}
2012-09-06 14:04:27 +04:00
atomic_inc ( & root - > log_batch ) ;
2009-10-13 21:21:08 +04:00
2007-06-12 14:35:45 +04:00
/*
Btrfs: fix data loss in the fast fsync path
When using the fast file fsync code path we can miss the fact that new
writes happened since the last file fsync and therefore return without
waiting for the IO to finish and write the new extents to the fsync log.
Here's an example scenario where the fsync will miss the fact that new
file data exists that wasn't yet durably persisted:
1. fs_info->last_trans_committed == N - 1 and current transaction is
transaction N (fs_info->generation == N);
2. do a buffered write;
3. fsync our inode, this clears our inode's full sync flag, starts
an ordered extent and waits for it to complete - when it completes
at btrfs_finish_ordered_io(), the inode's last_trans is set to the
value N (via btrfs_update_inode_fallback -> btrfs_update_inode ->
btrfs_set_inode_last_trans);
4. transaction N is committed, so fs_info->last_trans_committed is now
set to the value N and fs_info->generation remains with the value N;
5. do another buffered write, when this happens btrfs_file_write_iter
sets our inode's last_trans to the value N + 1 (that is
fs_info->generation + 1 == N + 1);
6. transaction N + 1 is started and fs_info->generation now has the
value N + 1;
7. transaction N + 1 is committed, so fs_info->last_trans_committed
is set to the value N + 1;
8. fsync our inode - because it doesn't have the full sync flag set,
we only start the ordered extent, we don't wait for it to complete
(only in a later phase) therefore its last_trans field has the
value N + 1 set previously by btrfs_file_write_iter(), and so we
have:
inode->last_trans <= fs_info->last_trans_committed
(N + 1) (N + 1)
Which made us not log the last buffered write and exit the fsync
handler immediately, returning success (0) to user space and resulting
in data loss after a crash.
This can actually be triggered deterministically and the following excerpt
from a testcase I made for xfstests triggers the issue. It moves a dummy
file across directories and then fsyncs the old parent directory - this
is just to trigger a transaction commit, so moving files around isn't
directly related to the issue but it was chosen because running 'sync' for
example does more than just committing the current transaction, as it
flushes/waits for all file data to be persisted. The issue can also happen
at random periods, since the transaction kthread periodically commits the
current transaction (about every 30 seconds by default).
The body of the test is:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our main test file 'foo', the one we check for data loss.
# By doing an fsync against our file, it makes btrfs clear the 'needs_full_sync'
# bit from its flags (btrfs inode specific flags).
$XFS_IO_PROG -f -c "pwrite -S 0xaa 0 8K" \
-c "fsync" $SCRATCH_MNT/foo | _filter_xfs_io
# Now create one other file and 2 directories. We will move this second file
# from one directory to the other later because it forces btrfs to commit its
# currently open transaction if we fsync the old parent directory. This is
# necessary to trigger the data loss bug that affected btrfs.
mkdir $SCRATCH_MNT/testdir_1
touch $SCRATCH_MNT/testdir_1/bar
mkdir $SCRATCH_MNT/testdir_2
# Make sure everything is durably persisted.
sync
# Write more 8Kb of data to our file.
$XFS_IO_PROG -c "pwrite -S 0xbb 8K 8K" $SCRATCH_MNT/foo | _filter_xfs_io
# Move our 'bar' file into a new directory.
mv $SCRATCH_MNT/testdir_1/bar $SCRATCH_MNT/testdir_2/bar
# Fsync our first directory. Because it had a file moved into some other
# directory, this made btrfs commit the currently open transaction. This is
# a condition necessary to trigger the data loss bug.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir_1
# Now fsync our main test file. If the fsync succeeds, we expect the 8Kb of
# data we wrote previously to be persisted and available if a crash happens.
# This did not happen with btrfs, because of the transaction commit that
# happened when we fsynced the parent directory.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Now check that all data we wrote before are available.
echo "File content after log replay:"
od -t x1 $SCRATCH_MNT/foo
status=0
exit
The expected golden output for the test, which is what we get with this
fix applied (or when running against ext3/4 and xfs), is:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 8192/8192 bytes at offset 8192
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb
*
0040000
Without this fix applied, the output shows the test file does not have
the second 8Kb extent that we successfully fsynced:
wrote 8192/8192 bytes at offset 0
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 8192/8192 bytes at offset 8192
XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
File content after log replay:
0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa
*
0020000
So fix this by skipping the fsync only if we're doing a full sync and
if the inode's last_trans is <= fs_info->last_trans_committed, or if
the inode is already in the log. Also remove setting the inode's
last_trans in btrfs_file_write_iter since it's useless/unreliable.
Also because btrfs_file_write_iter no longer sets inode->last_trans to
fs_info->generation + 1, don't set last_trans to 0 if we bail out and don't
bail out if last_trans is 0, otherwise something as simple as the following
example wouldn't log the second write on the last fsync:
1. write to file
2. fsync file
3. fsync file
|--> btrfs_inode_in_log() returns true and it set last_trans to 0
4. write to file
|--> btrfs_file_write_iter() no longer sets last_trans, so it
remained with a value of 0
5. fsync
|--> inode->last_trans == 0, so it bails out without logging the
second write
A test case for xfstests will be sent soon.
CC: <stable@vger.kernel.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-03-01 23:36:00 +03:00
* If the last transaction that changed this file was before the current
* transaction and we have the full sync flag set in our inode , we can
* bail out now without any syncing .
*
* Note that we can ' t bail out if the full sync flag isn ' t set . This is
* because when the full sync flag is set we start all ordered extents
* and wait for them to fully complete - when they complete they update
* the inode ' s last_trans field through :
*
* btrfs_finish_ordered_io ( ) - >
* btrfs_update_inode_fallback ( ) - >
* btrfs_update_inode ( ) - >
* btrfs_set_inode_last_trans ( )
*
* So we are sure that last_trans is up to date and can do this check to
* bail out safely . For the fast path , when the full sync flag is not
* set in our inode , we can not do it because we start only our ordered
* extents and don ' t wait for them to complete ( that is when
* btrfs_finish_ordered_io runs ) , so here at this point their last_trans
* value might be less than or equals to fs_info - > last_trans_committed ,
* and setting a speculative last_trans for an inode when a buffered
* write is made ( such as fs_info - > generation + 1 for example ) would not
* be reliable since after setting the value and before fsync is called
* any number of transactions can start and commit ( transaction kthread
* commits the current transaction periodically ) , and a transaction
* commit does not start nor waits for ordered extents to complete .
2009-10-13 21:21:08 +04:00
*/
2011-04-12 01:25:13 +04:00
smp_mb ( ) ;
2012-05-30 00:57:49 +04:00
if ( btrfs_inode_in_log ( inode , root - > fs_info - > generation ) | |
2015-03-31 16:16:52 +03:00
( BTRFS_I ( inode ) - > last_trans < =
root - > fs_info - > last_trans_committed & &
( full_sync | |
! btrfs_have_ordered_extents_in_range ( inode , start , len ) ) ) ) {
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
/*
* We ' v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason , it ' s no longer relevant .
*/
clear_bit ( BTRFS_INODE_NEEDS_FULL_SYNC ,
& BTRFS_I ( inode ) - > runtime_flags ) ;
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2007-08-11 00:22:09 +04:00
goto out ;
}
/*
2007-08-28 00:49:44 +04:00
* ok we haven ' t committed the transaction yet , lets do a commit
*/
2010-05-29 13:49:07 +04:00
if ( file - > private_data )
2008-06-10 18:07:39 +04:00
btrfs_ioctl_trans_end ( file ) ;
2014-01-15 22:34:13 +04:00
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log , which could require joining a transaction ( for
* example checking cross references in the nocow path ) . If we use join
* here we could get into a situation where we ' re waiting on IO to
* happen that is blocked on a transaction trying to commit . With start
* we inc the extwriter counter , so we wait for all extwriters to exit
* before we start blocking join ' ers . This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction * cough * Josef * cough * .
*/
2010-05-16 18:48:46 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2007-06-12 14:35:45 +04:00
goto out ;
}
2014-01-15 22:34:13 +04:00
trans - > sync = true ;
2008-09-06 00:13:11 +04:00
2014-02-20 14:08:58 +04:00
btrfs_init_log_ctx ( & ctx ) ;
2014-09-07 01:34:39 +04:00
ret = btrfs_log_dentry_safe ( trans , root , dentry , start , end , & ctx ) ;
2011-07-17 04:44:56 +04:00
if ( ret < 0 ) {
2013-09-11 23:36:44 +04:00
/* Fallthrough and commit/free transaction. */
ret = 1 ;
2011-07-17 04:44:56 +04:00
}
2008-09-11 23:53:12 +04:00
/* we've logged all the items and now have a consistent
* version of the file in the log . It is possible that
* someone will come in and modify the file , but that ' s
* fine because the log is consistent on disk , and we
* have references to all of the file ' s extents
*
* It is possible that someone will come in and log the
* file again , but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe .
*/
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2008-09-11 23:53:12 +04:00
Btrfs: fix data corruption after fast fsync and writeback error
When we do a fast fsync, we start all ordered operations and then while
they're running in parallel we visit the list of modified extent maps
and construct their matching file extent items and write them to the
log btree. After that, in btrfs_sync_log() we wait for all the ordered
operations to finish (via btrfs_wait_logged_extents).
The problem with this is that we were completely ignoring errors that
can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM
or -EIO errors for example. When such error happens, it means we have parts
of the on disk extent that weren't written to, and so we end up logging
file extent items that point to these extents that contain garbage/random
data - so after a crash/reboot plus log replay, we get our inode's metadata
pointing to those extents.
This worked in contrast with the full (non-fast) fsync path, where we
start all ordered operations, wait for them to finish and then write
to the log btree. In this path, after each ordered operation completes
we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return
-EIO if so (via btrfs_wait_ordered_range).
So if an error happens with any ordered operation, just return a -EIO
error to userspace, so that it knows that not all of its previous writes
were durably persisted and the application can take proper action (like
redo the writes for e.g.) - and definitely not leave any file extent items
in the log refer to non fully written extents.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-05 18:14:39 +04:00
/*
* If any of the ordered extents had an error , just return it to user
* space , so that the application knows some writes didn ' t succeed and
* can take proper action ( retry for e . g . ) . Blindly committing the
* transaction in this case , would fool userspace that everything was
* successful . And we also want to make sure our log doesn ' t contain
* file extent items pointing to extents that weren ' t fully written to -
* just like in the non fast fsync path , where we check for the ordered
* operation ' s error flag before writing to the log tree and return - EIO
* if any of them had this flag set ( btrfs_wait_ordered_range ) -
* therefore we need to check for errors in the ordered operations ,
* which are indicated by ctx . io_err .
*/
if ( ctx . io_err ) {
btrfs_end_transaction ( trans , root ) ;
ret = ctx . io_err ;
goto out ;
}
2009-10-13 21:21:08 +04:00
if ( ret ! = BTRFS_NO_LOG_SYNC ) {
2013-10-26 00:13:35 +04:00
if ( ! ret ) {
2014-02-20 14:08:58 +04:00
ret = btrfs_sync_log ( trans , root , & ctx ) ;
2013-10-26 00:13:35 +04:00
if ( ! ret ) {
2009-10-13 21:21:08 +04:00
ret = btrfs_end_transaction ( trans , root ) ;
2013-10-26 00:13:35 +04:00
goto out ;
2012-10-12 23:27:49 +04:00
}
2009-10-13 21:21:08 +04:00
}
2013-10-26 00:13:35 +04:00
if ( ! full_sync ) {
ret = btrfs_wait_ordered_range ( inode , start ,
end - start + 1 ) ;
2014-05-30 02:31:39 +04:00
if ( ret ) {
btrfs_end_transaction ( trans , root ) ;
2013-10-26 00:13:35 +04:00
goto out ;
2014-05-30 02:31:39 +04:00
}
2013-10-26 00:13:35 +04:00
}
ret = btrfs_commit_transaction ( trans , root ) ;
2009-10-13 21:21:08 +04:00
} else {
ret = btrfs_end_transaction ( trans , root ) ;
2008-09-06 00:13:11 +04:00
}
2007-06-12 14:35:45 +04:00
out :
2010-01-29 13:42:11 +03:00
return ret > 0 ? - EIO : ret ;
2007-06-12 14:35:45 +04:00
}
2009-09-27 22:29:37 +04:00
static const struct vm_operations_struct btrfs_file_vm_ops = {
2007-07-25 20:31:35 +04:00
. fault = filemap_fault ,
2014-04-08 02:37:19 +04:00
. map_pages = filemap_map_pages ,
2007-06-15 21:50:00 +04:00
. page_mkwrite = btrfs_page_mkwrite ,
} ;
static int btrfs_file_mmap ( struct file * filp , struct vm_area_struct * vma )
{
2010-05-20 11:21:50 +04:00
struct address_space * mapping = filp - > f_mapping ;
if ( ! mapping - > a_ops - > readpage )
return - ENOEXEC ;
2007-06-15 21:50:00 +04:00
file_accessed ( filp ) ;
2010-05-20 11:21:50 +04:00
vma - > vm_ops = & btrfs_file_vm_ops ;
2007-06-15 21:50:00 +04:00
return 0 ;
}
2012-08-29 22:27:18 +04:00
/*
 * Check whether the item at @slot in @leaf is a hole extent of @inode that
 * is adjacent to the range delimited by @start and @end.
 *
 * Returns 1 when the item is a BTRFS_FILE_EXTENT_REG file extent item that
 * belongs to @inode, has a zero disk bytenr, and either starts exactly at
 * @end or finishes exactly at @start.  Returns 0 in every other case,
 * including when @slot lies outside the items of @leaf.
 */
static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
			  int slot, u64 start, u64 end)
{
	struct btrfs_file_extent_item *extent;
	struct btrfs_key found;

	/* The slot has to name an existing item in this leaf. */
	if (slot < 0)
		return 0;
	if (slot >= btrfs_header_nritems(leaf))
		return 0;

	/* Only file extent items of this very inode qualify. */
	btrfs_item_key_to_cpu(leaf, &found, slot);
	if (found.objectid != btrfs_ino(inode))
		return 0;
	if (found.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

	/* It must be a regular extent ... */
	if (btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_REG)
		return 0;

	/* ... whose disk bytenr is zero. */
	if (btrfs_file_extent_disk_bytenr(leaf, extent))
		return 0;

	/*
	 * Adjacent either on the right (item starts at @end) or on the left
	 * (item finishes at @start).  The length is only read when the first
	 * test fails.
	 */
	return found.offset == end ||
	       found.offset + btrfs_file_extent_num_bytes(leaf, extent) == start;
}
/*
 * Record the byte range from @offset up to (not including) @end of @inode
 * as a hole, both in the tree and in the in-memory extent map cache.
 *
 * Tree update (skipped entirely when the filesystem has the NO_HOLES
 * incompat feature):
 *   - if the item just before the search position is a mergeable hole
 *     (see hole_mergeable()), it is grown by end - offset bytes;
 *   - else if the item at the search position is a mergeable hole, its key
 *     is moved to @offset and it is grown by end - offset bytes;
 *   - else a new item is inserted through btrfs_insert_file_extent().
 *
 * Cache update: every cached mapping in the range is dropped and replaced
 * by one EXTENT_MAP_HOLE mapping.  If that mapping cannot be allocated or
 * added, BTRFS_INODE_NEEDS_FULL_SYNC is set on the inode instead and the
 * function still reports success.
 *
 * @path is released before returning on every path except the early error
 * returns from the tree search.
 *
 * Returns 0 on success, a negative errno from btrfs_search_slot() or
 * btrfs_insert_file_extent(), or -EINVAL when an extent item already exists
 * at exactly @offset.
 */
static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
		      struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct btrfs_key key;
	int ret;

	/* With NO_HOLES nothing is written to the tree for a hole. */
	if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
		goto out;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret <= 0) {
		/*
		 * ret == 0 means an extent item already sits at exactly
		 * @offset, which this function does not expect (presumably the
		 * caller has dropped the extents in the range beforehand -
		 * verify against the caller).  Fail the operation instead of
		 * taking the whole machine down with BUG_ON().
		 */
		if (ret == 0)
			ret = -EINVAL;
		return ret;
	}

	leaf = path->nodes[0];

	/* Try to extend a hole item located right before the search slot. */
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
		u64 num_bytes;

		path->slots[0]--;
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
			end - offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}

	/* Try to extend a hole item at the search slot backwards to @offset. */
	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
		u64 num_bytes;

		key.offset = offset;
		btrfs_set_item_key_safe(root->fs_info, path, &key);
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
			offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}

	/* Nothing to merge with: insert a dedicated item for the hole. */
	btrfs_release_path(path);

	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
				       0, 0, end - offset, 0, end - offset,
				       0, 0, 0);
	if (ret)
		return ret;

out:
	btrfs_release_path(path);

	hole_em = alloc_extent_map();
	if (!hole_em) {
		/*
		 * No memory for the cached mapping: drop whatever is cached
		 * for the range and flag the inode as needing a full sync.
		 */
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);
	} else {
		hole_em->start = offset;
		hole_em->len = end - offset;
		hole_em->ram_bytes = hole_em->len;
		hole_em->orig_start = offset;

		hole_em->block_start = EXTENT_MAP_HOLE;
		hole_em->block_len = 0;
		hole_em->orig_block_len = 0;
		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
		hole_em->compress_type = BTRFS_COMPRESS_NONE;
		hole_em->generation = trans->transid;

		/*
		 * Drop the cached mappings and add ours; start over for as
		 * long as the insertion reports -EEXIST.
		 */
		do {
			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, hole_em, 1);
			write_unlock(&em_tree->lock);
		} while (ret == -EEXIST);
		free_extent_map(hole_em);
		if (ret)
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
	}

	return 0;
}
2014-05-30 11:16:10 +04:00
/*
 * Skip over a hole at the beginning of the range *start .. *start + *len.
 *
 * The extent map returned by btrfs_get_extent() for the range is examined.
 * If it is a hole (block_start == EXTENT_MAP_HOLE; this includes the
 * "vacuum" extents that only exist in no-holes mode), *start is advanced
 * to the end of that mapping and *len is shrunk to what is left of the
 * original range past that point - 0 when the hole extends beyond the end
 * of the range.
 *
 * Returns 1 when a hole was found and *start / *len were updated, 0 when
 * the mapping is not a hole (nothing is modified), -ENOMEM when the lookup
 * returned NULL, or the errno carried by the returned error pointer.
 */
static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
{
	struct extent_map *map;
	u64 range_end;
	u64 hole_end;

	map = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
	if (IS_ERR_OR_NULL(map))
		return map ? PTR_ERR(map) : -ENOMEM;

	/* Not a hole: leave the caller's range alone. */
	if (map->block_start != EXTENT_MAP_HOLE) {
		free_extent_map(map);
		return 0;
	}

	hole_end = map->start + map->len;
	range_end = *start + *len;

	*len = hole_end > range_end ? 0 : range_end - hole_end;
	*start = hole_end;

	free_extent_map(map);
	return 1;
}
2012-08-29 22:27:18 +04:00
static int btrfs_punch_hole ( struct inode * inode , loff_t offset , loff_t len )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct extent_state * cached_state = NULL ;
struct btrfs_path * path ;
struct btrfs_block_rsv * rsv ;
struct btrfs_trans_handle * trans ;
2014-05-30 11:16:10 +04:00
u64 lockstart ;
u64 lockend ;
u64 tail_start ;
u64 tail_len ;
u64 orig_start = offset ;
u64 cur_offset ;
2012-08-29 22:27:18 +04:00
u64 min_size = btrfs_calc_trunc_metadata_size ( root , 1 ) ;
u64 drop_end ;
int ret = 0 ;
int err = 0 ;
2015-09-23 00:00:07 +03:00
unsigned int rsv_count ;
2014-05-30 11:16:10 +04:00
bool same_page ;
2013-10-22 20:18:51 +04:00
bool no_holes = btrfs_fs_incompat ( root - > fs_info , NO_HOLES ) ;
2014-04-26 04:35:31 +04:00
u64 ino_size ;
Btrfs: add missing inode update when punching hole
When punching a file hole if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
bool truncated_page = false ;
bool updated_inode = false ;
2012-08-29 22:27:18 +04:00
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , offset , len ) ;
if ( ret )
return ret ;
2012-08-29 22:27:18 +04:00
mutex_lock ( & inode - > i_mutex ) ;
2014-04-26 04:35:31 +04:00
ino_size = round_up ( inode - > i_size , PAGE_CACHE_SIZE ) ;
2014-05-30 11:16:10 +04:00
ret = find_first_non_hole ( inode , & offset , & len ) ;
if ( ret < 0 )
goto out_only_mutex ;
if ( ret & & ! len ) {
/* Already in a large hole */
ret = 0 ;
goto out_only_mutex ;
}
btrfs: Use right extent length when inserting overlap extent map.
When current btrfs finds that a new extent map is going to be inserted
but failed with -EEXIST, it will try again to insert the extent map
but with the length of sectorsize.
This is OK if we don't enable 'no-holes' feature since all extent space
is continuous, we will not go into the not found->insert routine.
But if we enable 'no-holes' feature, it will make things out of control.
e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent():
btrfs_get_extent() args: start: 27874 len 4100
28672 27874 28672 27874+4100 32768
|-----------------------|
|---------hole--------------------|---------data----------|
1) not found and insert
Since no extent map containing the range, btrfs_get_extent() will go
into the not_found and insert routine, which will try to insert the
extent map (27874, 27874 + 4100).
2) first overlap
But it overlaps with (28672, 32768) extent, so -EEXIST will be returned
by add_extent_mapping().
3) retry but still overlap
After catching the -EEXIST, then btrfs_get_extent() will try insert it
again but with 4K length, which still overlaps, so -EEXIST will be
returned.
This makes the following patch fail to punch hole.
d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole.
This patch will use the right length, which is the (existing->start -
em->start) to insert, making the above patch work in 'no-holes' mode.
Also, some small code style problems in above patch are fixed too.
Reported-by: Filipe David Manana <fdmanana@gmail.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe David Manana <fdmanana@suse.com>
Tested-by: Filipe David Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-08 09:06:20 +04:00
lockstart = round_up ( offset , BTRFS_I ( inode ) - > root - > sectorsize ) ;
2014-05-30 11:16:10 +04:00
lockend = round_down ( offset + len ,
BTRFS_I ( inode ) - > root - > sectorsize ) - 1 ;
same_page = ( ( offset > > PAGE_CACHE_SHIFT ) = =
( ( offset + len - 1 ) > > PAGE_CACHE_SHIFT ) ) ;
2012-12-05 14:54:52 +04:00
/*
* We needn ' t truncate any page which is beyond the end of the file
* because we are sure there is no data there .
*/
2012-08-29 22:27:18 +04:00
/*
* Only do this if we are in the same page and we aren ' t doing the
* entire page .
*/
if ( same_page & & len < PAGE_CACHE_SIZE ) {
Btrfs: add missing inode update when punching hole
When punching a file hole if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
if ( offset < ino_size ) {
truncated_page = true ;
2012-12-05 14:54:52 +04:00
ret = btrfs_truncate_page ( inode , offset , len , 0 ) ;
Btrfs: add missing inode update when punching hole
When punching a file hole if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
} else {
ret = 0 ;
}
2014-05-30 11:16:10 +04:00
goto out_only_mutex ;
2012-08-29 22:27:18 +04:00
}
/* zero back part of the first page */
2014-02-15 19:55:58 +04:00
if ( offset < ino_size ) {
Btrfs: add missing inode update when punching hole
When punching a file hole, if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
truncated_page = true ;
2012-12-05 14:54:52 +04:00
ret = btrfs_truncate_page ( inode , offset , 0 , 0 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
2012-08-29 22:27:18 +04:00
}
2014-05-30 11:16:10 +04:00
/* Check the aligned pages after the first unaligned page,
* if offset ! = orig_start , which means the first unaligned page
* including serveral following pages are already in holes ,
* the extra check can be skipped */
if ( offset = = orig_start ) {
/* after truncate page, check hole again */
len = offset + len - lockstart ;
offset = lockstart ;
ret = find_first_non_hole ( inode , & offset , & len ) ;
if ( ret < 0 )
goto out_only_mutex ;
if ( ret & & ! len ) {
ret = 0 ;
goto out_only_mutex ;
}
lockstart = offset ;
}
/* Check the tail unaligned part is in a hole */
tail_start = lockend + 1 ;
tail_len = offset + len - tail_start ;
if ( tail_len ) {
ret = find_first_non_hole ( inode , & tail_start , & tail_len ) ;
if ( unlikely ( ret < 0 ) )
goto out_only_mutex ;
if ( ! ret ) {
/* zero the front end of the last page */
if ( tail_start + tail_len < ino_size ) {
Btrfs: add missing inode update when punching hole
When punching a file hole, if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
truncated_page = true ;
2014-05-30 11:16:10 +04:00
ret = btrfs_truncate_page ( inode ,
tail_start + tail_len , 0 , 1 ) ;
if ( ret )
goto out_only_mutex ;
btrfs: Use right extent length when inserting overlap extent map.
When current btrfs finds that a new extent map is going to be inserted
but failed with -EEXIST, it will try again to insert the extent map
but with the length of sectorsize.
This is OK if we don't enable 'no-holes' feature since all extent space
is continuous, we will not go into the not found->insert routine.
But if we enable 'no-holes' feature, it will make things out of control.
e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent():
btrfs_get_extent() args: start: 27874 len 4100
28672 27874 28672 27874+4100 32768
|-----------------------|
|---------hole--------------------|---------data----------|
1) not found and insert
Since no extent map containing the range, btrfs_get_extent() will go
into the not_found and insert routine, which will try to insert the
extent map (27874, 27874 + 4100).
2) first overlap
But it overlaps with (28672, 32768) extent, so -EEXIST will be returned
by add_extent_mapping().
3) retry but still overlap
After catching the -EEXIST, then btrfs_get_extent() will try insert it
again but with 4K length, which still overlaps, so -EEXIST will be
returned.
This makes the following patch fail to punch hole.
d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole.
This patch will use the right length, which is (existing->start -
em->start), to insert, making the above patch work in 'no-holes' mode.
Also, some small code style problems in the above patch are fixed too.
Reported-by: Filipe David Manana <fdmanana@gmail.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe David Manana <fdmanana@suse.com>
Tested-by: Filipe David Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-08-08 09:06:20 +04:00
}
2012-12-05 14:54:12 +04:00
}
2012-08-29 22:27:18 +04:00
}
if ( lockend < lockstart ) {
Btrfs: add missing inode update when punching hole
When punching a file hole, if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
ret = 0 ;
goto out_only_mutex ;
2012-08-29 22:27:18 +04:00
}
while ( 1 ) {
struct btrfs_ordered_extent * ordered ;
truncate_pagecache_range ( inode , lockstart , lockend ) ;
lock_extent_bits ( & BTRFS_I ( inode ) - > io_tree , lockstart , lockend ,
0 , & cached_state ) ;
ordered = btrfs_lookup_first_ordered_extent ( inode , lockend ) ;
/*
* We need to make sure we have no ordered extents in this range
* and nobody raced in and read a page in this range , if we did
* we need to try again .
*/
if ( ( ! ordered | |
2013-11-19 20:19:24 +04:00
( ordered - > file_offset + ordered - > len < = lockstart | |
2012-08-29 22:27:18 +04:00
ordered - > file_offset > lockend ) ) & &
2014-05-21 00:07:56 +04:00
! btrfs_page_exists_in_range ( inode , lockstart , lockend ) ) {
2012-08-29 22:27:18 +04:00
if ( ordered )
btrfs_put_ordered_extent ( ordered ) ;
break ;
}
if ( ordered )
btrfs_put_ordered_extent ( ordered ) ;
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree , lockstart ,
lockend , & cached_state , GFP_NOFS ) ;
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , lockstart ,
lockend - lockstart + 1 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
2012-08-29 22:27:18 +04:00
}
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
ret = - ENOMEM ;
goto out ;
}
2012-09-06 14:02:28 +04:00
rsv = btrfs_alloc_block_rsv ( root , BTRFS_BLOCK_RSV_TEMP ) ;
2012-08-29 22:27:18 +04:00
if ( ! rsv ) {
ret = - ENOMEM ;
goto out_free ;
}
rsv - > size = btrfs_calc_trunc_metadata_size ( root , 1 ) ;
rsv - > failfast = 1 ;
/*
* 1 - update the inode
* 1 - removing the extents in the range
2013-10-22 20:18:51 +04:00
* 1 - adding the hole extent if no_holes isn ' t set
2012-08-29 22:27:18 +04:00
*/
2013-10-22 20:18:51 +04:00
rsv_count = no_holes ? 2 : 3 ;
trans = btrfs_start_transaction ( root , rsv_count ) ;
2012-08-29 22:27:18 +04:00
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
goto out_free ;
}
ret = btrfs_block_rsv_migrate ( & root - > fs_info - > trans_block_rsv , rsv ,
min_size ) ;
BUG_ON ( ret ) ;
trans - > block_rsv = rsv ;
2014-05-30 11:16:10 +04:00
cur_offset = lockstart ;
len = lockend - cur_offset ;
2012-08-29 22:27:18 +04:00
while ( cur_offset < lockend ) {
ret = __btrfs_drop_extents ( trans , root , inode , path ,
cur_offset , lockend + 1 ,
2014-01-07 15:42:27 +04:00
& drop_end , 1 , 0 , 0 , NULL ) ;
2012-08-29 22:27:18 +04:00
if ( ret ! = - ENOSPC )
break ;
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
2014-02-15 19:55:58 +04:00
if ( cur_offset < ino_size ) {
ret = fill_holes ( trans , inode , path , cur_offset ,
drop_end ) ;
if ( ret ) {
err = ret ;
break ;
}
2012-08-29 22:27:18 +04:00
}
cur_offset = drop_end ;
ret = btrfs_update_inode ( trans , root , inode ) ;
if ( ret ) {
err = ret ;
break ;
}
btrfs_end_transaction ( trans , root ) ;
2012-11-14 18:34:34 +04:00
btrfs_btree_balance_dirty ( root ) ;
2012-08-29 22:27:18 +04:00
2013-10-22 20:18:51 +04:00
trans = btrfs_start_transaction ( root , rsv_count ) ;
2012-08-29 22:27:18 +04:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
trans = NULL ;
break ;
}
ret = btrfs_block_rsv_migrate ( & root - > fs_info - > trans_block_rsv ,
rsv , min_size ) ;
BUG_ON ( ret ) ; /* shouldn't happen */
trans - > block_rsv = rsv ;
2014-05-30 11:16:10 +04:00
ret = find_first_non_hole ( inode , & cur_offset , & len ) ;
if ( unlikely ( ret < 0 ) )
break ;
if ( ret & & ! len ) {
ret = 0 ;
break ;
}
2012-08-29 22:27:18 +04:00
}
if ( ret ) {
err = ret ;
goto out_trans ;
}
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
Btrfs: fix leaf corruption caused by ENOSPC while hole punching
While running a stress test with multiple threads writing to the same btrfs
file system, I ended up with a situation where a leaf was corrupted in that
it had 2 file extent item keys that had the same exact key. I was able to
detect this quickly thanks to the following patch which triggers an assertion
as soon as a leaf is marked dirty if there are duplicated keys or out of order
keys:
Btrfs: check if items are ordered when a leaf is marked dirty
(https://patchwork.kernel.org/patch/3955431/)
Basically while running the test, I got the following in dmesg:
[28877.415877] WARNING: CPU: 2 PID: 10706 at fs/btrfs/file.c:553 btrfs_drop_extent_cache+0x435/0x440 [btrfs]()
(...)
[28877.415917] Call Trace:
[28877.415922] [<ffffffff816f1189>] dump_stack+0x4e/0x68
[28877.415926] [<ffffffff8104a32c>] warn_slowpath_common+0x8c/0xc0
[28877.415929] [<ffffffff8104a37a>] warn_slowpath_null+0x1a/0x20
[28877.415944] [<ffffffffa03775a5>] btrfs_drop_extent_cache+0x435/0x440 [btrfs]
[28877.415949] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[28877.415962] [<ffffffffa03777d9>] fill_holes+0x229/0x3e0 [btrfs]
[28877.415972] [<ffffffffa0345865>] ? block_rsv_add_bytes+0x55/0x80 [btrfs]
[28877.415984] [<ffffffffa03792cb>] btrfs_fallocate+0xb6b/0xc20 [btrfs]
(...)
[29854.132560] BTRFS critical (device sdc): corrupt leaf, bad key order: block=955232256,root=1, slot=24
[29854.132565] BTRFS info (device sdc): leaf 955232256 total ptrs 40 free space 778
(...)
[29854.132637] item 23 key (3486 108 667648) itemoff 2694 itemsize 53
[29854.132638] extent data disk bytenr 14574411776 nr 286720
[29854.132639] extent data offset 0 nr 286720 ram 286720
[29854.132640] item 24 key (3486 108 954368) itemoff 2641 itemsize 53
[29854.132641] extent data disk bytenr 0 nr 0
[29854.132643] extent data offset 0 nr 0 ram 0
[29854.132644] item 25 key (3486 108 954368) itemoff 2588 itemsize 53
[29854.132645] extent data disk bytenr 8699670528 nr 77824
[29854.132646] extent data offset 0 nr 77824 ram 77824
[29854.132647] item 26 key (3486 108 1146880) itemoff 2535 itemsize 53
[29854.132648] extent data disk bytenr 8699670528 nr 77824
[29854.132649] extent data offset 0 nr 77824 ram 77824
(...)
[29854.132707] kernel BUG at fs/btrfs/ctree.h:3901!
(...)
[29854.132771] Call Trace:
[29854.132779] [<ffffffffa0342b5c>] setup_items_for_insert+0x2dc/0x400 [btrfs]
[29854.132791] [<ffffffffa0378537>] __btrfs_drop_extents+0xba7/0xdd0 [btrfs]
[29854.132794] [<ffffffff8109c0d6>] ? trace_hardirqs_on_caller+0x16/0x1d0
[29854.132797] [<ffffffff8109c29d>] ? trace_hardirqs_on+0xd/0x10
[29854.132800] [<ffffffff8118e7be>] ? kmem_cache_alloc+0xfe/0x1c0
[29854.132810] [<ffffffffa036783b>] insert_reserved_file_extent.constprop.66+0xab/0x310 [btrfs]
[29854.132820] [<ffffffffa036a6c6>] __btrfs_prealloc_file_range+0x116/0x340 [btrfs]
[29854.132830] [<ffffffffa0374d53>] btrfs_prealloc_file_range+0x23/0x30 [btrfs]
(...)
So this is caused by getting an -ENOSPC error while punching a file hole, more
specifically, we get -ENOSPC error from __btrfs_drop_extents in the while loop
of file.c:btrfs_punch_hole() when it's unable to modify the btree to delete one
or more file extent items due to lack of enough free space. When this happens,
in btrfs_punch_hole(), we attempt to reclaim free space by switching our transaction
block reservation object to root->fs_info->trans_block_rsv, end our transaction and
start a new transaction basically - and, we keep increasing our current offset
(cur_offset) as long as it's smaller than the end of the target range (lockend) -
this makes us leave the loop with cur_offset == drop_end which in turn makes us
call fill_holes() for inserting a file extent item that represents a 0 bytes range
hole (and this insertion succeeds, as in the meanwhile more space became available).
This 0 bytes file hole extent item is a problem because any subsequent caller of
__btrfs_drop_extents (regular file writes, or fallocate calls for e.g.), with a
start file offset that is equal to the offset of the hole, will not remove this
extent item due to the following conditional in the while loop of
__btrfs_drop_extents:
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
}
This later makes the call to setup_items_for_insert() (at the very end of
__btrfs_drop_extents), insert a new file extent item with the same offset as
the 0 bytes file hole extent item that follows it. Needless to say, this
causes chaos, either when reading the leaf from disk (btree_readpage_end_io_hook),
where we perform leaf sanity checks or in subsequent operations that manipulate
file extent items, as in the fallocate call as shown by the dmesg trace above.
Without my other patch to perform the leaf sanity checks once a leaf is marked
as dirty (if the integrity checker is enabled), it would have been much harder
to debug this issue.
This change might fix a few similar issues reported by users in the mailing
list regarding assertion failures in btrfs_set_item_key_safe calls performed
by __btrfs_drop_extents, such as the following report:
http://comments.gmane.org/gmane.comp.file-systems.btrfs/32938
Asking fill_holes() to create a 0 bytes wide file hole item also produced the
first warning in the trace above, as we passed a range to btrfs_drop_extent_cache
that has an end smaller (by -1) than its start.
On 3.14 kernels this issue manifests itself through leaf corruption, as we get
duplicated file extent item keys in a leaf when calling setup_items_for_insert(),
but on older kernels, setup_items_for_insert() isn't called by __btrfs_drop_extents(),
instead we have callers of __btrfs_drop_extents(), namely the functions
inode.c:insert_inline_extent() and inode.c:insert_reserved_file_extent(), calling
btrfs_insert_empty_item() to insert the new file extent item, which would fail with
error -EEXIST, instead of inserting a duplicated key - which is still a serious
issue as it would make all similar file extent item replace operations keep
failing if they target the same file range.
Cc: stable@vger.kernel.org
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-04-29 16:18:40 +04:00
/*
* Don ' t insert file hole extent item if it ' s for a range beyond eof
* ( because it ' s useless ) or if it represents a 0 bytes range ( when
* cur_offset = = drop_end ) .
*/
if ( cur_offset < ino_size & & cur_offset < drop_end ) {
2014-02-15 19:55:58 +04:00
ret = fill_holes ( trans , inode , path , cur_offset , drop_end ) ;
if ( ret ) {
err = ret ;
goto out_trans ;
}
2012-08-29 22:27:18 +04:00
}
out_trans :
if ( ! trans )
goto out_free ;
2012-11-08 08:47:33 +04:00
inode_inc_iversion ( inode ) ;
inode - > i_mtime = inode - > i_ctime = CURRENT_TIME ;
2012-08-29 22:27:18 +04:00
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
ret = btrfs_update_inode ( trans , root , inode ) ;
Btrfs: add missing inode update when punching hole
When punching a file hole, if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
updated_inode = true ;
2012-08-29 22:27:18 +04:00
btrfs_end_transaction ( trans , root ) ;
2012-11-14 18:34:34 +04:00
btrfs_btree_balance_dirty ( root ) ;
2012-08-29 22:27:18 +04:00
out_free :
btrfs_free_path ( path ) ;
btrfs_free_block_rsv ( root , rsv ) ;
out :
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree , lockstart , lockend ,
& cached_state , GFP_NOFS ) ;
2014-05-30 11:16:10 +04:00
out_only_mutex :
Btrfs: add missing inode update when punching hole
When punching a file hole, if we end up only zeroing parts of a page,
because the start offset isn't a multiple of the sector size or the
start offset and length fall within the same page, we were not updating
the inode item. This prevented an fsync from doing anything, if no other
file changes happened in the current transaction, because the fields
in btrfs_inode used to check if the inode needs to be fsync'ed weren't
updated.
This issue is easy to reproduce and the following excerpt from the
xfstest case I made shows how to trigger it:
_scratch_mkfs >> $seqres.full 2>&1
_init_flakey
_mount_flakey
# Create our test file.
$XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
$SCRATCH_MNT/foo | _filter_xfs_io
# Fsync the file, this makes btrfs update some btrfs inode specific fields
# that are used to track if the inode needs to be written/updated to the fsync
# log or not. After this fsync, the new values for those fields indicate that
# a subsequent fsync does not need to touch the fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
# Force a commit of the current transaction. After this point, any operation
# that modifies the data or metadata of our file, should update those fields in
# the btrfs inode with values that make the next fsync operation write to the
# fsync log.
sync
# Punch a hole in our file. This small range affects only 1 page.
# This made the btrfs hole punching implementation write only some zeroes in
# one page, but it did not update the btrfs inode fields used to determine if
# the next fsync needs to write to the fsync log.
$XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
# Another variation of the previously mentioned case.
$XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
# Now fsync the file. This was a no-operation because the previous hole punch
# operation didn't update the inode's fields mentioned before, so they remained
# with the values they had after the first fsync - that is, they indicate that
# it is not needed to write to fsync log.
$XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
echo "File content before:"
od -t x1 $SCRATCH_MNT/foo
# Simulate a crash/power loss.
_load_flakey_table $FLAKEY_DROP_WRITES
_unmount_flakey
# Enable writes and mount the fs. This makes the fsync log replay code run.
_load_flakey_table $FLAKEY_ALLOW_WRITES
_mount_flakey
# Because the last fsync didn't do anything, here the file content matched what
# it was after the first fsync, before the holes were punched, and not what it
# was after the holes were punched.
echo "File content after:"
od -t x1 $SCRATCH_MNT/foo
This issue has been around since 2012, when the punch hole implementation
was added, commit 2aaa66558172 ("Btrfs: add hole punching").
A test case for xfstests follows soon.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
2015-02-16 01:38:54 +03:00
if ( ! updated_inode & & truncated_page & & ! ret & & ! err ) {
/*
* If we only end up zeroing part of a page , we still need to
* update the inode item , so that all the time fields are
* updated as well as the necessary btrfs inode in memory fields
* for detecting , at fsync time , if the inode isn ' t yet in the
* log tree or it ' s there but not up to date .
*/
trans = btrfs_start_transaction ( root , 1 ) ;
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
} else {
err = btrfs_update_inode ( trans , root , inode ) ;
ret = btrfs_end_transaction ( trans , root ) ;
}
}
2012-08-29 22:27:18 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
if ( ret & & ! err )
err = ret ;
return err ;
}
2015-09-08 12:22:44 +03:00
/*
 * Helper structure to record which range is already reserved.
 *
 * Entries are chained on a list; add_falloc_range() appends new entries at
 * the tail and, when a new range starts exactly at start + len of the last
 * entry, extends that entry instead of allocating another one.
 */
struct falloc_range {
/* Linkage into the list of recorded ranges */
struct list_head list ;
/* Start of the recorded range */
u64 start ;
/* Length of the recorded range, i.e. it covers [start, start + len) */
u64 len ;
} ;
/*
 * Record the range starting at @start with length @len on the list @head.
 *
 * When the list is non-empty and the new range begins exactly where the
 * last recorded range ends, that last entry is grown by @len. Otherwise a
 * new entry is allocated and appended to the tail of the list.
 *
 * Caller should have locked the larger range of extent containing
 * [start, len)
 *
 * Returns 0 on success or -ENOMEM if a new entry could not be allocated.
 */
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
	struct falloc_range *entry;

	/*
	 * fallocate iterates in bytenr order, so the tail entry is the only
	 * one that can be contiguous with the new range.
	 */
	if (!list_empty(head)) {
		struct falloc_range *last;

		last = list_entry(head->prev, struct falloc_range, list);
		if (last->start + last->len == start) {
			last->len += len;
			return 0;
		}
	}

	entry = kmalloc(sizeof(*entry), GFP_NOFS);
	if (!entry)
		return -ENOMEM;

	entry->start = start;
	entry->len = len;
	list_add_tail(&entry->list, head);
	return 0;
}
2011-01-14 15:07:43 +03:00
/*
 * fallocate() handler: preallocate space for [offset, offset + len) of @file.
 *
 * @file:   file to operate on
 * @mode:   FALLOC_FL_KEEP_SIZE and/or FALLOC_FL_PUNCH_HOLE; any other bit is
 *          rejected with -EOPNOTSUPP.  FALLOC_FL_PUNCH_HOLE is delegated to
 *          btrfs_punch_hole().
 * @offset: start of the byte range
 * @len:    length of the byte range
 *
 * The range is widened to sector boundaries and then processed in two
 * passes while i_mutex and the extent range lock are held:
 *
 *   1. walk the extent maps, and for every hole (or non-preallocated extent
 *      starting at or beyond i_size) remember the sub-range on a local list
 *      and take the qgroup data reservation for it;
 *   2. if the first pass succeeded, call btrfs_prealloc_file_range() for
 *      each remembered sub-range.  The list is freed in either case.
 *
 * Unless FALLOC_FL_KEEP_SIZE is set, i_size is extended afterwards when the
 * end of the requested range lies beyond it.
 *
 * Returns 0 on success or a negative errno.
 */
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	struct list_head reserve_list;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	struct extent_map *em;
	int blocksize = BTRFS_I(inode)->root->sectorsize;
	int ret;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);

	/* Make sure we aren't being given an unsupported mode */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(inode, offset, len);

	/*
	 * Only trigger disk allocation, don't trigger qgroup reserve
	 *
	 * For qgroup space, it will be checked later.
	 */
	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
	if (ret < 0)
		return ret;

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the page if i_size lands in the
		 * middle of a page.
		 */
		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
		if (ret)
			goto out;
	}

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
	ret = btrfs_wait_ordered_range(inode, alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/*
		 * the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			ret = btrfs_wait_ordered_range(inode, alloc_start,
						       alloc_end - alloc_start);
			if (ret)
				goto out;
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	/* First, check if we exceed the qgroup limit */
	INIT_LIST_HEAD(&reserve_list);
	cur_offset = alloc_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		if (IS_ERR_OR_NULL(em)) {
			if (!em)
				ret = -ENOMEM;
			else
				ret = PTR_ERR(em);
			break;
		}
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = add_falloc_range(&reserve_list, cur_offset,
					       last_byte - cur_offset);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
							last_byte - cur_offset);
			if (ret < 0) {
				/*
				 * Drop our reference before bailing out, the
				 * same way the add_falloc_range() failure path
				 * does; breaking without it leaks the extent
				 * map.
				 */
				free_extent_map(em);
				break;
			}
		}
		free_extent_map(em);
		cur_offset = last_byte;
		if (cur_offset >= alloc_end)
			break;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret)
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, 1 << inode->i_blkbits,
					offset + len, &alloc_hint);
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

	if (actual_end > inode->i_size &&
	    !(mode & FALLOC_FL_KEEP_SIZE)) {
		struct btrfs_trans_handle *trans;
		struct btrfs_root *root = BTRFS_I(inode)->root;

		/*
		 * We didn't need to allocate any more space, but we
		 * still extended the size of the file so we need to
		 * update i_size and the inode item.
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
		} else {
			inode->i_ctime = CURRENT_TIME;
			i_size_write(inode, actual_end);
			btrfs_ordered_update_i_size(inode, actual_end, NULL);
			ret = btrfs_update_inode(trans, root, inode);
			/*
			 * Always end the transaction, but keep the
			 * btrfs_update_inode() error if there was one.
			 */
			if (ret)
				btrfs_end_transaction(trans, root);
			else
				ret = btrfs_end_transaction(trans, root);
		}
	}
out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);
out:
	/*
	 * As we waited the extent range, the data_rsv_map must be empty
	 * in the range, as written data range will be released from it.
	 * And for preallocated extent, it will also be released when
	 * its metadata is written.
	 * So this is completely used as cleanup.
	 */
	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
	mutex_unlock(&inode->i_mutex);
	/* Let go of our reservation. */
	btrfs_free_reserved_data_space(inode, alloc_start,
				       alloc_end - alloc_start);
	return ret;
}
2012-12-18 03:59:39 +04:00
/*
 * Locate the next data or hole position for SEEK_DATA / SEEK_HOLE.
 *
 * @inode:  inode to search
 * @offset: in: position to start searching from (a negative value is
 *          treated as 0); out: on success, the position found, capped at
 *          i_size
 * @whence: SEEK_DATA or SEEK_HOLE
 *
 * Both real holes and preallocated extents are treated as holes.  The
 * sector-aligned range from the start position up to i_size is kept
 * extent-locked while the extent maps are walked.
 *
 * Returns 0 on success, -ENXIO if the file is empty or SEEK_DATA found no
 * data before i_size, or the error reported by the extent lookup.
 */
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	u64 lock_first;
	u64 lock_last;
	u64 span;
	u64 pos;
	int ret = 0;

	if (inode->i_size == 0)
		return -ENXIO;

	/*
	 * *offset can be negative, in this case we start finding DATA/HOLE
	 * from the very start of the file.
	 */
	pos = max_t(loff_t, 0, *offset);

	lock_first = round_down(pos, root->sectorsize);
	lock_last = round_up(i_size_read(inode), root->sectorsize);
	/* Always lock at least one sector. */
	if (lock_last <= lock_first)
		lock_last = lock_first + root->sectorsize;
	lock_last--;			/* make the end inclusive */
	span = lock_last - lock_first + 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_first, lock_last, 0,
			 &cached_state);

	while (pos < inode->i_size) {
		int hole_like;

		em = btrfs_get_extent_fiemap(inode, NULL, 0, pos, span, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			em = NULL;
			break;
		}

		/* Preallocated extents count as holes for seeking purposes. */
		hole_like = em->block_start == EXTENT_MAP_HOLE ||
			    test_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		if (whence == SEEK_HOLE && hole_like)
			break;
		if (whence == SEEK_DATA && !hole_like)
			break;

		/* Not what we want: step past this extent and try the next. */
		pos = em->start + em->len;
		free_extent_map(em);
		em = NULL;
		cond_resched();
	}
	/* em is only non-NULL here when the loop stopped on a match. */
	free_extent_map(em);

	if (ret == 0) {
		if (whence != SEEK_DATA || pos < inode->i_size)
			*offset = min_t(loff_t, pos, inode->i_size);
		else
			ret = -ENXIO;
	}

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_first, lock_last,
			     &cached_state, GFP_NOFS);
	return ret;
}
2012-12-18 03:59:39 +04:00
/*
 * llseek handler.
 *
 * SEEK_END and SEEK_CUR are forwarded to generic_file_llseek().  SEEK_DATA
 * and SEEK_HOLE fail with -ENXIO when @offset is at or beyond i_size;
 * otherwise find_desired_extent() computes the target position.  For those
 * two, and for every remaining @whence value, the resulting offset is
 * installed with vfs_setpos().
 *
 * The whole operation runs under the inode's i_mutex.
 *
 * Returns the new file position, or a negative errno.
 */
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	mutex_lock(&inode->i_mutex);

	if (whence == SEEK_END || whence == SEEK_CUR) {
		offset = generic_file_llseek(file, offset, whence);
	} else {
		if (whence == SEEK_DATA || whence == SEEK_HOLE) {
			if (offset >= i_size_read(inode)) {
				offset = -ENXIO;
				goto unlock;
			}

			ret = find_desired_extent(inode, &offset, whence);
			if (ret) {
				offset = ret;
				goto unlock;
			}
		}

		offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
	}

unlock:
	mutex_unlock(&inode->i_mutex);
	return offset;
}
2009-10-02 02:43:56 +04:00
const struct file_operations btrfs_file_operations = {
2011-07-18 21:21:36 +04:00
. llseek = btrfs_file_llseek ,
2014-04-02 22:33:16 +04:00
. read_iter = generic_file_read_iter ,
2007-12-14 20:56:58 +03:00
. splice_read = generic_file_splice_read ,
2014-04-03 22:29:04 +04:00
. write_iter = btrfs_file_write_iter ,
2007-06-15 21:50:00 +04:00
. mmap = btrfs_file_mmap ,
2007-06-12 14:35:45 +04:00
. open = generic_file_open ,
2008-05-27 18:55:43 +04:00
. release = btrfs_release_file ,
2007-06-12 14:35:45 +04:00
. fsync = btrfs_sync_file ,
2011-01-14 15:07:43 +03:00
. fallocate = btrfs_fallocate ,
2007-09-14 18:22:47 +04:00
. unlocked_ioctl = btrfs_ioctl ,
2007-06-12 14:35:45 +04:00
# ifdef CONFIG_COMPAT
2007-09-14 18:22:47 +04:00
. compat_ioctl = btrfs_ioctl ,
2007-06-12 14:35:45 +04:00
# endif
} ;
2012-11-26 13:24:43 +04:00
/*
 * Destroy the btrfs_inode_defrag slab cache, if it was ever created.
 */
void btrfs_auto_defrag_exit(void)
{
	/* Nothing to tear down when the cache pointer is NULL. */
	if (!btrfs_inode_defrag_cachep)
		return;

	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
/*
 * Create the "btrfs_inode_defrag" slab cache, sized for struct inode_defrag,
 * and store it in btrfs_inode_defrag_cachep.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created.
 */
int btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep =
		kmem_cache_create("btrfs_inode_defrag",
				  sizeof(struct inode_defrag), 0,
				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				  NULL);

	return btrfs_inode_defrag_cachep ? 0 : -ENOMEM;
}
2014-10-10 12:43:11 +04:00
/*
 * Start writeback for the byte range [start, end] of @inode's mapping.
 *
 * Returns 0 on success, or the error from filemap_fdatawrite_range().
 */
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
	int ret = filemap_fdatawrite_range(inode->i_mapping, start, end);

	if (ret)
		return ret;

	if (!test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		      &BTRFS_I(inode)->runtime_flags))
		return 0;

	/*
	 * With compression, the first pass finds and locks a dirty page,
	 * clears its dirty state, sets up an async extent and returns right
	 * away: the whole range is locked, yet nothing is marked as writeback
	 * because the actual work was only handed to a thread.  A plain
	 * filemap_write_and_wait_range() therefore cannot be relied upon.
	 *
	 * Calling filemap_fdatawrite_range() _again_ makes us wait on the page
	 * lock, which is not released until the pages have been marked as
	 * writeback, and from that point on we are fine.  Skipping this second
	 * call makes us miss the ordered extents, which results in badness.
	 *
	 * Do not remove this second call in the future; it is required.
	 */
	return filemap_fdatawrite_range(inode->i_mapping, start, end);
}