2007-06-12 17:07:21 +04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public
* License along with this program ; if not , write to the
* Free Software Foundation , Inc . , 59 Temple Place - Suite 330 ,
* Boston , MA 02111 - 1307 , USA .
*/
2007-06-12 14:35:45 +04:00
# include <linux/fs.h>
# include <linux/pagemap.h>
# include <linux/highmem.h>
# include <linux/time.h>
# include <linux/init.h>
# include <linux/string.h>
# include <linux/backing-dev.h>
# include <linux/mpage.h>
2013-05-08 03:19:08 +04:00
# include <linux/aio.h>
2011-01-14 15:07:43 +03:00
# include <linux/falloc.h>
2007-06-12 14:35:45 +04:00
# include <linux/swap.h>
# include <linux/writeback.h>
# include <linux/statfs.h>
# include <linux/compat.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
# include <linux/slab.h>
2013-01-29 10:04:50 +04:00
# include <linux/btrfs.h>
2007-06-12 14:35:45 +04:00
# include "ctree.h"
# include "disk-io.h"
# include "transaction.h"
# include "btrfs_inode.h"
# include "print-tree.h"
2008-09-06 00:13:11 +04:00
# include "tree-log.h"
# include "locking.h"
2012-08-29 22:27:18 +04:00
# include "volumes.h"
2007-06-12 14:35:45 +04:00
2012-11-26 13:24:43 +04:00
static struct kmem_cache * btrfs_inode_defrag_cachep ;
2011-05-24 23:35:30 +04:00
/*
* When auto defrag is enabled we queue up one of these records per inode
* to remember which inodes still need defragging passes . Records are
* allocated from btrfs_inode_defrag_cachep and linked into the
* fs_info - > defrag_inodes rbtree , ordered by root objectid first and
* inode number second ( see __compare_inode_defrag ( ) ) .
*/
struct inode_defrag {
/* linkage into the fs_info - > defrag_inodes rbtree */
struct rb_node rb_node ;
/* objectid of the inode , as returned by btrfs_ino ( ) */
u64 ino ;
/*
* transid where the defrag was added , we search for
* extents newer than this . When two records for the same inode are
* merged the lower of the two transids is kept .
*/
u64 transid ;
/* objectid of the root ( subvolume ) that owns the inode */
u64 root ;
/*
* last offset we were able to defrag ; the next batch starts here
*/
u64 last_offset ;
/*
* set to 1 once we have wrapped around back to offset zero , so a
* file is only restarted from the beginning once
*/
int cycled ;
} ;
2012-05-24 14:58:27 +04:00
static int __compare_inode_defrag ( struct inode_defrag * defrag1 ,
struct inode_defrag * defrag2 )
{
if ( defrag1 - > root > defrag2 - > root )
return 1 ;
else if ( defrag1 - > root < defrag2 - > root )
return - 1 ;
else if ( defrag1 - > ino > defrag2 - > ino )
return 1 ;
else if ( defrag1 - > ino < defrag2 - > ino )
return - 1 ;
else
return 0 ;
}
2011-05-24 23:35:30 +04:00
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
* If you ' re inserting a record for an older transid than an
* existing record , the transid already in the tree is lowered
*
* If an existing record is found the defrag item you
* pass in is freed
*/
2012-11-26 13:25:38 +04:00
static int __btrfs_add_inode_defrag ( struct inode * inode ,
2011-05-24 23:35:30 +04:00
struct inode_defrag * defrag )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct inode_defrag * entry ;
struct rb_node * * p ;
struct rb_node * parent = NULL ;
2012-05-24 14:58:27 +04:00
int ret ;
2011-05-24 23:35:30 +04:00
p = & root - > fs_info - > defrag_inodes . rb_node ;
while ( * p ) {
parent = * p ;
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-05-24 14:58:27 +04:00
ret = __compare_inode_defrag ( defrag , entry ) ;
if ( ret < 0 )
2011-05-24 23:35:30 +04:00
p = & parent - > rb_left ;
2012-05-24 14:58:27 +04:00
else if ( ret > 0 )
2011-05-24 23:35:30 +04:00
p = & parent - > rb_right ;
else {
/* if we're reinserting an entry for
* an old defrag run , make sure to
* lower the transid of our existing record
*/
if ( defrag - > transid < entry - > transid )
entry - > transid = defrag - > transid ;
if ( defrag - > last_offset > entry - > last_offset )
entry - > last_offset = defrag - > last_offset ;
2012-11-26 13:25:38 +04:00
return - EEXIST ;
2011-05-24 23:35:30 +04:00
}
}
2012-05-23 22:13:11 +04:00
set_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ;
2011-05-24 23:35:30 +04:00
rb_link_node ( & defrag - > rb_node , parent , p ) ;
rb_insert_color ( & defrag - > rb_node , & root - > fs_info - > defrag_inodes ) ;
2012-11-26 13:25:38 +04:00
return 0 ;
}
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
static inline int __need_auto_defrag ( struct btrfs_root * root )
{
if ( ! btrfs_test_opt ( root , AUTO_DEFRAG ) )
return 0 ;
if ( btrfs_fs_closing ( root - > fs_info ) )
return 0 ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
return 1 ;
2011-05-24 23:35:30 +04:00
}
/*
* insert a defrag record for this inode if auto defrag is
* enabled
*/
int btrfs_add_inode_defrag ( struct btrfs_trans_handle * trans ,
struct inode * inode )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct inode_defrag * defrag ;
u64 transid ;
2012-11-26 13:25:38 +04:00
int ret ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:25:38 +04:00
if ( ! __need_auto_defrag ( root ) )
2011-05-24 23:35:30 +04:00
return 0 ;
2012-05-23 22:13:11 +04:00
if ( test_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) )
2011-05-24 23:35:30 +04:00
return 0 ;
if ( trans )
transid = trans - > transid ;
else
transid = BTRFS_I ( inode ) - > root - > last_trans ;
2012-11-26 13:24:43 +04:00
defrag = kmem_cache_zalloc ( btrfs_inode_defrag_cachep , GFP_NOFS ) ;
2011-05-24 23:35:30 +04:00
if ( ! defrag )
return - ENOMEM ;
2011-05-31 21:08:14 +04:00
defrag - > ino = btrfs_ino ( inode ) ;
2011-05-24 23:35:30 +04:00
defrag - > transid = transid ;
defrag - > root = root - > root_key . objectid ;
spin_lock ( & root - > fs_info - > defrag_inodes_lock ) ;
2012-11-26 13:25:38 +04:00
if ( ! test_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ) {
/*
* If we set IN_DEFRAG flag and evict the inode from memory ,
* and then re - read this inode , this new inode doesn ' t have
* IN_DEFRAG flag . At the case , we may find the existed defrag .
*/
ret = __btrfs_add_inode_defrag ( inode , defrag ) ;
if ( ret )
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
} else {
2012-11-26 13:24:43 +04:00
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
2012-11-26 13:25:38 +04:00
}
2011-05-24 23:35:30 +04:00
spin_unlock ( & root - > fs_info - > defrag_inodes_lock ) ;
2011-07-18 16:19:35 +04:00
return 0 ;
2011-05-24 23:35:30 +04:00
}
/*
2012-11-26 13:25:38 +04:00
* Requeue the defrag object . If there is a defrag object that points to
* the same inode in the tree , we will merge them together ( by
* __btrfs_add_inode_defrag ( ) ) and free the one that we want to requeue .
2011-05-24 23:35:30 +04:00
*/
2013-04-26 00:41:01 +04:00
static void btrfs_requeue_inode_defrag ( struct inode * inode ,
struct inode_defrag * defrag )
2012-11-26 13:25:38 +04:00
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
int ret ;
if ( ! __need_auto_defrag ( root ) )
goto out ;
/*
* Here we don ' t check the IN_DEFRAG flag , because we need merge
* them together .
*/
spin_lock ( & root - > fs_info - > defrag_inodes_lock ) ;
ret = __btrfs_add_inode_defrag ( inode , defrag ) ;
spin_unlock ( & root - > fs_info - > defrag_inodes_lock ) ;
if ( ret )
goto out ;
return ;
out :
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
}
2011-05-24 23:35:30 +04:00
/*
2012-11-26 13:26:20 +04:00
* pick the defragable inode that we want , if it doesn ' t exist , we will get
* the next one .
2011-05-24 23:35:30 +04:00
*/
2012-11-26 13:26:20 +04:00
static struct inode_defrag *
btrfs_pick_defrag_inode ( struct btrfs_fs_info * fs_info , u64 root , u64 ino )
2011-05-24 23:35:30 +04:00
{
struct inode_defrag * entry = NULL ;
2012-05-24 14:58:27 +04:00
struct inode_defrag tmp ;
2011-05-24 23:35:30 +04:00
struct rb_node * p ;
struct rb_node * parent = NULL ;
2012-05-24 14:58:27 +04:00
int ret ;
tmp . ino = ino ;
tmp . root = root ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
spin_lock ( & fs_info - > defrag_inodes_lock ) ;
p = fs_info - > defrag_inodes . rb_node ;
2011-05-24 23:35:30 +04:00
while ( p ) {
parent = p ;
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-05-24 14:58:27 +04:00
ret = __compare_inode_defrag ( & tmp , entry ) ;
if ( ret < 0 )
2011-05-24 23:35:30 +04:00
p = parent - > rb_left ;
2012-05-24 14:58:27 +04:00
else if ( ret > 0 )
2011-05-24 23:35:30 +04:00
p = parent - > rb_right ;
else
2012-11-26 13:26:20 +04:00
goto out ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
if ( parent & & __compare_inode_defrag ( & tmp , entry ) > 0 ) {
parent = rb_next ( parent ) ;
if ( parent )
2011-05-24 23:35:30 +04:00
entry = rb_entry ( parent , struct inode_defrag , rb_node ) ;
2012-11-26 13:26:20 +04:00
else
entry = NULL ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
out :
if ( entry )
rb_erase ( parent , & fs_info - > defrag_inodes ) ;
spin_unlock ( & fs_info - > defrag_inodes_lock ) ;
return entry ;
2011-05-24 23:35:30 +04:00
}
2012-11-26 13:26:20 +04:00
void btrfs_cleanup_defrag_inodes ( struct btrfs_fs_info * fs_info )
2011-05-24 23:35:30 +04:00
{
struct inode_defrag * defrag ;
2012-11-26 13:26:20 +04:00
struct rb_node * node ;
spin_lock ( & fs_info - > defrag_inodes_lock ) ;
node = rb_first ( & fs_info - > defrag_inodes ) ;
while ( node ) {
rb_erase ( node , & fs_info - > defrag_inodes ) ;
defrag = rb_entry ( node , struct inode_defrag , rb_node ) ;
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
if ( need_resched ( ) ) {
spin_unlock ( & fs_info - > defrag_inodes_lock ) ;
cond_resched ( ) ;
spin_lock ( & fs_info - > defrag_inodes_lock ) ;
}
node = rb_first ( & fs_info - > defrag_inodes ) ;
}
spin_unlock ( & fs_info - > defrag_inodes_lock ) ;
}
# define BTRFS_DEFRAG_BATCH 1024
static int __btrfs_run_defrag_inode ( struct btrfs_fs_info * fs_info ,
struct inode_defrag * defrag )
{
2011-05-24 23:35:30 +04:00
struct btrfs_root * inode_root ;
struct inode * inode ;
struct btrfs_key key ;
struct btrfs_ioctl_defrag_range_args range ;
int num_defrag ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
int index ;
int ret ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
/* get the inode */
key . objectid = defrag - > root ;
btrfs_set_key_type ( & key , BTRFS_ROOT_ITEM_KEY ) ;
key . offset = ( u64 ) - 1 ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
index = srcu_read_lock ( & fs_info - > subvol_srcu ) ;
2012-11-26 13:26:20 +04:00
inode_root = btrfs_read_fs_root_no_name ( fs_info , & key ) ;
if ( IS_ERR ( inode_root ) ) {
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
ret = PTR_ERR ( inode_root ) ;
goto cleanup ;
}
2012-11-26 13:26:20 +04:00
key . objectid = defrag - > ino ;
btrfs_set_key_type ( & key , BTRFS_INODE_ITEM_KEY ) ;
key . offset = 0 ;
inode = btrfs_iget ( fs_info - > sb , & key , inode_root , NULL ) ;
if ( IS_ERR ( inode ) ) {
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
ret = PTR_ERR ( inode ) ;
goto cleanup ;
2012-11-26 13:26:20 +04:00
}
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
srcu_read_unlock ( & fs_info - > subvol_srcu , index ) ;
2012-11-26 13:26:20 +04:00
/* do a chunk of defrag */
clear_bit ( BTRFS_INODE_IN_DEFRAG , & BTRFS_I ( inode ) - > runtime_flags ) ;
2011-05-24 23:35:30 +04:00
memset ( & range , 0 , sizeof ( range ) ) ;
range . len = ( u64 ) - 1 ;
2012-11-26 13:26:20 +04:00
range . start = defrag - > last_offset ;
2012-11-26 13:27:29 +04:00
sb_start_write ( fs_info - > sb ) ;
2012-11-26 13:26:20 +04:00
num_defrag = btrfs_defrag_file ( inode , NULL , & range , defrag - > transid ,
BTRFS_DEFRAG_BATCH ) ;
2012-11-26 13:27:29 +04:00
sb_end_write ( fs_info - > sb ) ;
2012-11-26 13:26:20 +04:00
/*
* if we filled the whole defrag batch , there
* must be more work to do . Queue this defrag
* again
*/
if ( num_defrag = = BTRFS_DEFRAG_BATCH ) {
defrag - > last_offset = range . start ;
btrfs_requeue_inode_defrag ( inode , defrag ) ;
} else if ( defrag - > last_offset & & ! defrag - > cycled ) {
/*
* we didn ' t fill our defrag batch , but
* we didn ' t start at zero . Make sure we loop
* around to the start of the file .
*/
defrag - > last_offset = 0 ;
defrag - > cycled = 1 ;
btrfs_requeue_inode_defrag ( inode , defrag ) ;
} else {
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
}
iput ( inode ) ;
return 0 ;
Btrfs: fix race between snapshot deletion and getting inode
While running snapshot testscript created by Mitch and David,
the race between autodefrag and snapshot deletion can lead to
corruption of dead_root list so that we can get crash on
btrfs_clean_old_snapshots().
And besides autodefrag, scrub also does the same thing, ie. read
root first and get inode.
Here is the story(take autodefrag as an example):
(1) when we delete a snapshot or subvolume, it will set its root's
refs to zero and do a iput() on its own inode, and if this inode happens
to be the only active in-meory one in root's inode rbtree, it will add
itself to the global dead_roots list for later cleanup.
(2) after (1), the autodefrag thread may read another inode for defrag
and the inode is just in the deleted snapshot/subvolume, but all of these
are without checking if the root is still valid(refs > 0). So the end up
result is adding the deleted snapshot/subvolume's root to the global
dead_roots list AGAIN.
Fortunately, we already have a srcu lock to avoid the race, ie. subvol_srcu.
So all we need to do is to take the lock to protect 'read root and get inode',
since we synchronize to wait for the rcu grace period before adding something
to the global dead_roots list.
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2013-01-29 07:22:10 +04:00
cleanup :
srcu_read_unlock ( & fs_info - > subvol_srcu , index ) ;
kmem_cache_free ( btrfs_inode_defrag_cachep , defrag ) ;
return ret ;
2012-11-26 13:26:20 +04:00
}
/*
* run through the list of inodes in the FS that need
* defragging
*/
int btrfs_run_defrag_inodes ( struct btrfs_fs_info * fs_info )
{
struct inode_defrag * defrag ;
u64 first_ino = 0 ;
u64 root_objectid = 0 ;
2011-05-24 23:35:30 +04:00
atomic_inc ( & fs_info - > defrag_running ) ;
2013-10-31 09:03:04 +04:00
while ( 1 ) {
2013-02-21 10:32:52 +04:00
/* Pause the auto defragger. */
if ( test_bit ( BTRFS_FS_STATE_REMOUNTING ,
& fs_info - > fs_state ) )
break ;
2012-11-26 13:26:20 +04:00
if ( ! __need_auto_defrag ( fs_info - > tree_root ) )
break ;
2011-05-24 23:35:30 +04:00
/* find an inode to defrag */
2012-11-26 13:26:20 +04:00
defrag = btrfs_pick_defrag_inode ( fs_info , root_objectid ,
first_ino ) ;
2011-05-24 23:35:30 +04:00
if ( ! defrag ) {
2012-11-26 13:26:20 +04:00
if ( root_objectid | | first_ino ) {
2012-05-24 14:58:27 +04:00
root_objectid = 0 ;
2011-05-24 23:35:30 +04:00
first_ino = 0 ;
continue ;
} else {
break ;
}
}
first_ino = defrag - > ino + 1 ;
2012-05-24 14:58:27 +04:00
root_objectid = defrag - > root ;
2011-05-24 23:35:30 +04:00
2012-11-26 13:26:20 +04:00
__btrfs_run_defrag_inode ( fs_info , defrag ) ;
2011-05-24 23:35:30 +04:00
}
atomic_dec ( & fs_info - > defrag_running ) ;
/*
* during unmount , we use the transaction_wait queue to
* wait for the defragger to stop
*/
wake_up ( & fs_info - > transaction_wait ) ;
return 0 ;
}
2007-06-12 14:35:45 +04:00
2008-09-29 23:18:18 +04:00
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code .
*/
2009-01-06 05:25:51 +03:00
static noinline int btrfs_copy_from_user ( loff_t pos , int num_pages ,
2011-01-25 22:57:24 +03:00
size_t write_bytes ,
2008-09-06 00:09:51 +04:00
struct page * * prepared_pages ,
2010-05-23 19:07:21 +04:00
struct iov_iter * i )
2007-06-12 14:35:45 +04:00
{
2010-12-09 12:30:14 +03:00
size_t copied = 0 ;
2011-01-25 22:57:24 +03:00
size_t total_copied = 0 ;
2010-05-23 19:07:21 +04:00
int pg = 0 ;
2007-06-12 14:35:45 +04:00
int offset = pos & ( PAGE_CACHE_SIZE - 1 ) ;
2010-05-23 19:07:21 +04:00
while ( write_bytes > 0 ) {
2007-06-12 14:35:45 +04:00
size_t count = min_t ( size_t ,
PAGE_CACHE_SIZE - offset , write_bytes ) ;
2010-05-23 19:07:21 +04:00
struct page * page = prepared_pages [ pg ] ;
2010-12-09 12:30:14 +03:00
/*
* Copy data from userspace to the current page
*/
copied = iov_iter_copy_from_user_atomic ( page , i , offset , count ) ;
2010-05-23 19:07:21 +04:00
2007-06-12 14:35:45 +04:00
/* Flush processor's dcache for this page */
flush_dcache_page ( page ) ;
2011-03-07 19:10:24 +03:00
/*
* if we get a partial write , we can end up with
* partially up to date pages . These add
* a lot of complexity , so make sure they don ' t
* happen by forcing this copy to be retried .
*
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
if ( ! PageUptodate ( page ) & & copied < count )
copied = 0 ;
2010-05-23 19:07:21 +04:00
iov_iter_advance ( i , copied ) ;
write_bytes - = copied ;
2010-12-09 12:30:14 +03:00
total_copied + = copied ;
2007-06-12 14:35:45 +04:00
2010-12-09 12:30:14 +03:00
/* Return to btrfs_file_aio_write to fault page */
2011-01-25 20:42:37 +03:00
if ( unlikely ( copied = = 0 ) )
2010-12-09 12:30:14 +03:00
break ;
2010-05-23 19:07:21 +04:00
if ( unlikely ( copied < PAGE_CACHE_SIZE - offset ) ) {
offset + = copied ;
} else {
pg + + ;
offset = 0 ;
}
2007-06-12 14:35:45 +04:00
}
2010-12-09 12:30:14 +03:00
return total_copied ;
2007-06-12 14:35:45 +04:00
}
2008-09-29 23:18:18 +04:00
/*
* unlocks pages after btrfs_file_write is done with them
*/
2013-04-26 00:41:01 +04:00
static void btrfs_drop_pages ( struct page * * pages , size_t num_pages )
2007-06-12 14:35:45 +04:00
{
size_t i ;
for ( i = 0 ; i < num_pages ; i + + ) {
2008-09-29 23:18:18 +04:00
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* clear it here
*/
2008-07-21 18:29:44 +04:00
ClearPageChecked ( pages [ i ] ) ;
2007-06-12 14:35:45 +04:00
unlock_page ( pages [ i ] ) ;
mark_page_accessed ( pages [ i ] ) ;
page_cache_release ( pages [ i ] ) ;
}
}
2008-09-29 23:18:18 +04:00
/*
* after copy_from_user , pages need to be dirtied and we need to make
* sure holes are created between the current EOF and the start of
* any next extents ( if required ) .
*
* this also makes the decision about creating an inline extent vs
* doing real data extents , marking pages dirty and delalloc as required .
*/
2011-04-06 21:05:22 +04:00
int btrfs_dirty_pages ( struct btrfs_root * root , struct inode * inode ,
2013-04-26 00:41:01 +04:00
struct page * * pages , size_t num_pages ,
loff_t pos , size_t write_bytes ,
struct extent_state * * cached )
2007-06-12 14:35:45 +04:00
{
int err = 0 ;
2007-08-28 00:49:44 +04:00
int i ;
2007-10-16 00:15:53 +04:00
u64 num_bytes ;
2007-08-28 00:49:44 +04:00
u64 start_pos ;
u64 end_of_last_block ;
u64 end_pos = pos + write_bytes ;
loff_t isize = i_size_read ( inode ) ;
2007-06-12 14:35:45 +04:00
2007-10-16 00:14:19 +04:00
start_pos = pos & ~ ( ( u64 ) root - > sectorsize - 1 ) ;
2013-02-26 12:10:22 +04:00
num_bytes = ALIGN ( write_bytes + pos - start_pos , root - > sectorsize ) ;
2007-06-12 14:35:45 +04:00
2007-10-16 00:15:53 +04:00
end_of_last_block = start_pos + num_bytes - 1 ;
2010-02-03 22:33:23 +03:00
err = btrfs_set_extent_delalloc ( inode , start_pos , end_of_last_block ,
2011-04-06 21:05:22 +04:00
cached ) ;
2011-01-25 22:57:24 +03:00
if ( err )
return err ;
2009-09-12 00:12:44 +04:00
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
for ( i = 0 ; i < num_pages ; i + + ) {
struct page * p = pages [ i ] ;
SetPageUptodate ( p ) ;
ClearPageChecked ( p ) ;
set_page_dirty ( p ) ;
2007-08-28 00:49:44 +04:00
}
2011-01-25 20:42:37 +03:00
/*
* we ' ve only changed i_size in ram , and we haven ' t updated
* the disk i_size . There is no need to log the inode
* at this time .
*/
if ( end_pos > isize )
2007-08-28 00:49:44 +04:00
i_size_write ( inode , end_pos ) ;
2010-05-16 18:48:46 +04:00
return 0 ;
2007-06-12 14:35:45 +04:00
}
2008-09-29 23:18:18 +04:00
/*
* this drops all the extents in the cache that intersect the range
* [ start , end ] . Existing extents are split as required .
*/
2012-08-31 04:06:49 +04:00
void btrfs_drop_extent_cache ( struct inode * inode , u64 start , u64 end ,
int skip_pinned )
2007-08-28 00:49:44 +04:00
{
struct extent_map * em ;
2008-04-17 19:29:12 +04:00
struct extent_map * split = NULL ;
struct extent_map * split2 = NULL ;
2007-08-28 00:49:44 +04:00
struct extent_map_tree * em_tree = & BTRFS_I ( inode ) - > extent_tree ;
2008-02-15 18:40:50 +03:00
u64 len = end - start + 1 ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
u64 gen ;
2008-04-17 19:29:12 +04:00
int ret ;
int testend = 1 ;
2008-09-26 18:05:38 +04:00
unsigned long flags ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
int compressed = 0 ;
2013-04-06 00:51:15 +04:00
bool modified ;
2007-08-28 00:49:44 +04:00
2008-07-17 20:53:50 +04:00
WARN_ON ( end < start ) ;
2008-04-17 19:29:12 +04:00
if ( end = = ( u64 ) - 1 ) {
2008-02-15 18:40:50 +03:00
len = ( u64 ) - 1 ;
2008-04-17 19:29:12 +04:00
testend = 0 ;
}
2009-01-06 05:25:51 +03:00
while ( 1 ) {
2012-08-31 04:06:49 +04:00
int no_splits = 0 ;
2013-04-06 00:51:15 +04:00
modified = false ;
2008-04-17 19:29:12 +04:00
if ( ! split )
2011-04-21 02:48:27 +04:00
split = alloc_extent_map ( ) ;
2008-04-17 19:29:12 +04:00
if ( ! split2 )
2011-04-21 02:48:27 +04:00
split2 = alloc_extent_map ( ) ;
2012-08-31 04:06:49 +04:00
if ( ! split | | ! split2 )
no_splits = 1 ;
2008-04-17 19:29:12 +04:00
2009-09-03 00:24:52 +04:00
write_lock ( & em_tree - > lock ) ;
2008-02-15 18:40:50 +03:00
em = lookup_extent_mapping ( em_tree , start , len ) ;
2008-01-25 00:13:08 +03:00
if ( ! em ) {
2009-09-03 00:24:52 +04:00
write_unlock ( & em_tree - > lock ) ;
2007-08-28 00:49:44 +04:00
break ;
2008-01-25 00:13:08 +03:00
}
2008-09-26 18:05:38 +04:00
flags = em - > flags ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
gen = em - > generation ;
2008-09-26 18:05:38 +04:00
if ( skip_pinned & & test_bit ( EXTENT_FLAG_PINNED , & em - > flags ) ) {
2009-11-12 12:36:44 +03:00
if ( testend & & em - > start + em - > len > = start + len ) {
2008-09-26 18:05:38 +04:00
free_extent_map ( em ) ;
2009-09-11 20:27:37 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-09-26 18:05:38 +04:00
break ;
}
2009-11-12 12:36:44 +03:00
start = em - > start + em - > len ;
if ( testend )
2008-09-26 18:05:38 +04:00
len = start + len - ( em - > start + em - > len ) ;
free_extent_map ( em ) ;
2009-09-11 20:27:37 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-09-26 18:05:38 +04:00
continue ;
}
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
compressed = test_bit ( EXTENT_FLAG_COMPRESSED , & em - > flags ) ;
2008-07-31 23:42:54 +04:00
clear_bit ( EXTENT_FLAG_PINNED , & em - > flags ) ;
2013-03-15 18:46:39 +04:00
clear_bit ( EXTENT_FLAG_LOGGING , & flags ) ;
2013-04-06 00:51:15 +04:00
modified = ! list_empty ( & em - > list ) ;
2012-08-31 04:06:49 +04:00
if ( no_splits )
goto next ;
2008-04-17 19:29:12 +04:00
2013-07-11 18:34:59 +04:00
if ( em - > start < start ) {
2008-04-17 19:29:12 +04:00
split - > start = em - > start ;
split - > len = start - em - > start ;
2013-07-11 18:34:59 +04:00
if ( em - > block_start < EXTENT_MAP_LAST_BYTE ) {
split - > orig_start = em - > orig_start ;
split - > block_start = em - > block_start ;
if ( compressed )
split - > block_len = em - > block_len ;
else
split - > block_len = split - > len ;
split - > orig_block_len = max ( split - > block_len ,
em - > orig_block_len ) ;
split - > ram_bytes = em - > ram_bytes ;
} else {
split - > orig_start = split - > start ;
split - > block_len = 0 ;
split - > block_start = em - > block_start ;
split - > orig_block_len = 0 ;
split - > ram_bytes = split - > len ;
}
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
split - > generation = gen ;
2008-04-17 19:29:12 +04:00
split - > bdev = em - > bdev ;
2008-09-26 18:05:38 +04:00
split - > flags = flags ;
2010-12-17 09:21:50 +03:00
split - > compress_type = em - > compress_type ;
2014-02-25 18:15:13 +04:00
replace_extent_mapping ( em_tree , em , split , modified ) ;
2008-04-17 19:29:12 +04:00
free_extent_map ( split ) ;
split = split2 ;
split2 = NULL ;
}
2013-07-11 18:34:59 +04:00
if ( testend & & em - > start + em - > len > start + len ) {
2008-04-17 19:29:12 +04:00
u64 diff = start + len - em - > start ;
split - > start = start + len ;
split - > len = em - > start + em - > len - ( start + len ) ;
split - > bdev = em - > bdev ;
2008-09-26 18:05:38 +04:00
split - > flags = flags ;
2010-12-17 09:21:50 +03:00
split - > compress_type = em - > compress_type ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
split - > generation = gen ;
2013-07-11 18:34:59 +04:00
if ( em - > block_start < EXTENT_MAP_LAST_BYTE ) {
split - > orig_block_len = max ( em - > block_len ,
2012-12-03 19:31:19 +04:00
em - > orig_block_len ) ;
2008-04-17 19:29:12 +04:00
2013-07-11 18:34:59 +04:00
split - > ram_bytes = em - > ram_bytes ;
if ( compressed ) {
split - > block_len = em - > block_len ;
split - > block_start = em - > block_start ;
split - > orig_start = em - > orig_start ;
} else {
split - > block_len = split - > len ;
split - > block_start = em - > block_start
+ diff ;
split - > orig_start = em - > orig_start ;
}
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
} else {
2013-07-11 18:34:59 +04:00
split - > ram_bytes = split - > len ;
split - > orig_start = split - > start ;
split - > block_len = 0 ;
split - > block_start = em - > block_start ;
split - > orig_block_len = 0 ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption or the
'other' field are currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically singled threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
}
2008-04-17 19:29:12 +04:00
2014-02-25 18:15:13 +04:00
if ( extent_map_in_tree ( em ) ) {
replace_extent_mapping ( em_tree , em , split ,
modified ) ;
} else {
ret = add_extent_mapping ( em_tree , split ,
modified ) ;
ASSERT ( ret = = 0 ) ; /* Logic error */
}
2008-04-17 19:29:12 +04:00
free_extent_map ( split ) ;
split = NULL ;
}
2012-08-31 04:06:49 +04:00
next :
2014-02-25 18:15:13 +04:00
if ( extent_map_in_tree ( em ) )
remove_extent_mapping ( em_tree , em ) ;
2009-09-03 00:24:52 +04:00
write_unlock ( & em_tree - > lock ) ;
2008-01-25 00:13:08 +03:00
2007-08-28 00:49:44 +04:00
/* once for us */
free_extent_map ( em ) ;
/* once for the tree*/
free_extent_map ( em ) ;
}
2008-04-17 19:29:12 +04:00
if ( split )
free_extent_map ( split ) ;
if ( split2 )
free_extent_map ( split2 ) ;
2007-08-28 00:49:44 +04:00
}
2007-06-12 14:35:45 +04:00
/*
* this is very complex , but the basic idea is to drop all extents
* in the range start - end . hint_block is filled in with a block number
* that would be a good hint to the block allocator for this file .
*
* If an extent intersects the range but is not entirely inside the range
* it is either truncated or split . Anything entirely inside the range
* is deleted from the tree .
*/
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
int __btrfs_drop_extents ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root , struct inode * inode ,
struct btrfs_path * path , u64 start , u64 end ,
2014-01-07 15:42:27 +04:00
u64 * drop_end , int drop_cache ,
int replace_extent ,
u32 extent_item_size ,
int * key_inserted )
2007-06-12 14:35:45 +04:00
{
2007-10-16 00:14:19 +04:00
struct extent_buffer * leaf ;
2009-11-12 12:34:08 +03:00
struct btrfs_file_extent_item * fi ;
2007-11-30 18:09:33 +03:00
struct btrfs_key key ;
2009-11-12 12:34:08 +03:00
struct btrfs_key new_key ;
2011-04-20 06:31:50 +04:00
u64 ino = btrfs_ino ( inode ) ;
2009-11-12 12:34:08 +03:00
u64 search_start = start ;
u64 disk_bytenr = 0 ;
u64 num_bytes = 0 ;
u64 extent_offset = 0 ;
u64 extent_end = 0 ;
int del_nr = 0 ;
int del_slot = 0 ;
int extent_type ;
2007-06-28 23:57:36 +04:00
int recow ;
2007-11-30 18:09:33 +03:00
int ret ;
2012-04-27 22:31:29 +04:00
int modify_tree = - 1 ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
int update_refs = ( root - > ref_cows | | root = = root - > fs_info - > tree_root ) ;
2012-09-14 22:51:22 +04:00
int found = 0 ;
2014-01-07 15:42:27 +04:00
int leafs_visited = 0 ;
2007-06-12 14:35:45 +04:00
2009-09-11 20:27:37 +04:00
if ( drop_cache )
btrfs_drop_extent_cache ( inode , start , end - 1 , 0 ) ;
2007-08-28 00:49:44 +04:00
2014-02-10 03:45:12 +04:00
if ( start > = BTRFS_I ( inode ) - > disk_i_size & & ! replace_extent )
2012-04-27 22:31:29 +04:00
modify_tree = 0 ;
2009-01-06 05:25:51 +03:00
while ( 1 ) {
2007-06-28 23:57:36 +04:00
recow = 0 ;
2011-04-20 06:31:50 +04:00
ret = btrfs_lookup_file_extent ( trans , root , path , ino ,
2012-04-27 22:31:29 +04:00
search_start , modify_tree ) ;
2007-06-12 14:35:45 +04:00
if ( ret < 0 )
2009-11-12 12:34:08 +03:00
break ;
if ( ret > 0 & & path - > slots [ 0 ] > 0 & & search_start = = start ) {
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] - 1 ) ;
2011-04-20 06:31:50 +04:00
if ( key . objectid = = ino & &
2009-11-12 12:34:08 +03:00
key . type = = BTRFS_EXTENT_DATA_KEY )
path - > slots [ 0 ] - - ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
ret = 0 ;
2014-01-07 15:42:27 +04:00
leafs_visited + + ;
2007-06-18 17:57:58 +04:00
next_slot :
2007-10-16 00:14:19 +04:00
leaf = path - > nodes [ 0 ] ;
2009-11-12 12:34:08 +03:00
if ( path - > slots [ 0 ] > = btrfs_header_nritems ( leaf ) ) {
BUG_ON ( del_nr > 0 ) ;
ret = btrfs_next_leaf ( root , path ) ;
if ( ret < 0 )
break ;
if ( ret > 0 ) {
ret = 0 ;
break ;
2007-06-18 17:57:58 +04:00
}
2014-01-07 15:42:27 +04:00
leafs_visited + + ;
2009-11-12 12:34:08 +03:00
leaf = path - > nodes [ 0 ] ;
recow = 1 ;
}
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
2011-04-20 06:31:50 +04:00
if ( key . objectid > ino | |
2009-11-12 12:34:08 +03:00
key . type > BTRFS_EXTENT_DATA_KEY | | key . offset > = end )
break ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
extent_type = btrfs_file_extent_type ( leaf , fi ) ;
if ( extent_type = = BTRFS_FILE_EXTENT_REG | |
extent_type = = BTRFS_FILE_EXTENT_PREALLOC ) {
disk_bytenr = btrfs_file_extent_disk_bytenr ( leaf , fi ) ;
num_bytes = btrfs_file_extent_disk_num_bytes ( leaf , fi ) ;
extent_offset = btrfs_file_extent_offset ( leaf , fi ) ;
extent_end = key . offset +
btrfs_file_extent_num_bytes ( leaf , fi ) ;
} else if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
extent_end = key . offset +
2014-01-04 09:07:00 +04:00
btrfs_file_extent_inline_len ( leaf ,
path - > slots [ 0 ] , fi ) ;
2007-06-18 17:57:58 +04:00
} else {
2009-11-12 12:34:08 +03:00
WARN_ON ( 1 ) ;
2007-06-18 17:57:58 +04:00
extent_end = search_start ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
if ( extent_end < = search_start ) {
path - > slots [ 0 ] + + ;
2007-06-18 17:57:58 +04:00
goto next_slot ;
2007-06-12 14:35:45 +04:00
}
2012-09-14 22:51:22 +04:00
found = 1 ;
2009-11-12 12:34:08 +03:00
search_start = max ( key . offset , start ) ;
2012-04-27 22:31:29 +04:00
if ( recow | | ! modify_tree ) {
modify_tree = - 1 ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2007-06-12 14:35:45 +04:00
}
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
/*
* | - range to drop - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start > key . offset & & end < extent_end ) {
BUG_ON ( del_nr > 0 ) ;
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2009-11-12 12:34:08 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
new_key . offset = start ;
ret = btrfs_duplicate_item ( trans , root , path ,
& new_key ) ;
if ( ret = = - EAGAIN ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
if ( ret < 0 )
break ;
leaf = path - > nodes [ 0 ] ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
struct btrfs_file_extent_item ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
extent_offset + = start - key . offset ;
btrfs_set_file_extent_offset ( leaf , fi , extent_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - start ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
if ( update_refs & & disk_bytenr > 0 ) {
2008-11-07 06:02:51 +03:00
ret = btrfs_inc_extent_ref ( trans , root ,
2009-11-12 12:34:08 +03:00
disk_bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
new_key . objectid ,
2011-09-12 17:26:38 +04:00
start - extent_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-11-07 06:02:51 +03:00
}
2009-11-12 12:34:08 +03:00
key . offset = start ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
/*
* | - - - - range to drop - - - - - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start < = key . offset & & end < extent_end ) {
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
new_key . offset = end ;
2013-04-16 09:18:22 +04:00
btrfs_set_item_key_safe ( root , path , & new_key ) ;
2008-10-30 21:19:50 +03:00
2009-11-12 12:34:08 +03:00
extent_offset + = end - key . offset ;
btrfs_set_file_extent_offset ( leaf , fi , extent_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - end ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2012-08-29 20:24:27 +04:00
if ( update_refs & & disk_bytenr > 0 )
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode , end - key . offset ) ;
break ;
2007-06-12 14:35:45 +04:00
}
2008-11-07 06:02:51 +03:00
2009-11-12 12:34:08 +03:00
search_start = extent_end ;
/*
* | - - - - range to drop - - - - - |
* | - - - - - - - - extent - - - - - - - - |
*/
if ( start > key . offset & & end > = extent_end ) {
BUG_ON ( del_nr > 0 ) ;
2014-03-10 14:56:07 +04:00
if ( extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2014-04-15 20:50:17 +04:00
ret = - EOPNOTSUPP ;
2014-03-10 14:56:07 +04:00
break ;
}
2007-06-18 17:57:58 +04:00
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2012-08-29 20:24:27 +04:00
if ( update_refs & & disk_bytenr > 0 )
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode , extent_end - start ) ;
if ( end = = extent_end )
break ;
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.
From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software only limit, the disk format supports u64 sized compressed extents.
In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software only limit
and will be subject to tuning later.
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the cpus on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2008-10-29 21:49:59 +03:00
2009-11-12 12:34:08 +03:00
path - > slots [ 0 ] + + ;
goto next_slot ;
2008-09-23 21:14:14 +04:00
}
2009-11-12 12:34:08 +03:00
/*
* | - - - - range to drop - - - - - |
* | - - - - - - extent - - - - - - |
*/
if ( start < = key . offset & & end > = extent_end ) {
if ( del_nr = = 0 ) {
del_slot = path - > slots [ 0 ] ;
del_nr = 1 ;
} else {
BUG_ON ( del_slot + del_nr ! = path - > slots [ 0 ] ) ;
del_nr + + ;
}
2008-09-23 21:14:14 +04:00
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
if ( update_refs & &
extent_type = = BTRFS_FILE_EXTENT_INLINE ) {
2008-10-09 19:46:29 +04:00
inode_sub_bytes ( inode ,
2009-11-12 12:34:08 +03:00
extent_end - key . offset ) ;
extent_end = ALIGN ( extent_end ,
root - > sectorsize ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
} else if ( update_refs & & disk_bytenr > 0 ) {
2008-09-23 21:14:14 +04:00
ret = btrfs_free_extent ( trans , root ,
2009-11-12 12:34:08 +03:00
disk_bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
key . objectid , key . offset -
2011-09-12 17:26:38 +04:00
extent_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2009-11-12 12:34:08 +03:00
inode_sub_bytes ( inode ,
extent_end - key . offset ) ;
2008-09-23 21:14:14 +04:00
}
2009-11-12 12:34:08 +03:00
if ( end = = extent_end )
break ;
if ( path - > slots [ 0 ] + 1 < btrfs_header_nritems ( leaf ) ) {
path - > slots [ 0 ] + + ;
goto next_slot ;
}
ret = btrfs_del_items ( trans , root , path , del_slot ,
del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret ) {
btrfs_abort_transaction ( trans , root , ret ) ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
break ;
2012-03-12 19:03:00 +04:00
}
2009-11-12 12:34:08 +03:00
del_nr = 0 ;
del_slot = 0 ;
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
continue ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
BUG_ON ( 1 ) ;
2007-06-12 14:35:45 +04:00
}
2009-11-12 12:34:08 +03:00
2012-03-12 19:03:00 +04:00
if ( ! ret & & del_nr > 0 ) {
2014-01-07 15:42:27 +04:00
/*
* Set path - > slots [ 0 ] to first slot , so that after the delete
* if items are moved off from our leaf to its immediate left or
* right neighbor leaves , we end up with a correct and adjusted
2014-02-10 03:45:12 +04:00
* path - > slots [ 0 ] for our insertion ( if replace_extent ! = 0 ) .
2014-01-07 15:42:27 +04:00
*/
path - > slots [ 0 ] = del_slot ;
2009-11-12 12:34:08 +03:00
ret = btrfs_del_items ( trans , root , path , del_slot , del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret )
btrfs_abort_transaction ( trans , root , ret ) ;
2014-02-10 03:45:12 +04:00
}
2014-01-07 15:42:27 +04:00
2014-02-10 03:45:12 +04:00
leaf = path - > nodes [ 0 ] ;
/*
* If btrfs_del_items ( ) was called , it might have deleted a leaf , in
* which case it unlocked our path , so check path - > locks [ 0 ] matches a
* write lock .
*/
if ( ! ret & & replace_extent & & leafs_visited = = 1 & &
( path - > locks [ 0 ] = = BTRFS_WRITE_LOCK_BLOCKING | |
path - > locks [ 0 ] = = BTRFS_WRITE_LOCK ) & &
btrfs_leaf_free_space ( root , leaf ) > =
sizeof ( struct btrfs_item ) + extent_item_size ) {
key . objectid = ino ;
key . type = BTRFS_EXTENT_DATA_KEY ;
key . offset = start ;
if ( ! del_nr & & path - > slots [ 0 ] < btrfs_header_nritems ( leaf ) ) {
struct btrfs_key slot_key ;
btrfs_item_key_to_cpu ( leaf , & slot_key , path - > slots [ 0 ] ) ;
if ( btrfs_comp_cpu_keys ( & key , & slot_key ) > 0 )
path - > slots [ 0 ] + + ;
2014-01-07 15:42:27 +04:00
}
2014-02-10 03:45:12 +04:00
setup_items_for_insert ( root , path , & key ,
& extent_item_size ,
extent_item_size ,
sizeof ( struct btrfs_item ) +
extent_item_size , 1 ) ;
* key_inserted = 1 ;
2008-10-30 21:19:50 +03:00
}
2009-11-12 12:34:08 +03:00
2014-01-07 15:42:27 +04:00
if ( ! replace_extent | | ! ( * key_inserted ) )
btrfs_release_path ( path ) ;
2012-08-29 22:27:18 +04:00
if ( drop_end )
2012-09-14 22:51:22 +04:00
* drop_end = found ? min ( end , extent_end ) : end ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worse yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
return ret ;
}
/*
 * Convenience wrapper around __btrfs_drop_extents() for callers that do not
 * already hold a btrfs_path.
 *
 * A temporary path is allocated, handed to __btrfs_drop_extents() together
 * with @trans, @root, @inode, @start, @end and @drop_cache, and released
 * again before returning.
 *
 * NOTE(review): the remaining arguments are passed as NULL, 0, 0, NULL;
 * presumably these are drop_end, replace_extent, extent_item_size and
 * key_inserted (i.e. no end-offset reporting and no in-place replacement
 * item) - confirm against the __btrfs_drop_extents() prototype.
 *
 * Returns -ENOMEM if the path cannot be allocated, otherwise whatever
 * __btrfs_drop_extents() returns.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
		       u64 end, int drop_cache)
{
	struct btrfs_path *tmp_path = btrfs_alloc_path();
	int err = -ENOMEM;

	if (tmp_path) {
		err = __btrfs_drop_extents(trans, root, inode, tmp_path,
					   start, end, NULL, drop_cache,
					   0, 0, NULL);
		btrfs_free_path(tmp_path);
	}

	return err;
}
2008-10-30 21:25:28 +03:00
/*
 * Decide whether the item at @slot in @leaf is a file extent that may be
 * merged with a neighbouring range.
 *
 * The item qualifies only if all of the following hold:
 *   - @slot is a valid index into @leaf;
 *   - its key belongs to @objectid and has type BTRFS_EXTENT_DATA_KEY;
 *   - it is a BTRFS_FILE_EXTENT_REG extent whose disk bytenr equals @bytenr
 *     and whose extent offset equals (key.offset - @orig_offset);
 *   - it carries no compression, encryption or other encoding;
 *   - a non-zero *@start equals the item's file offset, and a non-zero
 *     *@end equals the item's end offset (a zero value means "no constraint").
 *
 * On success *@start and *@end are overwritten with the item's file range
 * and 1 is returned.  Otherwise 0 is returned and both are left untouched.
 */
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *item;
	struct btrfs_key item_key;
	u64 item_end;

	/* Reject slots that fall outside the leaf. */
	if (slot < 0)
		return 0;
	if (slot >= btrfs_header_nritems(leaf))
		return 0;

	/* The item must be an extent data item of the requested inode. */
	btrfs_item_key_to_cpu(leaf, &item_key, slot);
	if (item_key.objectid != objectid)
		return 0;
	if (item_key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	/* Only plain regular extents backed by the same disk extent qualify. */
	item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, item) != BTRFS_FILE_EXTENT_REG)
		return 0;
	if (btrfs_file_extent_disk_bytenr(leaf, item) != bytenr)
		return 0;
	if (btrfs_file_extent_offset(leaf, item) !=
	    item_key.offset - orig_offset)
		return 0;
	if (btrfs_file_extent_compression(leaf, item))
		return 0;
	if (btrfs_file_extent_encryption(leaf, item))
		return 0;
	if (btrfs_file_extent_other_encoding(leaf, item))
		return 0;

	/* Honour any boundary the caller has already pinned down. */
	item_end = item_key.offset + btrfs_file_extent_num_bytes(leaf, item);
	if (*start && *start != item_key.offset)
		return 0;
	if (*end && *end != item_end)
		return 0;

	*start = item_key.offset;
	*end = item_end;
	return 1;
}
/*
* Mark extent in the range start - end as written .
*
* This changes extent type from ' pre - allocated ' to ' regular ' . If only
* part of extent is marked as written , the extent will be split into
* two or three .
*/
int btrfs_mark_extent_written ( struct btrfs_trans_handle * trans ,
struct inode * inode , u64 start , u64 end )
{
2009-11-12 12:34:08 +03:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2008-10-30 21:25:28 +03:00
struct extent_buffer * leaf ;
struct btrfs_path * path ;
struct btrfs_file_extent_item * fi ;
struct btrfs_key key ;
2009-11-12 12:34:08 +03:00
struct btrfs_key new_key ;
2008-10-30 21:25:28 +03:00
u64 bytenr ;
u64 num_bytes ;
u64 extent_end ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
u64 orig_offset ;
2008-10-30 21:25:28 +03:00
u64 other_start ;
u64 other_end ;
2009-11-12 12:34:08 +03:00
u64 split ;
int del_nr = 0 ;
int del_slot = 0 ;
2010-01-15 11:43:09 +03:00
int recow ;
2008-10-30 21:25:28 +03:00
int ret ;
2011-04-20 06:31:50 +04:00
u64 ino = btrfs_ino ( inode ) ;
2008-10-30 21:25:28 +03:00
path = btrfs_alloc_path ( ) ;
btrfs: don't BUG_ON btrfs_alloc_path() errors
This patch fixes many callers of btrfs_alloc_path() which BUG_ON allocation
failure. All the sites that are fixed in this patch were checked by me to
be fairly trivial to fix because of at least one of two criteria:
- Callers of the function catch errors from it already so bubbling the
error up will be handled.
- Callers of the function might BUG_ON any nonzero return code in which
case there is no behavior changed (but we still got to remove a BUG_ON)
The following functions were updated:
btrfs_lookup_extent, alloc_reserved_tree_block, btrfs_remove_block_group,
btrfs_lookup_csums_range, btrfs_csum_file_blocks, btrfs_mark_extent_written,
btrfs_inode_by_name, btrfs_new_inode, btrfs_symlink,
insert_reserved_file_extent, and run_delalloc_nocow
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
2011-07-13 21:38:47 +04:00
if ( ! path )
return - ENOMEM ;
2008-10-30 21:25:28 +03:00
again :
2010-01-15 11:43:09 +03:00
recow = 0 ;
2009-11-12 12:34:08 +03:00
split = start ;
2011-04-20 06:31:50 +04:00
key . objectid = ino ;
2008-10-30 21:25:28 +03:00
key . type = BTRFS_EXTENT_DATA_KEY ;
2009-11-12 12:34:08 +03:00
key . offset = split ;
2008-10-30 21:25:28 +03:00
ret = btrfs_search_slot ( trans , root , & key , path , - 1 , 1 ) ;
2011-03-16 20:59:32 +03:00
if ( ret < 0 )
goto out ;
2008-10-30 21:25:28 +03:00
if ( ret > 0 & & path - > slots [ 0 ] > 0 )
path - > slots [ 0 ] - - ;
leaf = path - > nodes [ 0 ] ;
btrfs_item_key_to_cpu ( leaf , & key , path - > slots [ 0 ] ) ;
2011-04-20 06:31:50 +04:00
BUG_ON ( key . objectid ! = ino | | key . type ! = BTRFS_EXTENT_DATA_KEY ) ;
2008-10-30 21:25:28 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2009-11-12 12:34:08 +03:00
BUG_ON ( btrfs_file_extent_type ( leaf , fi ) ! =
BTRFS_FILE_EXTENT_PREALLOC ) ;
2008-10-30 21:25:28 +03:00
extent_end = key . offset + btrfs_file_extent_num_bytes ( leaf , fi ) ;
BUG_ON ( key . offset > start | | extent_end < end ) ;
bytenr = btrfs_file_extent_disk_bytenr ( leaf , fi ) ;
num_bytes = btrfs_file_extent_disk_num_bytes ( leaf , fi ) ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 18:45:14 +04:00
orig_offset = key . offset - btrfs_file_extent_offset ( leaf , fi ) ;
2010-01-15 11:43:09 +03:00
memcpy ( & new_key , & key , sizeof ( new_key ) ) ;
if ( start = = key . offset & & end < extent_end ) {
other_start = 0 ;
other_end = start ;
if ( extent_mergeable ( leaf , path - > slots [ 0 ] - 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
new_key . offset = end ;
2013-04-16 09:18:22 +04:00
btrfs_set_item_key_safe ( root , path , & new_key ) ;
2010-01-15 11:43:09 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - end ) ;
btrfs_set_file_extent_offset ( leaf , fi ,
end - orig_offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
end - other_start ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
goto out ;
}
}
if ( start > key . offset & & end = = extent_end ) {
other_start = end ;
other_end = 0 ;
if ( extent_mergeable ( leaf , path - > slots [ 0 ] + 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
start - key . offset ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
path - > slots [ 0 ] + + ;
new_key . offset = start ;
2013-04-16 09:18:22 +04:00
btrfs_set_item_key_safe ( root , path , & new_key ) ;
2010-01-15 11:43:09 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi ,
trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
other_end - start ) ;
btrfs_set_file_extent_offset ( leaf , fi ,
start - orig_offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
goto out ;
}
}
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
while ( start > key . offset | | end < extent_end ) {
if ( key . offset = = start )
split = end ;
new_key . offset = split ;
ret = btrfs_duplicate_item ( trans , root , path , & new_key ) ;
if ( ret = = - EAGAIN ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2009-11-12 12:34:08 +03:00
goto again ;
2008-10-30 21:25:28 +03:00
}
2012-03-12 19:03:00 +04:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , root , ret ) ;
goto out ;
}
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
leaf = path - > nodes [ 0 ] ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] - 1 ,
2008-10-30 21:25:28 +03:00
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2008-10-30 21:25:28 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
2009-11-12 12:34:08 +03:00
split - key . offset ) ;
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_offset ( leaf , fi , split - orig_offset ) ;
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - split ) ;
2008-10-30 21:25:28 +03:00
btrfs_mark_buffer_dirty ( leaf ) ;
2009-11-12 12:34:08 +03:00
ret = btrfs_inc_extent_ref ( trans , root , bytenr , num_bytes , 0 ,
root - > root_key . objectid ,
2011-09-12 17:26:38 +04:00
ino , orig_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-10-30 21:25:28 +03:00
2009-11-12 12:34:08 +03:00
if ( split = = start ) {
key . offset = start ;
} else {
BUG_ON ( start ! = key . offset ) ;
2008-10-30 21:25:28 +03:00
path - > slots [ 0 ] - - ;
2009-11-12 12:34:08 +03:00
extent_end = end ;
2008-10-30 21:25:28 +03:00
}
2010-01-15 11:43:09 +03:00
recow = 1 ;
2008-10-30 21:25:28 +03:00
}
2009-11-12 12:34:08 +03:00
other_start = end ;
other_end = 0 ;
2010-01-15 11:43:09 +03:00
if ( extent_mergeable ( leaf , path - > slots [ 0 ] + 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
if ( recow ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2010-01-15 11:43:09 +03:00
goto again ;
}
2009-11-12 12:34:08 +03:00
extent_end = other_end ;
del_slot = path - > slots [ 0 ] + 1 ;
del_nr + + ;
ret = btrfs_free_extent ( trans , root , bytenr , num_bytes ,
0 , root - > root_key . objectid ,
2011-09-12 17:26:38 +04:00
ino , orig_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2008-10-30 21:25:28 +03:00
}
2009-11-12 12:34:08 +03:00
other_start = 0 ;
other_end = start ;
2010-01-15 11:43:09 +03:00
if ( extent_mergeable ( leaf , path - > slots [ 0 ] - 1 ,
2011-04-20 06:31:50 +04:00
ino , bytenr , orig_offset ,
2010-01-15 11:43:09 +03:00
& other_start , & other_end ) ) {
if ( recow ) {
2011-04-21 03:20:15 +04:00
btrfs_release_path ( path ) ;
2010-01-15 11:43:09 +03:00
goto again ;
}
2009-11-12 12:34:08 +03:00
key . offset = other_start ;
del_slot = path - > slots [ 0 ] ;
del_nr + + ;
ret = btrfs_free_extent ( trans , root , bytenr , num_bytes ,
0 , root - > root_key . objectid ,
2011-09-12 17:26:38 +04:00
ino , orig_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
BUG_ON ( ret ) ; /* -ENOMEM */
2009-11-12 12:34:08 +03:00
}
if ( del_nr = = 0 ) {
2010-02-11 10:43:00 +03:00
fi = btrfs_item_ptr ( leaf , path - > slots [ 0 ] ,
struct btrfs_file_extent_item ) ;
2009-11-12 12:34:08 +03:00
btrfs_set_file_extent_type ( leaf , fi ,
BTRFS_FILE_EXTENT_REG ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2009-11-12 12:34:08 +03:00
btrfs_mark_buffer_dirty ( leaf ) ;
2010-01-15 11:43:09 +03:00
} else {
2010-02-11 10:43:00 +03:00
fi = btrfs_item_ptr ( leaf , del_slot - 1 ,
struct btrfs_file_extent_item ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_type ( leaf , fi ,
BTRFS_FILE_EXTENT_REG ) ;
2012-08-17 00:32:06 +04:00
btrfs_set_file_extent_generation ( leaf , fi , trans - > transid ) ;
2010-01-15 11:43:09 +03:00
btrfs_set_file_extent_num_bytes ( leaf , fi ,
extent_end - key . offset ) ;
btrfs_mark_buffer_dirty ( leaf ) ;
2009-11-12 12:34:08 +03:00
2010-01-15 11:43:09 +03:00
ret = btrfs_del_items ( trans , root , path , del_slot , del_nr ) ;
2012-03-12 19:03:00 +04:00
if ( ret < 0 ) {
btrfs_abort_transaction ( trans , root , ret ) ;
goto out ;
}
2010-01-15 11:43:09 +03:00
}
2009-11-12 12:34:08 +03:00
out :
2008-10-30 21:25:28 +03:00
btrfs_free_path ( path ) ;
return 0 ;
}
2011-02-28 17:52:08 +03:00
/*
* on error we return an unlocked page and the error value
* on success we return a locked page and 0
*/
2011-09-30 23:23:54 +04:00
static int prepare_uptodate_page ( struct page * page , u64 pos ,
bool force_uptodate )
2011-02-28 17:52:08 +03:00
{
int ret = 0 ;
2011-09-30 23:23:54 +04:00
if ( ( ( pos & ( PAGE_CACHE_SIZE - 1 ) ) | | force_uptodate ) & &
! PageUptodate ( page ) ) {
2011-02-28 17:52:08 +03:00
ret = btrfs_readpage ( NULL , page ) ;
if ( ret )
return ret ;
lock_page ( page ) ;
if ( ! PageUptodate ( page ) ) {
unlock_page ( page ) ;
return - EIO ;
}
}
return 0 ;
}
2007-06-12 14:35:45 +04:00
/*
2013-12-10 15:25:04 +04:00
* this just gets pages into the page cache and locks them down .
2007-06-12 14:35:45 +04:00
*/
2013-12-10 15:25:03 +04:00
static noinline int prepare_pages ( struct inode * inode , struct page * * pages ,
size_t num_pages , loff_t pos ,
size_t write_bytes , bool force_uptodate )
2007-06-12 14:35:45 +04:00
{
int i ;
unsigned long index = pos > > PAGE_CACHE_SHIFT ;
2011-09-21 23:05:58 +04:00
gfp_t mask = btrfs_alloc_write_mask ( inode - > i_mapping ) ;
2013-12-13 23:39:34 +04:00
int err = 0 ;
2013-12-10 15:25:04 +04:00
int faili ;
2007-06-18 17:57:58 +04:00
2007-06-12 14:35:45 +04:00
for ( i = 0 ; i < num_pages ; i + + ) {
2011-07-11 18:47:06 +04:00
pages [ i ] = find_or_create_page ( inode - > i_mapping , index + i ,
2012-01-11 03:07:55 +04:00
mask | __GFP_WRITE ) ;
2007-06-12 14:35:45 +04:00
if ( ! pages [ i ] ) {
2011-02-28 17:52:08 +03:00
faili = i - 1 ;
err = - ENOMEM ;
goto fail ;
}
if ( i = = 0 )
2011-09-30 23:23:54 +04:00
err = prepare_uptodate_page ( pages [ i ] , pos ,
force_uptodate ) ;
2011-02-28 17:52:08 +03:00
if ( i = = num_pages - 1 )
err = prepare_uptodate_page ( pages [ i ] ,
2011-09-30 23:23:54 +04:00
pos + write_bytes , false ) ;
2011-02-28 17:52:08 +03:00
if ( err ) {
page_cache_release ( pages [ i ] ) ;
faili = i - 1 ;
goto fail ;
2007-06-12 14:35:45 +04:00
}
2007-06-28 23:57:36 +04:00
wait_on_page_writeback ( pages [ i ] ) ;
2007-06-12 14:35:45 +04:00
}
2013-12-10 15:25:04 +04:00
return 0 ;
fail :
while ( faili > = 0 ) {
unlock_page ( pages [ faili ] ) ;
page_cache_release ( pages [ faili ] ) ;
faili - - ;
}
return err ;
}
/*
 * Lock the extent range backing the prepared pages and, if needed, wait
 * for ordered extents in that range before the pages may be modified.
 *
 * The range is only locked when it starts below the current i_size.
 *
 * Return value:
 *   1       - the extent is locked; *lockstart / *lockend hold the range
 *   0       - the extent is not locked, and everything is OK
 *   -EAGAIN - an ordered extent overlapped the range; the pages have been
 *             unlocked and released and must be prepared again
 *   other negative values - something went wrong
 */
static noinline int
lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
				size_t num_pages, loff_t pos,
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
	u64 first_byte = pos & ~((u64)PAGE_CACHE_SIZE - 1);
	u64 final_byte = first_byte +
			 ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
	int locked = 0;
	int idx;

	if (first_byte < inode->i_size) {
		struct btrfs_ordered_extent *ordered;

		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 first_byte, final_byte, 0, cached_state);
		ordered = btrfs_lookup_ordered_range(inode, first_byte,
						     final_byte - first_byte + 1);
		if (ordered) {
			if (ordered->file_offset + ordered->len > first_byte &&
			    ordered->file_offset <= final_byte) {
				/*
				 * Drop the extent lock and every page before
				 * kicking the ordered extent, then ask the
				 * caller to start over.
				 */
				unlock_extent_cached(&BTRFS_I(inode)->io_tree,
						     first_byte, final_byte,
						     cached_state, GFP_NOFS);
				for (idx = 0; idx < num_pages; idx++) {
					unlock_page(pages[idx]);
					page_cache_release(pages[idx]);
				}
				btrfs_start_ordered_extent(inode, ordered, 1);
				btrfs_put_ordered_extent(ordered);
				return -EAGAIN;
			}
			btrfs_put_ordered_extent(ordered);
		}

		clear_extent_bit(&BTRFS_I(inode)->io_tree, first_byte,
				 final_byte, EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
				 0, 0, cached_state, GFP_NOFS);
		*lockstart = first_byte;
		*lockend = final_byte;
		locked = 1;
	}

	for (idx = 0; idx < num_pages; idx++) {
		if (clear_page_dirty_for_io(pages[idx]))
			account_page_redirty(pages[idx]);
		set_page_extent_mapped(pages[idx]);
		WARN_ON(!PageLocked(pages[idx]));
	}

	return locked;
}
2013-06-22 00:37:03 +04:00
/*
 * Check whether a write at @pos can be done in place (nocow).
 *
 * The sector-aligned range covering [@pos, @pos + *@write_bytes) is locked,
 * any ordered extents found in it are started and the lookup retried,
 * and can_nocow_extent() is then asked about the range.
 *
 * Returns:
 *   -ENOSPC - btrfs_start_nocow_write() returned 0
 *   0       - can_nocow_extent() returned <= 0; the nocow write has been
 *             ended again
 *   > 0     - value from can_nocow_extent(); *@write_bytes has been
 *             clamped to what it reported, and the nocow write started
 *             here is not ended by this function
 */
static noinline int check_can_nocow(struct inode *inode, loff_t pos,
				    size_t *write_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_ordered_extent *ordered;
	u64 range_start, range_end;
	u64 nocow_bytes;
	int ret;

	ret = btrfs_start_nocow_write(root);
	if (!ret)
		return -ENOSPC;

	/*
	 * The write itself may be unaligned; the locked range has to cover
	 * every sector it touches.
	 */
	range_start = round_down(pos, root->sectorsize);
	range_end = round_up(pos + *write_bytes, root->sectorsize) - 1;

	for (;;) {
		lock_extent(&BTRFS_I(inode)->io_tree, range_start, range_end);
		ordered = btrfs_lookup_ordered_range(inode, range_start,
						     range_end - range_start + 1);
		if (ordered == NULL)
			break;

		/* Drop the lock while the ordered extent is dealt with. */
		unlock_extent(&BTRFS_I(inode)->io_tree, range_start, range_end);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
	}

	nocow_bytes = range_end - range_start + 1;
	ret = can_nocow_extent(inode, range_start, &nocow_bytes,
			       NULL, NULL, NULL);
	if (ret > 0) {
		/* nocow_bytes counts from range_start, not from pos. */
		*write_bytes = min_t(size_t, *write_bytes,
				     nocow_bytes - pos + range_start);
	} else {
		ret = 0;
		btrfs_end_nocow_write(root);
	}

	unlock_extent(&BTRFS_I(inode)->io_tree, range_start, range_end);

	return ret;
}
2011-01-25 22:57:24 +03:00
/*
 * Copy the data described by @i into the page cache of @file starting at
 * @pos, one batch of pages at a time.
 *
 * For every batch the function reserves data space (or, when that fails
 * with -ENOSPC on a NODATACOW/PREALLOC inode, checks whether the range can
 * be written nocow and then only reserves metadata), prepares and locks
 * the pages, copies from user space, marks the copied pages dirty and
 * releases whatever part of the reservation was not used.
 *
 * Returns the number of bytes written if any were written, otherwise 0 or
 * a negative errno (-ENOMEM if the page array cannot be allocated).
 */
static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	struct extent_state *cached_state = NULL;
	u64 release_bytes = 0;
	u64 lockstart;
	u64 lockend;
	size_t num_written = 0;
	int nrptrs;
	int ret = 0;
	bool only_release_metadata = false;
	bool force_page_uptodate = false;
	bool need_unlock;

	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	while (iov_iter_count(i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
		ret = btrfs_check_data_free_space(inode, reserve_bytes);
		if (ret == -ENOSPC &&
		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					      BTRFS_INODE_PREALLOC))) {
			ret = check_can_nocow(inode, pos, &write_bytes);
			if (ret > 0) {
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = (write_bytes + offset +
					     PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;
				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
				ret = 0;
			} else {
				ret = -ENOSPC;
			}
		}

		if (ret)
			break;

		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode,
							       reserve_bytes);
			else
				btrfs_end_nocow_write(root);
			break;
		}

		release_bytes = reserve_bytes;
		need_unlock = false;
again:
		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret)
			break;

		ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
						      pos, &lockstart, &lockend,
						      &cached_state);
		if (ret < 0) {
			/* -EAGAIN: the pages were dropped, prepare them again */
			if (ret == -EAGAIN)
				goto again;
			break;
		} else if (ret > 0) {
			need_unlock = true;
			ret = 0;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					      write_bytes, pages, i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				PAGE_CACHE_SHIFT;
		}

		/*
		 * If we had a short copy we need to release the excess delalloc
		 * bytes we reserved.  We need to increment outstanding_extents
		 * because btrfs_delalloc_release_space will decrement it, but
		 * we still have an outstanding extent for the chunk we actually
		 * managed to copy.
		 */
		if (num_pages > dirty_pages) {
			release_bytes = (num_pages - dirty_pages) <<
				PAGE_CACHE_SHIFT;
			if (copied > 0) {
				spin_lock(&BTRFS_I(inode)->lock);
				BTRFS_I(inode)->outstanding_extents++;
				spin_unlock(&BTRFS_I(inode)->lock);
			}
			if (only_release_metadata)
				btrfs_delalloc_release_metadata(inode,
								release_bytes);
			else
				btrfs_delalloc_release_space(inode,
							     release_bytes);
		}

		/* From here on only the dirtied part is still reserved. */
		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;

		if (copied > 0)
			ret = btrfs_dirty_pages(root, inode, pages,
						dirty_pages, pos, copied,
						NULL);
		if (need_unlock)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state,
					     GFP_NOFS);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		release_bytes = 0;
		if (only_release_metadata) {
			btrfs_end_nocow_write(root);

			if (copied > 0) {
				u64 norsv_start = round_down(pos,
							     root->sectorsize);
				u64 norsv_end = norsv_start +
					(dirty_pages << PAGE_CACHE_SHIFT) - 1;

				set_extent_bit(&BTRFS_I(inode)->io_tree,
					       norsv_start, norsv_end,
					       EXTENT_NORESERVE, NULL,
					       NULL, GFP_NOFS);
			}

			/*
			 * The nocow write has been ended, so the flag must
			 * not survive into the next iteration even when
			 * nothing was copied.  Otherwise a later batch that
			 * did reserve data space would take the metadata-only
			 * release paths and end a nocow write it never
			 * started.
			 */
			only_release_metadata = false;
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);
		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
			btrfs_btree_balance_dirty(root);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	/* A batch was abandoned with part of its reservation still held. */
	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_end_nocow_write(root);
			btrfs_delalloc_release_metadata(inode, release_bytes);
		} else {
			btrfs_delalloc_release_space(inode, release_bytes);
		}
	}

	return num_written ? num_written : ret;
}
/*
 * Direct I/O write with a buffered fallback.
 *
 * generic_file_direct_write() is tried first.  If it fails or writes all
 * @count bytes, its result is returned.  Otherwise the remainder is
 * written through __btrfs_buffered_write(), written back and waited on,
 * and the page cache for that range is invalidated.
 *
 * Returns the total number of bytes written if non-zero, otherwise the
 * error from the buffered write or the writeback (or 0).
 */
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos,
				    size_t count, size_t ocount)
{
	struct file *file = iocb->ki_filp;
	struct iov_iter iter;
	ssize_t total;
	ssize_t buffered;
	loff_t last_byte;
	int err;

	total = generic_file_direct_write(iocb, iov, &nr_segs, pos,
					  count, ocount);
	if (total < 0 || total == count)
		return total;

	/* Short direct write: finish the rest through the page cache. */
	pos += total;
	count -= total;
	iov_iter_init(&iter, iov, nr_segs, count, total);
	buffered = __btrfs_buffered_write(file, &iter, pos);
	if (buffered < 0) {
		err = buffered;
		return total ? total : err;
	}

	last_byte = pos + buffered - 1;
	err = filemap_write_and_wait_range(file->f_mapping, pos, last_byte);
	if (err)
		return total ? total : err;

	total += buffered;
	iocb->ki_pos = pos + buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
				 last_byte >> PAGE_CACHE_SHIFT);

	return total ? total : err;
}
2008-01-03 21:46:11 +03:00
2012-11-09 19:53:21 +04:00
static void update_time_for_write ( struct inode * inode )
{
struct timespec now ;
if ( IS_NOCMTIME ( inode ) )
return ;
now = current_fs_time ( inode - > i_sb ) ;
if ( ! timespec_equal ( & inode - > i_mtime , & now ) )
inode - > i_mtime = now ;
if ( ! timespec_equal ( & inode - > i_ctime , & now ) )
inode - > i_ctime = now ;
if ( IS_I_VERSION ( inode ) )
inode_inc_iversion ( inode ) ;
}
2011-01-25 22:57:24 +03:00
/*
 * aio write entry point.
 *
 * Under i_mutex: validates the iovec and the write range, strips suid,
 * refuses the write with -EROFS if the filesystem is in the error state,
 * updates the timestamps, expands the file when the write starts beyond
 * i_size, and then performs either a direct or a buffered write.
 *
 * After dropping i_mutex the inode's last_trans/last_sub_trans are updated
 * and generic_write_sync() is called for the written range.
 *
 * Returns the number of bytes written if non-zero, otherwise 0 or a
 * negative errno.
 */
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 start_pos;
	u64 end_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count, ocount;
	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err)
		goto unlock_fail;
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto unlock_fail;

	/* Nothing to write; err is 0 here. */
	if (count == 0)
		goto unlock_fail;

	err = file_remove_suid(file);
	if (err)
		goto unlock_fail;

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
		err = -EROFS;
		goto unlock_fail;
	}

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there.  We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	update_time_for_write(inode);

	start_pos = round_down(pos, root->sectorsize);
	if (start_pos > i_size_read(inode)) {
		/* Expand hole size to cover write data, preventing empty gap */
		end_pos = round_up(pos + count, root->sectorsize);
		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
		if (err)
			goto unlock_fail;
	}

	if (sync)
		atomic_inc(&BTRFS_I(inode)->sync_writers);

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
						   pos, count, ocount);
	} else {
		struct iov_iter iter;

		iov_iter_init(&iter, iov, nr_segs, count, num_written);

		num_written = __btrfs_buffered_write(file, &iter, pos);
		if (num_written > 0)
			iocb->ki_pos = pos + num_written;
	}

	mutex_unlock(&inode->i_mutex);

	/*
	 * We want to make sure fsync finds this change, but we haven't
	 * joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1; this will
	 * either be one more than the running transaction or the generation
	 * used for the next transaction if there isn't one running right now.
	 *
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
	BTRFS_I(inode)->last_sub_trans = root->log_transid;
	if (num_written > 0) {
		err = generic_write_sync(file, pos, num_written);
		if (err < 0)
			num_written = err;
	}

	if (sync)
		atomic_dec(&BTRFS_I(inode)->sync_writers);

	current->backing_dev_info = NULL;
	return num_written ? num_written : err;

unlock_fail:
	/*
	 * Early exits taken with i_mutex held and before anything was
	 * written, so num_written is still 0 and err is the result.
	 */
	mutex_unlock(&inode->i_mutex);
	current->backing_dev_info = NULL;
	return err;
}
2009-01-06 05:25:51 +03:00
int btrfs_release_file ( struct inode * inode , struct file * filp )
2008-05-27 18:55:43 +04:00
{
2009-03-31 21:27:11 +04:00
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non - zero size to a zero size . This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place .
*/
2012-05-23 22:13:11 +04:00
if ( test_and_clear_bit ( BTRFS_INODE_ORDERED_DATA_CLOSE ,
& BTRFS_I ( inode ) - > runtime_flags ) ) {
2013-02-13 20:09:14 +04:00
struct btrfs_trans_handle * trans ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
/*
* We need to block on a committing transaction to keep us from
* throwing a ordered operation on to the list and causing
* something like sync to deadlock trying to flush out this
* inode .
*/
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) )
return PTR_ERR ( trans ) ;
btrfs_add_ordered_operation ( trans , BTRFS_I ( inode ) - > root , inode ) ;
btrfs_end_transaction ( trans , root ) ;
2009-03-31 21:27:11 +04:00
if ( inode - > i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT )
filemap_flush ( inode - > i_mapping ) ;
}
2008-06-10 18:07:39 +04:00
if ( filp - > private_data )
btrfs_ioctl_trans_end ( filp ) ;
2008-05-27 18:55:43 +04:00
return 0 ;
}
2008-09-29 23:18:18 +04:00
/*
* fsync call for both files and directories . This logs the inode into
* the tree log instead of forcing full commits whenever possible .
*
* It needs to call filemap_fdatawait so that all ordered extent updates are
* in the metadata btree are up to date for copying to the log .
*
* It drops the inode mutex before doing the tree log commit . This is an
* important optimization for directories because holding the mutex prevents
* new operations on the dir while we write to disk .
*/
2011-07-17 04:44:56 +04:00
int btrfs_sync_file ( struct file * file , loff_t start , loff_t end , int datasync )
2007-06-12 14:35:45 +04:00
{
2010-05-26 19:53:25 +04:00
struct dentry * dentry = file - > f_path . dentry ;
2007-06-12 14:35:45 +04:00
struct inode * inode = dentry - > d_inode ;
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct btrfs_trans_handle * trans ;
2014-02-20 14:08:58 +04:00
struct btrfs_log_ctx ctx ;
int ret = 0 ;
2012-10-12 23:27:49 +04:00
bool full_sync = 0 ;
2007-06-12 14:35:45 +04:00
Btrfs: add initial tracepoint support for btrfs
Tracepoints can provide insight into why btrfs hits bugs and be greatly
helpful for debugging, e.g
dd-7822 [000] 2121.641088: btrfs_inode_request: root = 5(FS_TREE), gen = 4, ino = 256, blocks = 8, disk_i_size = 0, last_trans = 8, logged_trans = 0
dd-7822 [000] 2121.641100: btrfs_inode_new: root = 5(FS_TREE), gen = 8, ino = 257, blocks = 0, disk_i_size = 0, last_trans = 0, logged_trans = 0
btrfs-transacti-7804 [001] 2146.935420: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29368320 (orig_level = 0), cow_buf = 29388800 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.935473: btrfs_cow_block: root = 1(ROOT_TREE), refs = 2, orig_buf = 29364224 (orig_level = 0), cow_buf = 29392896 (cow_level = 0)
btrfs-transacti-7804 [001] 2146.972221: btrfs_transaction_commit: root = 1(ROOT_TREE), gen = 8
flush-btrfs-2-7821 [001] 2155.824210: btrfs_chunk_alloc: root = 3(CHUNK_TREE), offset = 1103101952, size = 1073741824, num_stripes = 1, sub_stripes = 0, type = DATA
flush-btrfs-2-7821 [001] 2155.824241: btrfs_cow_block: root = 2(EXTENT_TREE), refs = 2, orig_buf = 29388800 (orig_level = 0), cow_buf = 29396992 (cow_level = 0)
flush-btrfs-2-7821 [001] 2155.824255: btrfs_cow_block: root = 4(DEV_TREE), refs = 2, orig_buf = 29372416 (orig_level = 0), cow_buf = 29401088 (cow_level = 0)
flush-btrfs-2-7821 [000] 2155.824329: btrfs_cow_block: root = 3(CHUNK_TREE), refs = 2, orig_buf = 20971520 (orig_level = 0), cow_buf = 20975616 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898019: btrfs_cow_block: root = 5(FS_TREE), refs = 2, orig_buf = 29384704 (orig_level = 0), cow_buf = 29405184 (cow_level = 0)
btrfs-endio-wri-7800 [001] 2155.898043: btrfs_cow_block: root = 7(CSUM_TREE), refs = 2, orig_buf = 29376512 (orig_level = 0), cow_buf = 29409280 (cow_level = 0)
Here is what I have added:
1) ordere_extent:
btrfs_ordered_extent_add
btrfs_ordered_extent_remove
btrfs_ordered_extent_start
btrfs_ordered_extent_put
These provide critical information to understand how ordered_extents are
updated.
2) extent_map:
btrfs_get_extent
extent_map is used in both read and write cases, and it is useful for tracking
how btrfs specific IO is running.
3) writepage:
__extent_writepage
btrfs_writepage_end_io_hook
Pages are cirtical resourses and produce a lot of corner cases during writeback,
so it is valuable to know how page is written to disk.
4) inode:
btrfs_inode_new
btrfs_inode_request
btrfs_inode_evict
These can show where and when a inode is created, when a inode is evicted.
5) sync:
btrfs_sync_file
btrfs_sync_fs
These show sync arguments.
6) transaction:
btrfs_transaction_commit
In transaction based filesystem, it will be useful to know the generation and
who does commit.
7) back reference and cow:
btrfs_delayed_tree_ref
btrfs_delayed_data_ref
btrfs_delayed_ref_head
btrfs_cow_block
Btrfs natively supports back references, these tracepoints are helpful on
understanding btrfs's COW mechanism.
8) chunk:
btrfs_chunk_alloc
btrfs_chunk_free
Chunk is a link between physical offset and logical offset, and stands for space
infomation in btrfs, and these are helpful on tracing space things.
9) reserved_extent:
btrfs_reserved_extent_alloc
btrfs_reserved_extent_free
These can show how btrfs uses its space.
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-03-24 14:18:59 +03:00
trace_btrfs_sync_file ( file , datasync ) ;
2009-10-13 21:21:08 +04:00
2012-09-13 14:53:47 +04:00
/*
* We write the dirty pages in the range and wait until they complete
* out of the - > i_mutex . If so , we can flush the dirty pages by
2012-10-12 23:27:49 +04:00
* multi - task , and make the performance up . See
* btrfs_wait_ordered_range for an explanation of the ASYNC check .
2012-09-13 14:53:47 +04:00
*/
2012-11-16 22:56:32 +04:00
atomic_inc ( & BTRFS_I ( inode ) - > sync_writers ) ;
2012-10-12 23:27:49 +04:00
ret = filemap_fdatawrite_range ( inode - > i_mapping , start , end ) ;
if ( ! ret & & test_bit ( BTRFS_INODE_HAS_ASYNC_EXTENT ,
& BTRFS_I ( inode ) - > runtime_flags ) )
ret = filemap_fdatawrite_range ( inode - > i_mapping , start , end ) ;
2012-11-16 22:56:32 +04:00
atomic_dec ( & BTRFS_I ( inode ) - > sync_writers ) ;
2012-09-13 14:53:47 +04:00
if ( ret )
return ret ;
2011-07-17 04:44:56 +04:00
mutex_lock ( & inode - > i_mutex ) ;
2012-04-23 23:09:39 +04:00
/*
2012-09-13 14:53:47 +04:00
* We flush the dirty pages again to avoid some dirty pages in the
* range being left .
2012-04-23 23:09:39 +04:00
*/
2012-09-06 14:04:27 +04:00
atomic_inc ( & root - > log_batch ) ;
2012-10-12 23:27:49 +04:00
full_sync = test_bit ( BTRFS_INODE_NEEDS_FULL_SYNC ,
& BTRFS_I ( inode ) - > runtime_flags ) ;
2013-10-26 00:13:35 +04:00
if ( full_sync ) {
ret = btrfs_wait_ordered_range ( inode , start , end - start + 1 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
goto out ;
}
}
2012-09-06 14:04:27 +04:00
atomic_inc ( & root - > log_batch ) ;
2009-10-13 21:21:08 +04:00
2007-06-12 14:35:45 +04:00
/*
2007-08-11 00:22:09 +04:00
* check the transaction that last modified this inode
* and see if its already been committed
2007-06-12 14:35:45 +04:00
*/
2011-07-17 04:44:56 +04:00
if ( ! BTRFS_I ( inode ) - > last_trans ) {
mutex_unlock ( & inode - > i_mutex ) ;
2007-08-11 00:22:09 +04:00
goto out ;
2011-07-17 04:44:56 +04:00
}
2008-06-26 00:01:30 +04:00
2009-10-13 21:21:08 +04:00
/*
* if the last transaction that changed this file was before
* the current transaction , we can bail out now without any
* syncing
*/
2011-04-12 01:25:13 +04:00
smp_mb ( ) ;
2012-05-30 00:57:49 +04:00
if ( btrfs_inode_in_log ( inode , root - > fs_info - > generation ) | |
BTRFS_I ( inode ) - > last_trans < =
2007-08-11 00:22:09 +04:00
root - > fs_info - > last_trans_committed ) {
BTRFS_I ( inode ) - > last_trans = 0 ;
Btrfs: turbo charge fsync
At least for the vm workload. Currently on fsync we will
1) Truncate all items in the log tree for the given inode if they exist
and
2) Copy all items for a given inode into the log
The problem with this is that for things like VMs you can have lots of
extents from the fragmented writing behavior, and worst yet you may have
only modified a few extents, not the entire thing. This patch fixes this
problem by tracking which transid modified our extent, and then when we do
the tree logging we find all of the extents we've modified in our current
transaction, sort them and commit them. We also only truncate up to the
xattrs of the inode and copy that stuff in normally, and then just drop any
extents in the range we have that exist in the log already. Here are some
numbers of a 50 meg fio job that does random writes and fsync()s after every
write
Original Patched
SATA drive 82KB/s 140KB/s
Fusion drive 431KB/s 2532KB/s
So around 2-6 times faster depending on your hardware. There are a few
corner cases, for example if you truncate at all we have to do it the old
way since there is no way to be sure what is in the log is ok. This
probably could be done smarter, but if you write-fsync-truncate-write-fsync
you deserve what you get. All this work is in RAM of course so if your
inode gets evicted from cache and you read it in and fsync it we'll do it
the slow way if we are still in the same transaction that we last modified
the inode in.
The biggest cool part of this is that it requires no changes to the recovery
code, so if you fsync with this patch and crash and load an old kernel, it
will run the recovery and be a-ok. I have tested this pretty thoroughly
with an fsync tester and everything comes back fine, as well as xfstests.
Thanks,
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
2012-08-17 21:14:17 +04:00
/*
* We ' v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
* reason , it ' s no longer relevant .
*/
clear_bit ( BTRFS_INODE_NEEDS_FULL_SYNC ,
& BTRFS_I ( inode ) - > runtime_flags ) ;
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2007-08-11 00:22:09 +04:00
goto out ;
}
/*
2007-08-28 00:49:44 +04:00
* ok we haven ' t committed the transaction yet , lets do a commit
*/
2010-05-29 13:49:07 +04:00
if ( file - > private_data )
2008-06-10 18:07:39 +04:00
btrfs_ioctl_trans_end ( file ) ;
2014-01-15 22:34:13 +04:00
/*
* We use start here because we will need to wait on the IO to complete
* in btrfs_sync_log , which could require joining a transaction ( for
* example checking cross references in the nocow path ) . If we use join
* here we could get into a situation where we ' re waiting on IO to
* happen that is blocked on a transaction trying to commit . With start
* we inc the extwriter counter , so we wait for all extwriters to exit
* before we start blocking join ' ers . This comment is to keep somebody
* from thinking they are super smart and changing this to
* btrfs_join_transaction * cough * Josef * cough * .
*/
2010-05-16 18:48:46 +04:00
trans = btrfs_start_transaction ( root , 0 ) ;
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2007-06-12 14:35:45 +04:00
goto out ;
}
2014-01-15 22:34:13 +04:00
trans - > sync = true ;
2008-09-06 00:13:11 +04:00
2014-02-20 14:08:58 +04:00
btrfs_init_log_ctx ( & ctx ) ;
ret = btrfs_log_dentry_safe ( trans , root , dentry , & ctx ) ;
2011-07-17 04:44:56 +04:00
if ( ret < 0 ) {
2013-09-11 23:36:44 +04:00
/* Fallthrough and commit/free transaction. */
ret = 1 ;
2011-07-17 04:44:56 +04:00
}
2008-09-11 23:53:12 +04:00
/* we've logged all the items and now have a consistent
* version of the file in the log . It is possible that
* someone will come in and modify the file , but that ' s
* fine because the log is consistent on disk , and we
* have references to all of the file ' s extents
*
* It is possible that someone will come in and log the
* file again , but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe .
*/
2011-07-17 04:44:56 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
2008-09-11 23:53:12 +04:00
2009-10-13 21:21:08 +04:00
if ( ret ! = BTRFS_NO_LOG_SYNC ) {
2013-10-26 00:13:35 +04:00
if ( ! ret ) {
2014-02-20 14:08:58 +04:00
ret = btrfs_sync_log ( trans , root , & ctx ) ;
2013-10-26 00:13:35 +04:00
if ( ! ret ) {
2009-10-13 21:21:08 +04:00
ret = btrfs_end_transaction ( trans , root ) ;
2013-10-26 00:13:35 +04:00
goto out ;
2012-10-12 23:27:49 +04:00
}
2009-10-13 21:21:08 +04:00
}
2013-10-26 00:13:35 +04:00
if ( ! full_sync ) {
ret = btrfs_wait_ordered_range ( inode , start ,
end - start + 1 ) ;
if ( ret )
goto out ;
}
ret = btrfs_commit_transaction ( trans , root ) ;
2009-10-13 21:21:08 +04:00
} else {
ret = btrfs_end_transaction ( trans , root ) ;
2008-09-06 00:13:11 +04:00
}
2007-06-12 14:35:45 +04:00
out :
2010-01-29 13:42:11 +03:00
return ret > 0 ? - EIO : ret ;
2007-06-12 14:35:45 +04:00
}
2009-09-27 22:29:37 +04:00
static const struct vm_operations_struct btrfs_file_vm_ops = {
2007-07-25 20:31:35 +04:00
. fault = filemap_fault ,
2014-04-08 02:37:19 +04:00
. map_pages = filemap_map_pages ,
2007-06-15 21:50:00 +04:00
. page_mkwrite = btrfs_page_mkwrite ,
2012-10-09 03:28:46 +04:00
. remap_pages = generic_file_remap_pages ,
2007-06-15 21:50:00 +04:00
} ;
static int btrfs_file_mmap ( struct file * filp , struct vm_area_struct * vma )
{
2010-05-20 11:21:50 +04:00
struct address_space * mapping = filp - > f_mapping ;
if ( ! mapping - > a_ops - > readpage )
return - ENOEXEC ;
2007-06-15 21:50:00 +04:00
file_accessed ( filp ) ;
2010-05-20 11:21:50 +04:00
vma - > vm_ops = & btrfs_file_vm_ops ;
2007-06-15 21:50:00 +04:00
return 0 ;
}
2012-08-29 22:27:18 +04:00
/*
 * Check whether the item at @slot in @leaf can absorb the range
 * [@start, @end).
 *
 * The item qualifies when it is an EXTENT_DATA item of @inode, of type
 * BTRFS_FILE_EXTENT_REG with a zero disk bytenr, and it either begins
 * exactly at @end or finishes exactly at @start.
 *
 * Returns 1 if it can be merged with, 0 otherwise (including when @slot is
 * outside the leaf).
 */
static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
			  int slot, u64 start, u64 end)
{
	struct btrfs_file_extent_item *item;
	struct btrfs_key found;
	u64 item_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &found, slot);
	if (found.objectid != btrfs_ino(inode) ||
	    found.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	item = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, item) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, item))
		return 0;

	/* The item starts right where our range stops. */
	if (found.offset == end)
		return 1;

	/* The item stops right where our range starts. */
	item_end = found.offset + btrfs_file_extent_num_bytes(leaf, item);
	return item_end == start ? 1 : 0;
}
/*
 * Record [@offset, @end) as a hole, both on disk and in the extent map cache.
 *
 * Unless the filesystem has the NO_HOLES incompat bit, the file extent tree
 * is updated first: an adjacent mergeable item (see hole_mergeable()) is
 * grown to cover the range, otherwise a new file extent item is inserted.
 * Afterwards a hole extent map replaces whatever was cached for the range;
 * if that map cannot be allocated or added, the inode is flagged
 * BTRFS_INODE_NEEDS_FULL_SYNC instead.
 *
 * Returns 0 on success or a negative errno from the tree search / insert.
 */
static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
		      struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
	struct btrfs_key key;
	bool merged = false;
	int ret;

	if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
		key.objectid = btrfs_ino(inode);
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = offset;

		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			return ret;
		/* An exact match at @offset is not expected here. */
		BUG_ON(!ret);

		leaf = path->nodes[0];
		if (hole_mergeable(inode, leaf, path->slots[0] - 1,
				   offset, end)) {
			/* Grow the previous item forward. */
			path->slots[0]--;
			merged = true;
		} else if (hole_mergeable(inode, leaf, path->slots[0] + 1,
					  offset, end)) {
			/* Pull the next item's start back to @offset. */
			path->slots[0]++;
			key.offset = offset;
			btrfs_set_item_key_safe(root, path, &key);
			merged = true;
		}

		if (merged) {
			u64 num_bytes;

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
				    end - offset;
			btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
			btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
			btrfs_set_file_extent_offset(leaf, fi, 0);
			btrfs_mark_buffer_dirty(leaf);
		} else {
			btrfs_release_path(path);
			ret = btrfs_insert_file_extent(trans, root,
						       btrfs_ino(inode), offset,
						       0, 0, end - offset, 0,
						       end - offset, 0, 0, 0);
			if (ret)
				return ret;
		}
	}

	btrfs_release_path(path);

	hole_em = alloc_extent_map();
	if (!hole_em) {
		/* No cached map: drop the stale range, force a full sync. */
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);
		return 0;
	}

	hole_em->start = offset;
	hole_em->orig_start = offset;
	hole_em->len = end - offset;
	hole_em->ram_bytes = hole_em->len;
	hole_em->block_start = EXTENT_MAP_HOLE;
	hole_em->block_len = 0;
	hole_em->orig_block_len = 0;
	hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
	hole_em->compress_type = BTRFS_COMPRESS_NONE;
	hole_em->generation = trans->transid;

	/* Retry for as long as another mapping for the range keeps appearing. */
	do {
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, hole_em, 1);
		write_unlock(&em_tree->lock);
	} while (ret == -EEXIST);
	free_extent_map(hole_em);

	if (ret)
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	return 0;
}
/*
 * Punch a hole of @len bytes starting at @offset in @inode.
 *
 * The partial pages at either edge of the range are zeroed with
 * btrfs_truncate_page() (only when they lie below i_size rounded up to a
 * page).  The sector-aligned interior [lockstart, lockend] is then locked,
 * its extents are dropped with __btrfs_drop_extents() and, for the part
 * below the rounded i_size, hole extents are recorded through fill_holes().
 * When dropping runs out of reserved space (-ENOSPC) the transaction is
 * ended and restarted and the loop continues from drop_end.
 *
 * Returns 0 on success or a negative errno.
 */
static int btrfs_punch_hole ( struct inode * inode , loff_t offset , loff_t len )
{
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
struct extent_state * cached_state = NULL ;
struct btrfs_path * path ;
struct btrfs_block_rsv * rsv ;
struct btrfs_trans_handle * trans ;
2012-12-05 14:54:12 +04:00
u64 lockstart = round_up ( offset , BTRFS_I ( inode ) - > root - > sectorsize ) ;
u64 lockend = round_down ( offset + len ,
BTRFS_I ( inode ) - > root - > sectorsize ) - 1 ;
2012-08-29 22:27:18 +04:00
u64 cur_offset = lockstart ;
u64 min_size = btrfs_calc_trunc_metadata_size ( root , 1 ) ;
u64 drop_end ;
int ret = 0 ;
int err = 0 ;
2013-10-22 20:18:51 +04:00
int rsv_count ;
2012-12-05 14:53:45 +04:00
bool same_page = ( ( offset > > PAGE_CACHE_SHIFT ) = =
( ( offset + len - 1 ) > > PAGE_CACHE_SHIFT ) ) ;
2013-10-22 20:18:51 +04:00
bool no_holes = btrfs_fs_incompat ( root - > fs_info , NO_HOLES ) ;
2014-04-26 04:35:31 +04:00
u64 ino_size ;
2012-08-29 22:27:18 +04:00
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , offset , len ) ;
if ( ret )
return ret ;
2012-08-29 22:27:18 +04:00
mutex_lock ( & inode - > i_mutex ) ;
2014-04-26 04:35:31 +04:00
ino_size = round_up ( inode - > i_size , PAGE_CACHE_SIZE ) ;
2012-12-05 14:54:52 +04:00
/*
* We needn ' t truncate any page which is beyond the end of the file
* because we are sure there is no data there .
*/
2012-08-29 22:27:18 +04:00
/*
* Only do this if we are in the same page and we aren ' t doing the
* entire page .
*/
if ( same_page & & len < PAGE_CACHE_SIZE ) {
2014-02-15 19:55:58 +04:00
if ( offset < ino_size )
2012-12-05 14:54:52 +04:00
ret = btrfs_truncate_page ( inode , offset , len , 0 ) ;
2012-08-29 22:27:18 +04:00
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
/* zero back part of the first page */
2014-02-15 19:55:58 +04:00
if ( offset < ino_size ) {
2012-12-05 14:54:52 +04:00
ret = btrfs_truncate_page ( inode , offset , 0 , 0 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
2012-08-29 22:27:18 +04:00
}
/* zero the front end of the last page */
2014-02-15 19:55:58 +04:00
if ( offset + len < ino_size ) {
2012-12-05 14:54:12 +04:00
ret = btrfs_truncate_page ( inode , offset + len , 0 , 1 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
2012-08-29 22:27:18 +04:00
}
/* No whole sector lies inside the range: the edge zeroing above was all there is to do. */
if ( lockend < lockstart ) {
mutex_unlock ( & inode - > i_mutex ) ;
return 0 ;
}
/* Lock the aligned range, retrying until it has no ordered extent and no uptodate page. */
while ( 1 ) {
struct btrfs_ordered_extent * ordered ;
truncate_pagecache_range ( inode , lockstart , lockend ) ;
lock_extent_bits ( & BTRFS_I ( inode ) - > io_tree , lockstart , lockend ,
0 , & cached_state ) ;
ordered = btrfs_lookup_first_ordered_extent ( inode , lockend ) ;
/*
* We need to make sure we have no ordered extents in this range
* and nobody raced in and read a page in this range , if we did
* we need to try again .
*/
if ( ( ! ordered | |
2013-11-19 20:19:24 +04:00
( ordered - > file_offset + ordered - > len < = lockstart | |
2012-08-29 22:27:18 +04:00
ordered - > file_offset > lockend ) ) & &
! test_range_bit ( & BTRFS_I ( inode ) - > io_tree , lockstart ,
lockend , EXTENT_UPTODATE , 0 ,
cached_state ) ) {
if ( ordered )
btrfs_put_ordered_extent ( ordered ) ;
break ;
}
if ( ordered )
btrfs_put_ordered_extent ( ordered ) ;
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree , lockstart ,
lockend , & cached_state , GFP_NOFS ) ;
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , lockstart ,
lockend - lockstart + 1 ) ;
if ( ret ) {
mutex_unlock ( & inode - > i_mutex ) ;
return ret ;
}
2012-08-29 22:27:18 +04:00
}
path = btrfs_alloc_path ( ) ;
if ( ! path ) {
ret = - ENOMEM ;
goto out ;
}
2012-09-06 14:02:28 +04:00
rsv = btrfs_alloc_block_rsv ( root , BTRFS_BLOCK_RSV_TEMP ) ;
2012-08-29 22:27:18 +04:00
if ( ! rsv ) {
ret = - ENOMEM ;
goto out_free ;
}
rsv - > size = btrfs_calc_trunc_metadata_size ( root , 1 ) ;
rsv - > failfast = 1 ;
/*
* 1 - update the inode
* 1 - removing the extents in the range
2013-10-22 20:18:51 +04:00
* 1 - adding the hole extent if no_holes isn ' t set
2012-08-29 22:27:18 +04:00
*/
2013-10-22 20:18:51 +04:00
rsv_count = no_holes ? 2 : 3 ;
trans = btrfs_start_transaction ( root , rsv_count ) ;
2012-08-29 22:27:18 +04:00
if ( IS_ERR ( trans ) ) {
err = PTR_ERR ( trans ) ;
goto out_free ;
}
ret = btrfs_block_rsv_migrate ( & root - > fs_info - > trans_block_rsv , rsv ,
min_size ) ;
BUG_ON ( ret ) ;
trans - > block_rsv = rsv ;
/* Drop extents; on -ENOSPC record the hole so far, restart the transaction and continue. */
while ( cur_offset < lockend ) {
ret = __btrfs_drop_extents ( trans , root , inode , path ,
cur_offset , lockend + 1 ,
2014-01-07 15:42:27 +04:00
& drop_end , 1 , 0 , 0 , NULL ) ;
2012-08-29 22:27:18 +04:00
if ( ret ! = - ENOSPC )
break ;
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
2014-02-15 19:55:58 +04:00
if ( cur_offset < ino_size ) {
ret = fill_holes ( trans , inode , path , cur_offset ,
drop_end ) ;
if ( ret ) {
err = ret ;
break ;
}
2012-08-29 22:27:18 +04:00
}
cur_offset = drop_end ;
ret = btrfs_update_inode ( trans , root , inode ) ;
if ( ret ) {
err = ret ;
break ;
}
btrfs_end_transaction ( trans , root ) ;
2012-11-14 18:34:34 +04:00
btrfs_btree_balance_dirty ( root ) ;
2012-08-29 22:27:18 +04:00
2013-10-22 20:18:51 +04:00
trans = btrfs_start_transaction ( root , rsv_count ) ;
2012-08-29 22:27:18 +04:00
if ( IS_ERR ( trans ) ) {
ret = PTR_ERR ( trans ) ;
trans = NULL ;
break ;
}
ret = btrfs_block_rsv_migrate ( & root - > fs_info - > trans_block_rsv ,
rsv , min_size ) ;
BUG_ON ( ret ) ; /* shouldn't happen */
trans - > block_rsv = rsv ;
}
if ( ret ) {
err = ret ;
goto out_trans ;
}
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
2014-02-15 19:55:58 +04:00
if ( cur_offset < ino_size ) {
ret = fill_holes ( trans , inode , path , cur_offset , drop_end ) ;
if ( ret ) {
err = ret ;
goto out_trans ;
}
2012-08-29 22:27:18 +04:00
}
out_trans :
/* trans is NULL only when restarting the transaction inside the loop failed. */
if ( ! trans )
goto out_free ;
2012-11-08 08:47:33 +04:00
inode_inc_iversion ( inode ) ;
inode - > i_mtime = inode - > i_ctime = CURRENT_TIME ;
2012-08-29 22:27:18 +04:00
trans - > block_rsv = & root - > fs_info - > trans_block_rsv ;
ret = btrfs_update_inode ( trans , root , inode ) ;
btrfs_end_transaction ( trans , root ) ;
2012-11-14 18:34:34 +04:00
btrfs_btree_balance_dirty ( root ) ;
2012-08-29 22:27:18 +04:00
out_free :
btrfs_free_path ( path ) ;
btrfs_free_block_rsv ( root , rsv ) ;
out :
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree , lockstart , lockend ,
& cached_state , GFP_NOFS ) ;
mutex_unlock ( & inode - > i_mutex ) ;
/* err holds the first failure; fall back to the last ret if nothing was recorded. */
if ( ret & & ! err )
err = ret ;
return err ;
}
2011-01-14 15:07:43 +03:00
/*
 * ->fallocate handler.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit yields -EOPNOTSUPP.  FALLOC_FL_PUNCH_HOLE is handed to
 * btrfs_punch_hole().  Otherwise data space (and, with quotas enabled,
 * qgroup space) is reserved for the block-aligned range
 * [alloc_start, alloc_end), the range is locked once no ordered extent
 * overlaps it, and every hole -- or non-prealloc extent at or beyond
 * i_size -- is preallocated with btrfs_prealloc_file_range().  The data
 * reservation is released on every exit path after it was taken.
 *
 * Returns 0 on success or a negative errno.
 */
static long btrfs_fallocate ( struct file * file , int mode ,
loff_t offset , loff_t len )
{
2013-01-24 02:07:38 +04:00
struct inode * inode = file_inode ( file ) ;
2011-01-14 15:07:43 +03:00
struct extent_state * cached_state = NULL ;
2013-03-19 14:57:14 +04:00
struct btrfs_root * root = BTRFS_I ( inode ) - > root ;
2011-01-14 15:07:43 +03:00
u64 cur_offset ;
u64 last_byte ;
u64 alloc_start ;
u64 alloc_end ;
u64 alloc_hint = 0 ;
u64 locked_end ;
struct extent_map * em ;
2012-11-28 14:28:07 +04:00
int blocksize = BTRFS_I ( inode ) - > root - > sectorsize ;
2011-01-14 15:07:43 +03:00
int ret ;
2012-11-28 14:28:07 +04:00
alloc_start = round_down ( offset , blocksize ) ;
alloc_end = round_up ( offset + len , blocksize ) ;
2011-01-14 15:07:43 +03:00
2012-08-29 22:27:18 +04:00
/* Make sure we aren't being given an unsupported mode */
if ( mode & ~ ( FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE ) )
2011-01-14 15:07:43 +03:00
return - EOPNOTSUPP ;
2012-08-29 22:27:18 +04:00
if ( mode & FALLOC_FL_PUNCH_HOLE )
return btrfs_punch_hole ( inode , offset , len ) ;
2012-02-01 05:27:41 +04:00
/*
* Make sure we have enough space before we do the
* allocation .
*/
2012-11-28 14:28:54 +04:00
ret = btrfs_check_data_free_space ( inode , alloc_end - alloc_start ) ;
2012-02-01 05:27:41 +04:00
if ( ret )
return ret ;
2013-03-19 14:57:14 +04:00
if ( root - > fs_info - > quota_enabled ) {
ret = btrfs_qgroup_reserve ( root , alloc_end - alloc_start ) ;
if ( ret )
goto out_reserve_fail ;
}
2012-02-01 05:27:41 +04:00
2011-01-14 15:07:43 +03:00
mutex_lock ( & inode - > i_mutex ) ;
ret = inode_newsize_ok ( inode , alloc_end ) ;
if ( ret )
goto out ;
if ( alloc_start > inode - > i_size ) {
2011-01-31 23:30:16 +03:00
ret = btrfs_cont_expand ( inode , i_size_read ( inode ) ,
alloc_start ) ;
2011-01-14 15:07:43 +03:00
if ( ret )
goto out ;
2013-06-18 01:14:39 +04:00
} else {
/*
* If we are fallocating from the end of the file onward we
* need to zero out the end of the page if i_size lands in the
* middle of a page .
*/
ret = btrfs_truncate_page ( inode , inode - > i_size , 0 , 0 ) ;
if ( ret )
goto out ;
2011-01-14 15:07:43 +03:00
}
2013-06-18 01:14:39 +04:00
/*
* wait for ordered IO before we have any locks . We ' ll loop again
* below with the locks held .
*/
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , alloc_start ,
alloc_end - alloc_start ) ;
if ( ret )
goto out ;
2013-06-18 01:14:39 +04:00
2011-01-14 15:07:43 +03:00
locked_end = alloc_end - 1 ;
/* Take the extent lock, backing off and waiting while an ordered extent overlaps the range. */
while ( 1 ) {
struct btrfs_ordered_extent * ordered ;
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent_bits ( & BTRFS_I ( inode ) - > io_tree , alloc_start ,
2012-03-01 17:57:19 +04:00
locked_end , 0 , & cached_state ) ;
2011-01-14 15:07:43 +03:00
ordered = btrfs_lookup_first_ordered_extent ( inode ,
alloc_end - 1 ) ;
if ( ordered & &
ordered - > file_offset + ordered - > len > alloc_start & &
ordered - > file_offset < alloc_end ) {
btrfs_put_ordered_extent ( ordered ) ;
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree ,
alloc_start , locked_end ,
& cached_state , GFP_NOFS ) ;
/*
* we can ' t wait on the range with the transaction
* running or with the extent lock held
*/
2013-10-26 00:13:35 +04:00
ret = btrfs_wait_ordered_range ( inode , alloc_start ,
alloc_end - alloc_start ) ;
if ( ret )
goto out ;
2011-01-14 15:07:43 +03:00
} else {
if ( ordered )
btrfs_put_ordered_extent ( ordered ) ;
break ;
}
}
cur_offset = alloc_start ;
/* Walk the extent maps covering [alloc_start, alloc_end), one extent per iteration. */
while ( 1 ) {
2011-08-18 18:36:39 +04:00
u64 actual_end ;
2011-01-14 15:07:43 +03:00
em = btrfs_get_extent ( inode , NULL , 0 , cur_offset ,
alloc_end - cur_offset , 0 ) ;
2012-03-12 19:03:00 +04:00
if ( IS_ERR_OR_NULL ( em ) ) {
if ( ! em )
ret = - ENOMEM ;
else
ret = PTR_ERR ( em ) ;
break ;
}
2011-01-14 15:07:43 +03:00
last_byte = min ( extent_map_end ( em ) , alloc_end ) ;
2011-08-18 18:36:39 +04:00
actual_end = min_t ( u64 , extent_map_end ( em ) , offset + len ) ;
2012-11-28 14:28:07 +04:00
last_byte = ALIGN ( last_byte , blocksize ) ;
2011-08-18 18:36:39 +04:00
2011-01-14 15:07:43 +03:00
if ( em - > block_start = = EXTENT_MAP_HOLE | |
( cur_offset > = inode - > i_size & &
! test_bit ( EXTENT_FLAG_PREALLOC , & em - > flags ) ) ) {
ret = btrfs_prealloc_file_range ( inode , mode , cur_offset ,
last_byte - cur_offset ,
1 < < inode - > i_blkbits ,
offset + len ,
& alloc_hint ) ;
2011-08-17 18:19:52 +04:00
2011-01-14 15:07:43 +03:00
if ( ret < 0 ) {
free_extent_map ( em ) ;
break ;
}
2011-08-18 18:36:39 +04:00
} else if ( actual_end > inode - > i_size & &
! ( mode & FALLOC_FL_KEEP_SIZE ) ) {
/*
* We didn ' t need to allocate any more space , but we
* still extended the size of the file so we need to
* update i_size .
*/
inode - > i_ctime = CURRENT_TIME ;
i_size_write ( inode , actual_end ) ;
btrfs_ordered_update_i_size ( inode , actual_end , NULL ) ;
2011-01-14 15:07:43 +03:00
}
free_extent_map ( em ) ;
cur_offset = last_byte ;
if ( cur_offset > = alloc_end ) {
ret = 0 ;
break ;
}
}
unlock_extent_cached ( & BTRFS_I ( inode ) - > io_tree , alloc_start , locked_end ,
& cached_state , GFP_NOFS ) ;
out :
mutex_unlock ( & inode - > i_mutex ) ;
2013-03-19 14:57:14 +04:00
if ( root - > fs_info - > quota_enabled )
btrfs_qgroup_free ( root , alloc_end - alloc_start ) ;
out_reserve_fail :
2012-02-01 05:27:41 +04:00
/* Let go of our reservation. */
2012-11-28 14:28:54 +04:00
btrfs_free_reserved_data_space ( inode , alloc_end - alloc_start ) ;
2011-01-14 15:07:43 +03:00
return ret ;
}
2012-12-18 03:59:39 +04:00
/*
 * Back end for SEEK_DATA / SEEK_HOLE.
 *
 * Starting at *@offset, walk the inode's extent maps until one of the wanted
 * kind is found: for SEEK_HOLE a hole or preallocated extent, for SEEK_DATA
 * anything else.  On success *@offset is set to the position found, capped
 * at i_size.
 *
 * Returns 0 on success, -ENXIO if the file is empty or SEEK_DATA ran past
 * i_size, or the error returned by the extent lookup.
 */
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	u64 lockstart = *offset;
	u64 start = *offset;
	u64 lockend;
	u64 len;
	int ret = 0;

	if (inode->i_size == 0)
		return -ENXIO;

	/* Lock from the start offset to i_size, but never less than a sector. */
	lockend = max_t(u64, root->sectorsize, i_size_read(inode));
	if (lockend <= lockstart)
		lockend = lockstart + root->sectorsize;
	lockend--;
	len = max_t(u64, lockend - lockstart + 1, root->sectorsize);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
			 &cached_state);

	while (start < inode->i_size) {
		bool hole_like;

		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			em = NULL;
			break;
		}

		/* Preallocated extents count as holes for seeking. */
		hole_like = em->block_start == EXTENT_MAP_HOLE ||
			    test_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		if (whence == SEEK_HOLE && hole_like)
			break;
		if (whence == SEEK_DATA && !hole_like)
			break;

		start = em->start + em->len;
		free_extent_map(em);
		em = NULL;
		cond_resched();
	}
	free_extent_map(em);

	if (ret == 0) {
		if (whence == SEEK_DATA && start >= inode->i_size)
			ret = -ENXIO;
		else
			*offset = min_t(loff_t, start, inode->i_size);
	}

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state, GFP_NOFS);
	return ret;
}
2012-12-18 03:59:39 +04:00
/*
 * llseek handler for btrfs files.
 *
 * The inode's i_mutex is held for the whole operation and released on every
 * exit path.
 *
 * - SEEK_END / SEEK_CUR: handed to generic_file_llseek(); its result is
 *   returned unchanged.
 * - SEEK_DATA / SEEK_HOLE: fails with -ENXIO when @offset is at or past the
 *   size reported by i_size_read(); otherwise find_desired_extent() adjusts
 *   @offset and any non-zero return from it is passed back to the caller.
 * - Any other @whence: @offset is given to vfs_setpos() as supplied.
 *
 * Returns the value produced by generic_file_llseek()/vfs_setpos(), or a
 * negative errno.
 */
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t result;
	int err;

	mutex_lock(&inode->i_mutex);

	if (whence == SEEK_END || whence == SEEK_CUR) {
		result = generic_file_llseek(file, offset, whence);
		goto unlock;
	}

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		if (offset >= i_size_read(inode)) {
			result = -ENXIO;
			goto unlock;
		}

		err = find_desired_extent(inode, &offset, whence);
		if (err) {
			result = err;
			goto unlock;
		}
	}

	/* Commit the (possibly adjusted) offset, bounded by s_maxbytes. */
	result = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);

unlock:
	mutex_unlock(&inode->i_mutex);
	return result;
}
2009-10-02 02:43:56 +04:00
const struct file_operations btrfs_file_operations = {
2011-07-18 21:21:36 +04:00
. llseek = btrfs_file_llseek ,
2007-06-12 14:35:45 +04:00
. read = do_sync_read ,
2010-06-07 07:38:51 +04:00
. write = do_sync_write ,
2007-06-15 21:50:00 +04:00
. aio_read = generic_file_aio_read ,
2007-12-14 20:56:58 +03:00
. splice_read = generic_file_splice_read ,
2010-05-23 19:07:21 +04:00
. aio_write = btrfs_file_aio_write ,
2007-06-15 21:50:00 +04:00
. mmap = btrfs_file_mmap ,
2007-06-12 14:35:45 +04:00
. open = generic_file_open ,
2008-05-27 18:55:43 +04:00
. release = btrfs_release_file ,
2007-06-12 14:35:45 +04:00
. fsync = btrfs_sync_file ,
2011-01-14 15:07:43 +03:00
. fallocate = btrfs_fallocate ,
2007-09-14 18:22:47 +04:00
. unlocked_ioctl = btrfs_ioctl ,
2007-06-12 14:35:45 +04:00
# ifdef CONFIG_COMPAT
2007-09-14 18:22:47 +04:00
. compat_ioctl = btrfs_ioctl ,
2007-06-12 14:35:45 +04:00
# endif
} ;
2012-11-26 13:24:43 +04:00
/*
 * Destroy the inode_defrag slab cache, if one is currently set.
 * Does nothing when btrfs_inode_defrag_cachep is NULL.
 */
void btrfs_auto_defrag_exit(void)
{
	if (!btrfs_inode_defrag_cachep)
		return;

	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
/*
 * Create the slab cache used for struct inode_defrag objects and store it
 * in btrfs_inode_defrag_cachep.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int btrfs_auto_defrag_init(void)
{
	struct kmem_cache *cache;

	/*
	 * NOTE(review): the cache name literal is reproduced exactly as found,
	 * including the padding spaces inside the quotes - confirm they are
	 * intended.
	 */
	cache = kmem_cache_create(" btrfs_inode_defrag ",
				  sizeof(struct inode_defrag), 0,
				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
				  NULL);

	/* Publish the result unconditionally; it stays NULL on failure. */
	btrfs_inode_defrag_cachep = cache;

	return cache ? 0 : -ENOMEM;
}