2007-06-12 09:07:21 -04:00
/*
* Copyright ( C ) 2007 Oracle . All rights reserved .
*
* This program is free software ; you can redistribute it and / or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation .
*
* This program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the GNU
* General Public License for more details .
*
* You should have received a copy of the GNU General Public
* License along with this program ; if not , write to the
* Free Software Foundation , Inc . , 59 Temple Place - Suite 330 ,
* Boston , MA 02111 - 1307 , USA .
*/
2008-01-08 15:46:30 -05:00
# ifndef __BTRFS_TRANSACTION__
# define __BTRFS_TRANSACTION__
2007-04-30 15:25:45 -04:00
# include "btrfs_inode.h"
2009-03-13 10:10:06 -04:00
# include "delayed-ref.h"
2012-06-28 18:03:02 +02:00
# include "ctree.h"
2007-03-16 16:20:31 -04:00
2007-03-22 15:59:16 -04:00
/*
 * State kept for one transaction , identified by transid .
 *
 * NOTE(review): unless a comment below says otherwise , the role of each
 * member is inferred from its name and type only -- confirm against the
 * code that uses this structure .
 */
struct btrfs_transaction {
u64 transid ;
2009-03-12 20:12:45 -04:00
/*
* total writers in this transaction , it must be zero before the
* transaction can end
*/
2011-04-11 15:45:29 -04:00
atomic_t num_writers ;
2011-04-11 17:25:13 -04:00
/* NOTE(review): presumably a reference count dropped through put_transaction ( ) -- confirm */
atomic_t use_count ;
2009-03-12 20:12:45 -04:00
2007-08-10 16:22:09 -04:00
unsigned long num_joined ;
2011-04-11 17:25:13 -04:00
/* NOTE(review): presumably guards the commit state flags below -- confirm */
spinlock_t commit_lock ;
2007-03-22 15:59:16 -04:00
int in_commit ;
int commit_done ;
2008-07-17 12:54:14 -04:00
int blocked ;
2007-04-19 21:01:03 -04:00
struct list_head list ;
2008-01-24 16:13:08 -05:00
/* same type as the dirty_pages argument of the btrfs_*_marked_extents ( ) helpers declared in this header */
struct extent_io_tree dirty_pages ;
2007-06-08 15:33:54 -04:00
unsigned long start_time ;
2007-03-22 15:59:16 -04:00
wait_queue_head_t writer_wait ;
wait_queue_head_t commit_wait ;
2008-01-08 15:46:30 -05:00
/* NOTE(review): presumably links struct btrfs_pending_snapshot entries through their list member -- confirm */
struct list_head pending_snapshots ;
2009-03-13 10:10:06 -04:00
struct btrfs_delayed_ref_root delayed_refs ;
2012-03-01 17:24:58 +01:00
/* NOTE(review): presumably non-zero once the transaction has been aborted ; exact encoding not visible here */
int aborted ;
2007-03-22 15:59:16 -04:00
} ;
2012-09-20 01:51:59 -06:00
/*
 * Kinds of transaction handle ; a value of this type is stored in the
 * type member of struct btrfs_trans_handle .
 *
 * NOTE(review): each value presumably corresponds to the similarly named
 * entry point declared in this header ( btrfs_start_transaction ,
 * btrfs_join_transaction , btrfs_start_ioctl_transaction ,
 * btrfs_join_transaction_nolock , btrfs_attach_transaction ) -- confirm
 * against the implementation .
 *
 * TRANS_ATTACH : according to the changelog text embedded in this block ,
 * attaching catches the current transaction without starting a new one and
 * without increasing the writer counter , and ENOENT is returned when there
 * is no transaction .
 */
enum btrfs_trans_type {
TRANS_START ,
TRANS_JOIN ,
TRANS_USERSPACE ,
TRANS_JOIN_NOLOCK ,
Btrfs: fix orphan transaction on the freezed filesystem
With the following debug patch:
static int btrfs_freeze(struct super_block *sb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_transaction *trans;
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans) {
+ printk("Transid %llu, use_count %d, num_writer %d\n",
+ trans->transid, atomic_read(&trans->use_count),
+ atomic_read(&trans->num_writers));
+ }
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
I found there was a orphan transaction after the freeze operation was done.
It is because the transaction may not be committed when the transaction handle
end even though it is the last handle of the current transaction. This design
avoid committing the transaction frequently, but also introduce the above
problem.
So I add btrfs_attach_transaction() which can catch the current transaction
and commit it. If there is no transaction, it will return ENOENT, and do not
anything.
This function also can be used to instead of btrfs_join_transaction_freeze()
because it don't increase the writer counter and don't start a new transaction,
so it also can fix the deadlock between sync and freeze.
Besides that, it is used to instead of btrfs_join_transaction() in
transaction_kthread(), because if there is no transaction, the transaction
kthread needn't anything.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2012-09-20 01:54:00 -06:00
TRANS_ATTACH ,
2012-09-20 01:51:59 -06:00
} ;
2007-03-16 16:20:31 -04:00
/*
 * Handle onto a transaction . The transaction member points at the
 * struct btrfs_transaction this handle refers to , and type records which
 * enum btrfs_trans_type the handle is .
 *
 * NOTE(review): the accounting members ( bytes_reserved , qgroup_reserved ,
 * blocks_reserved , blocks_used , delayed_ref_updates ) are documented by
 * name only ; their units and update rules are not visible in this header .
 */
struct btrfs_trans_handle {
u64 transid ;
2010-05-16 10:46:25 -04:00
u64 bytes_reserved ;
2011-09-14 15:44:05 +02:00
u64 qgroup_reserved ;
2011-04-13 15:15:59 -04:00
/* NOTE(review): plain counter here , unlike the atomic_t use_count of struct btrfs_transaction -- presumably per-handle nesting ; confirm */
unsigned long use_count ;
2007-03-16 16:20:31 -04:00
unsigned long blocks_reserved ;
unsigned long blocks_used ;
2009-03-13 10:10:06 -04:00
unsigned long delayed_ref_updates ;
2010-05-16 10:46:25 -04:00
/* transaction this handle refers to ; read by btrfs_set_inode_last_trans ( ) */
struct btrfs_transaction * transaction ;
struct btrfs_block_rsv * block_rsv ;
2011-04-13 15:15:59 -04:00
/* NOTE(review): presumably the block_rsv value saved for later restore -- confirm */
struct btrfs_block_rsv * orig_rsv ;
2012-09-20 01:51:59 -06:00
short aborted ;
short adding_csums ;
/* one of enum btrfs_trans_type */
enum btrfs_trans_type type ;
2011-09-13 11:40:09 +02:00
/*
* this root is only needed to validate that the root passed to
* start_transaction is the same as the one passed to end_transaction .
* Subvolume quota depends on this
*/
struct btrfs_root * root ;
2012-06-28 18:03:02 +02:00
struct seq_list delayed_ref_elem ;
struct list_head qgroup_ref_list ;
2012-09-11 16:57:25 -04:00
/* NOTE(review): presumably block groups created under this handle -- confirm */
struct list_head new_bgs ;
2007-03-16 16:20:31 -04:00
} ;
2008-01-08 15:46:30 -05:00
/*
 * Description of one requested snapshot .
 *
 * NOTE(review): presumably queued on the pending_snapshots list of
 * struct btrfs_transaction through the list member -- confirm .
 */
struct btrfs_pending_snapshot {
2008-11-17 21:02:50 -05:00
struct dentry * dentry ;
2008-01-08 15:46:30 -05:00
/* NOTE(review): presumably the root being snapshotted -- confirm */
struct btrfs_root * root ;
2010-05-16 10:48:46 -04:00
/* NOTE(review): presumably the root of the resulting snapshot -- confirm */
struct btrfs_root * snap ;
2011-09-14 15:58:21 +02:00
struct btrfs_qgroup_inherit * inherit ;
2010-05-16 10:48:46 -04:00
/* block reservation for the operation */
struct btrfs_block_rsv block_rsv ;
/* extra metadata reservation for relocation */
/* NOTE(review): no reservation member follows the comment above , so it may be stale -- confirm */
int error ;
2010-12-20 16:04:08 +08:00
bool readonly ;
2008-01-08 15:46:30 -05:00
struct list_head list ;
} ;
2007-08-10 16:22:09 -04:00
/*
 * Stamp an inode with the transaction and log state current for a handle .
 *
 * trans : handle ; the transid is read from the transaction it points at .
 * inode : inode to update ; its btrfs fields are accessed via BTRFS_I ( ) .
 *
 * Three fields of BTRFS_I ( inode ) are written :
 *   last_trans      = transid of the transaction behind trans
 *   last_sub_trans  = log_transid of the root of the inode
 *   last_log_commit = last_log_commit of the root of the inode
 * Nothing is returned and no locking is done in this helper itself .
 */
static inline void btrfs_set_inode_last_trans ( struct btrfs_trans_handle * trans ,
struct inode * inode )
{
BTRFS_I ( inode ) - > last_trans = trans - > transaction - > transid ;
2009-10-13 13:21:08 -04:00
BTRFS_I ( inode ) - > last_sub_trans = BTRFS_I ( inode ) - > root - > log_transid ;
2012-08-29 01:07:55 -06:00
BTRFS_I ( inode ) - > last_log_commit = BTRFS_I ( inode ) - > root - > last_log_commit ;
2007-08-10 16:22:09 -04:00
}
2007-03-22 15:59:16 -04:00
int btrfs_end_transaction ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
struct btrfs_trans_handle * btrfs_start_transaction ( struct btrfs_root * root ,
2010-05-16 10:48:46 -04:00
int num_items ) ;
Btrfs: fix corrupted metadata in the snapshot
When we delete an inode, we will remove all the delayed items including delayed
inode update, and then truncate all the related metadata. If there is lots of
metadata, we will end the current transaction, and start a new transaction to
truncate the remaining metadata. In this way, we will leave an inode item whose
link counter is > 0, and also may leave some directory index items in fs/file tree
after the current transaction ends. In other words, the metadata in this fs/file tree
is inconsistent. If we create a snapshot for this tree now, we will find an inode with
corrupted metadata in the new snapshot, and we won't continue to drop the remaining metadata,
because its link counter is not 0.
We fix this problem by updating the inode item before the current transaction ends.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2012-09-07 01:43:32 -06:00
struct btrfs_trans_handle * btrfs_start_transaction_noflush (
struct btrfs_root * root , int num_items ) ;
2011-04-13 12:54:33 -04:00
struct btrfs_trans_handle * btrfs_join_transaction ( struct btrfs_root * root ) ;
struct btrfs_trans_handle * btrfs_join_transaction_nolock ( struct btrfs_root * root ) ;
Btrfs: fix orphan transaction on the freezed filesystem
With the following debug patch:
static int btrfs_freeze(struct super_block *sb)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_transaction *trans;
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans) {
+ printk("Transid %llu, use_count %d, num_writer %d\n",
+ trans->transid, atomic_read(&trans->use_count),
+ atomic_read(&trans->num_writers));
+ }
+ spin_unlock(&fs_info->trans_lock);
return 0;
}
I found there was an orphan transaction after the freeze operation was done.
It is because the transaction may not be committed when the transaction handle
ends even though it is the last handle of the current transaction. This design
avoids committing the transaction frequently, but also introduces the above
problem.
So I add btrfs_attach_transaction() which can catch the current transaction
and commit it. If there is no transaction, it will return ENOENT, and do
nothing.
This function also can be used instead of btrfs_join_transaction_freeze()
because it doesn't increase the writer counter and doesn't start a new transaction,
so it also can fix the deadlock between sync and freeze.
Besides that, it is used instead of btrfs_join_transaction() in
transaction_kthread(), because if there is no transaction, the transaction
kthread needn't do anything.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
2012-09-20 01:54:00 -06:00
/*
 * NOTE(review): according to the changelog text preceding this declaration ,
 * this catches the current transaction without starting a new one and
 * without increasing the writer counter , and reports ENOENT when there is
 * no transaction . How that error is encoded in the returned pointer is not
 * visible in this header -- confirm against the implementation .
 */
struct btrfs_trans_handle * btrfs_attach_transaction ( struct btrfs_root * root ) ;
2011-04-13 12:54:33 -04:00
struct btrfs_trans_handle * btrfs_start_ioctl_transaction ( struct btrfs_root * root ) ;
Btrfs: add START_SYNC, WAIT_SYNC ioctls
START_SYNC will start a sync/commit, but not wait for it to
complete. Any modification started after the ioctl returns is
guaranteed not to be included in the commit. If a non-NULL
pointer is passed, the transaction id will be returned to
userspace.
WAIT_SYNC will wait for any in-progress commit to complete. If a
transaction id is specified, the ioctl will block and then
return (success) when the specified transaction has committed.
If it has already committed when we call the ioctl, it returns
immediately. If the specified transaction doesn't exist, it
returns EINVAL.
If no transaction id is specified, WAIT_SYNC will wait for the
currently committing transaction to finish its commit to disk.
If there is no currently committing transaction, it returns
success.
These ioctls are useful for applications which want to impose an
ordering on when fs modifications reach disk, but do not want to
wait for the full (slow) commit process to do so.
Picky callers can take the transid returned by START_SYNC and
feed it to WAIT_SYNC, and be certain to wait only as long as
necessary for the transaction _they_ started to reach disk.
Sloppy callers can START_SYNC and WAIT_SYNC without a transid,
and provided they didn't wait too long between the calls, they
will get the same result. However, if a second commit starts
before they call WAIT_SYNC, they may end up waiting longer for
it to commit as well. Even so, a START_SYNC+WAIT_SYNC still
guarantees that any operation completed before the START_SYNC
reaches disk.
Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-29 15:41:32 -04:00
int btrfs_wait_for_commit ( struct btrfs_root * root , u64 transid ) ;
2007-03-22 15:59:16 -04:00
int btrfs_write_and_wait_transaction ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2007-06-08 15:33:54 -04:00
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
int btrfs_add_dead_root ( struct btrfs_root * root ) ;
2007-08-10 14:06:19 -04:00
int btrfs_defrag_root ( struct btrfs_root * root , int cacheonly ) ;
int btrfs_clean_old_snapshots ( struct btrfs_root * root ) ;
2007-10-15 16:14:19 -04:00
int btrfs_commit_transaction ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2010-10-29 15:37:34 -04:00
int btrfs_commit_transaction_async ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ,
int wait_for_unblock ) ;
2008-06-25 16:01:31 -04:00
int btrfs_end_transaction_throttle ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
btrfs: implement delayed inode items operation
Changelog V5 -> V6:
- Fix oom when the memory load is high, by storing the delayed nodes into the
root's radix tree, and letting btrfs inodes go.
Changelog V4 -> V5:
- Fix the race on adding the delayed node to the inode, which is spotted by
Chris Mason.
- Merge Chris Mason's incremental patch into this patch.
- Fix deadlock between readdir() and memory fault, which is reported by
Itaru Kitayama.
Changelog V3 -> V4:
- Fix nested lock, which is reported by Itaru Kitayama, by updating space cache
inode in time.
Changelog V2 -> V3:
- Fix the race between the delayed worker and the task which does delayed items
balance, which is reported by Tsutomu Itoh.
- Modify the patch address David Sterba's comment.
- Fix the bug of the cpu recursion spinlock, reported by Chris Mason
Changelog V1 -> V2:
- break up the global rb-tree, use a list to manage the delayed nodes,
which is created for every directory and file, and used to manage the
delayed directory name index items and the delayed inode item.
- introduce a worker to deal with the delayed nodes.
Compared with Ext3/4, the performance of file creation and deletion on btrfs
is very poor. The reason is that btrfs must do a lot of b+ tree insertions,
such as inode item, directory name item, directory name index and so on.
If we can do some delayed b+ tree insertion or deletion, we can improve the
performance, so we made this patch which implements delayed directory name
index insertion/deletion and delayed inode update.
Implementation:
- introduce a delayed root object into the filesystem, that use two lists to
manage the delayed nodes which are created for every file/directory.
One is used to manage all the delayed nodes that have delayed items. And the
other is used to manage the delayed nodes which is waiting to be dealt with
by the work thread.
- Every delayed node has two rb-tree, one is used to manage the directory name
index which is going to be inserted into b+ tree, and the other is used to
manage the directory name index which is going to be deleted from b+ tree.
- introduce a worker to deal with the delayed operation. This worker is used
to deal with the works of the delayed directory name index items insertion
and deletion and the delayed inode update.
When the delayed items is beyond the lower limit, we create works for some
delayed nodes and insert them into the work queue of the worker, and then
go back.
When the delayed items is beyond the upper bound, we create works for all
the delayed nodes that haven't been dealt with, and insert them into the work
queue of the worker, and then wait for that the untreated items is below some
threshold value.
- When we want to insert a directory name index into b+ tree, we just add the
information into the delayed inserting rb-tree.
And then we check the number of the delayed items and do delayed items
balance. (The balance policy is above.)
- When we want to delete a directory name index from the b+ tree, we search it
in the inserting rb-tree at first. If we look it up, just drop it. If not,
add the key of it into the delayed deleting rb-tree.
Similar to the delayed inserting rb-tree, we also check the number of the
delayed items and do delayed items balance.
(The same to inserting manipulation)
- When we want to update the metadata of some inode, we cached the data of the
inode into the delayed node. the worker will flush it into the b+ tree after
dealing with the delayed insertion and deletion.
- We will move the delayed node to the tail of the list after we access the
delayed node, By this way, we can cache more delayed items and merge more
inode updates.
- If we want to commit transaction, we will deal with all the delayed node.
- the delayed node will be freed when we free the btrfs inode.
- Before we log the inode items, we commit all the directory name index items
and the delayed inode update.
I did a quick test by the benchmark tool[1] and found we can improve the
performance of file creation by ~15%, and file deletion by ~20%.
Before applying this patch:
Create files:
Total files: 50000
Total time: 1.096108
Average time: 0.000022
Delete files:
Total files: 50000
Total time: 1.510403
Average time: 0.000030
After applying this patch:
Create files:
Total files: 50000
Total time: 0.932899
Average time: 0.000019
Delete files:
Total files: 50000
Total time: 1.215732
Average time: 0.000024
[1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3
Many thanks for Kitayama-san's help!
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Reviewed-by: David Sterba <dave@jikos.cz>
Tested-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Tested-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2011-04-22 18:12:22 +08:00
int btrfs_end_transaction_dmeta ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2010-05-16 10:49:58 -04:00
int btrfs_should_end_transaction ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2008-07-29 16:15:18 -04:00
void btrfs_throttle ( struct btrfs_root * root ) ;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 10:45:14 -04:00
int btrfs_record_root_in_trans ( struct btrfs_trans_handle * trans ,
struct btrfs_root * root ) ;
2008-09-11 16:17:57 -04:00
int btrfs_write_and_wait_marked_extents ( struct btrfs_root * root ,
2009-11-12 09:33:26 +00:00
struct extent_io_tree * dirty_pages , int mark ) ;
2009-10-13 13:29:19 -04:00
int btrfs_write_marked_extents ( struct btrfs_root * root ,
2009-11-12 09:33:26 +00:00
struct extent_io_tree * dirty_pages , int mark ) ;
2009-10-13 13:29:19 -04:00
int btrfs_wait_marked_extents ( struct btrfs_root * root ,
2009-11-12 09:33:26 +00:00
struct extent_io_tree * dirty_pages , int mark ) ;
2010-05-16 10:49:58 -04:00
int btrfs_transaction_blocked ( struct btrfs_fs_info * info ) ;
2009-07-30 10:04:48 -04:00
int btrfs_transaction_in_commit ( struct btrfs_fs_info * info ) ;
2012-03-01 17:24:58 +01:00
void put_transaction ( struct btrfs_transaction * transaction ) ;
2007-03-16 16:20:31 -04:00
# endif