Btrfs: add tree modification log functions
The tree mod log will log modifications made to fs-tree nodes. Most modifications are done by autobalance of the tree. Such changes are recorded as long as a block entry exists. When released, the log is cleaned. With the tree modification log, it's possible to reconstruct a consistent old state of the tree. This is required to do backref walking on a busy file system. Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
This commit is contained in:
parent
f29021b29a
commit
bd989ba359
408
fs/btrfs/ctree.c
408
fs/btrfs/ctree.c
@ -18,6 +18,7 @@
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include "ctree.h"
|
||||
#include "disk-io.h"
|
||||
#include "transaction.h"
|
||||
@ -288,6 +289,412 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Kinds of modification that can be recorded in the tree mod log.
 */
enum mod_log_op {
	/* key in a slot was overwritten (tree_mod_log_set_node_key) */
	MOD_LOG_KEY_REPLACE = 0,
	/* key was added to a slot (destination side of tree_mod_log_eb_copy) */
	MOD_LOG_KEY_ADD,
	/* key was removed from a slot (source side of tree_mod_log_eb_copy) */
	MOD_LOG_KEY_REMOVE,
	/* key went away because its whole block is dropped (tree_mod_log_free_eb) */
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	/* key is overwritten by a move (tree_mod_log_insert_move) */
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	/* a run of keys was moved within one block (tree_mod_log_insert_move) */
	MOD_LOG_MOVE_KEYS,
	/* the root node was replaced (tree_mod_log_insert_root) */
	MOD_LOG_ROOT_REPLACE,
};
|
||||
|
||||
/*
 * Payload of a MOD_LOG_MOVE_KEYS log element. The source slot of the move
 * is kept in tree_mod_elem::slot.
 */
struct tree_mod_move {
	int dst_slot;	/* first slot the keys were moved to */
	int nr_items;	/* number of keys that were moved */
};
|
||||
|
||||
struct tree_mod_root {
|
||||
u64 logical;
|
||||
u8 level;
|
||||
};
|
||||
|
||||
struct tree_mod_elem {
|
||||
struct rb_node node;
|
||||
u64 index; /* shifted logical */
|
||||
struct seq_list elem;
|
||||
enum mod_log_op op;
|
||||
|
||||
/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
|
||||
int slot;
|
||||
|
||||
/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
|
||||
u64 generation;
|
||||
|
||||
/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
|
||||
struct btrfs_disk_key key;
|
||||
u64 blockptr;
|
||||
|
||||
/* this is used for op == MOD_LOG_MOVE_KEYS */
|
||||
struct tree_mod_move move;
|
||||
|
||||
/* this is used for op == MOD_LOG_ROOT_REPLACE */
|
||||
struct tree_mod_root old_root;
|
||||
};
|
||||
|
||||
static inline void
|
||||
__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
|
||||
{
|
||||
elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
|
||||
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
|
||||
}
|
||||
|
||||
void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct seq_list *elem)
|
||||
{
|
||||
elem->flags = 1;
|
||||
spin_lock(&fs_info->tree_mod_seq_lock);
|
||||
__get_tree_mod_seq(fs_info, elem);
|
||||
spin_unlock(&fs_info->tree_mod_seq_lock);
|
||||
}
|
||||
|
||||
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
|
||||
struct seq_list *elem)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node *node;
|
||||
struct rb_node *next;
|
||||
struct seq_list *cur_elem;
|
||||
struct tree_mod_elem *tm;
|
||||
u64 min_seq = (u64)-1;
|
||||
u64 seq_putting = elem->seq;
|
||||
|
||||
if (!seq_putting)
|
||||
return;
|
||||
|
||||
BUG_ON(!(elem->flags & 1));
|
||||
spin_lock(&fs_info->tree_mod_seq_lock);
|
||||
list_del(&elem->list);
|
||||
|
||||
list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
|
||||
if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
|
||||
if (seq_putting > cur_elem->seq) {
|
||||
/*
|
||||
* blocker with lower sequence number exists, we
|
||||
* cannot remove anything from the log
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
min_seq = cur_elem->seq;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* anything that's lower than the lowest existing (read: blocked)
|
||||
* sequence number can be removed from the tree.
|
||||
*/
|
||||
write_lock(&fs_info->tree_mod_log_lock);
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
for (node = rb_first(tm_root); node; node = next) {
|
||||
next = rb_next(node);
|
||||
tm = container_of(node, struct tree_mod_elem, node);
|
||||
if (tm->elem.seq > min_seq)
|
||||
continue;
|
||||
rb_erase(node, tm_root);
|
||||
list_del(&tm->elem.list);
|
||||
kfree(tm);
|
||||
}
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
out:
|
||||
spin_unlock(&fs_info->tree_mod_seq_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* key order of the log:
|
||||
* index -> sequence
|
||||
*
|
||||
* the index is the shifted logical of the *new* root node for root replace
|
||||
* operations, or the shifted logical of the affected block for all other
|
||||
* operations.
|
||||
*/
|
||||
static noinline int
|
||||
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node **new;
|
||||
struct rb_node *parent = NULL;
|
||||
struct tree_mod_elem *cur;
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(!tm || !tm->elem.seq);
|
||||
|
||||
write_lock(&fs_info->tree_mod_log_lock);
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
new = &tm_root->rb_node;
|
||||
while (*new) {
|
||||
cur = container_of(*new, struct tree_mod_elem, node);
|
||||
parent = *new;
|
||||
if (cur->index < tm->index)
|
||||
new = &((*new)->rb_left);
|
||||
else if (cur->index > tm->index)
|
||||
new = &((*new)->rb_right);
|
||||
else if (cur->elem.seq < tm->elem.seq)
|
||||
new = &((*new)->rb_left);
|
||||
else if (cur->elem.seq > tm->elem.seq)
|
||||
new = &((*new)->rb_right);
|
||||
else {
|
||||
kfree(tm);
|
||||
ret = -EEXIST;
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
|
||||
rb_link_node(&tm->node, parent, new);
|
||||
rb_insert_color(&tm->node, tm_root);
|
||||
unlock:
|
||||
write_unlock(&fs_info->tree_mod_log_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Allocate a zeroed log element and assign it a sequence number.
 *
 * @fs_info: filesystem whose tree mod log is used
 * @flags:   allocation mask passed to kzalloc()
 * @tm_ret:  receives the new element; only valid when the return value is
 *           positive
 *
 * Returns 0 without allocating anything usable if fs_info->tree_mod_seq_list
 * is empty, i.e. nothing needs to be logged. Returns -ENOMEM if the
 * allocation fails. Otherwise returns the (positive) sequence number given
 * to the new element, which is queued on tree_mod_seq_list as a non-blocker
 * (flags == 0).
 */
int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
		   struct tree_mod_elem **tm_ret)
{
	struct tree_mod_elem *tm;
	u64 seq = 0;

	/* cheap unlocked check so the common "no logging" case stays fast */
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;

	/* allocate before taking the spinlock, @flags may allow sleeping */
	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return -ENOMEM;

	tm->elem.flags = 0;

	/*
	 * tree_mod_seq_list is modified under tree_mod_seq_lock by
	 * btrfs_get_tree_mod_seq() and btrfs_put_tree_mod_seq(), so the
	 * element must be linked under the same lock.
	 */
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		/*
		 * The list was emptied while we were allocating. Log elements
		 * are only taken off the list when a blocker is put, so
		 * without any blocker left the new element must not be added.
		 */
		kfree(tm);
		*tm_ret = NULL;
	} else {
		__get_tree_mod_seq(fs_info, &tm->elem);
		seq = tm->elem.seq;
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	return seq;
}
|
||||
|
||||
static noinline int
|
||||
tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb, int slot,
|
||||
enum mod_log_op op, gfp_t flags)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
int ret;
|
||||
|
||||
ret = tree_mod_alloc(fs_info, flags, &tm);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
tm->index = eb->start >> PAGE_CACHE_SHIFT;
|
||||
if (op != MOD_LOG_KEY_ADD) {
|
||||
btrfs_node_key(eb, &tm->key, slot);
|
||||
tm->blockptr = btrfs_node_blockptr(eb, slot);
|
||||
}
|
||||
tm->op = op;
|
||||
tm->slot = slot;
|
||||
tm->generation = btrfs_node_ptr_generation(eb, slot);
|
||||
|
||||
return __tree_mod_log_insert(fs_info, tm);
|
||||
}
|
||||
|
||||
static noinline int
|
||||
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
|
||||
int slot, enum mod_log_op op)
|
||||
{
|
||||
return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
|
||||
}
|
||||
|
||||
static noinline int
|
||||
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb, int dst_slot, int src_slot,
|
||||
int nr_items, gfp_t flags)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
ret = tree_mod_alloc(fs_info, flags, &tm);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
|
||||
ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
|
||||
MOD_LOG_KEY_REMOVE_WHILE_MOVING);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
|
||||
tm->index = eb->start >> PAGE_CACHE_SHIFT;
|
||||
tm->slot = src_slot;
|
||||
tm->move.dst_slot = dst_slot;
|
||||
tm->move.nr_items = nr_items;
|
||||
tm->op = MOD_LOG_MOVE_KEYS;
|
||||
|
||||
return __tree_mod_log_insert(fs_info, tm);
|
||||
}
|
||||
|
||||
static noinline int
|
||||
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *old_root,
|
||||
struct extent_buffer *new_root, gfp_t flags)
|
||||
{
|
||||
struct tree_mod_elem *tm;
|
||||
int ret;
|
||||
|
||||
ret = tree_mod_alloc(fs_info, flags, &tm);
|
||||
if (ret <= 0)
|
||||
return ret;
|
||||
|
||||
tm->index = new_root->start >> PAGE_CACHE_SHIFT;
|
||||
tm->old_root.logical = old_root->start;
|
||||
tm->old_root.level = btrfs_header_level(old_root);
|
||||
tm->generation = btrfs_header_generation(old_root);
|
||||
tm->op = MOD_LOG_ROOT_REPLACE;
|
||||
|
||||
return __tree_mod_log_insert(fs_info, tm);
|
||||
}
|
||||
|
||||
static struct tree_mod_elem *
|
||||
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
|
||||
int smallest)
|
||||
{
|
||||
struct rb_root *tm_root;
|
||||
struct rb_node *node;
|
||||
struct tree_mod_elem *cur = NULL;
|
||||
struct tree_mod_elem *found = NULL;
|
||||
u64 index = start >> PAGE_CACHE_SHIFT;
|
||||
|
||||
read_lock(&fs_info->tree_mod_log_lock);
|
||||
tm_root = &fs_info->tree_mod_log;
|
||||
node = tm_root->rb_node;
|
||||
while (node) {
|
||||
cur = container_of(node, struct tree_mod_elem, node);
|
||||
if (cur->index < index) {
|
||||
node = node->rb_left;
|
||||
} else if (cur->index > index) {
|
||||
node = node->rb_right;
|
||||
} else if (cur->elem.seq < min_seq) {
|
||||
node = node->rb_left;
|
||||
} else if (!smallest) {
|
||||
/* we want the node with the highest seq */
|
||||
if (found)
|
||||
BUG_ON(found->elem.seq > cur->elem.seq);
|
||||
found = cur;
|
||||
node = node->rb_left;
|
||||
} else if (cur->elem.seq > min_seq) {
|
||||
/* we want the node with the smallest seq */
|
||||
if (found)
|
||||
BUG_ON(found->elem.seq < cur->elem.seq);
|
||||
found = cur;
|
||||
node = node->rb_right;
|
||||
} else {
|
||||
found = cur;
|
||||
break;
|
||||
}
|
||||
}
|
||||
read_unlock(&fs_info->tree_mod_log_lock);
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* this returns the element from the log with the smallest time sequence
|
||||
* value that's in the log (the oldest log item). any element with a time
|
||||
* sequence lower than min_seq will be ignored.
|
||||
*/
|
||||
static struct tree_mod_elem *
|
||||
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
|
||||
u64 min_seq)
|
||||
{
|
||||
return __tree_mod_log_search(fs_info, start, min_seq, 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* this returns the element from the log with the largest time sequence
|
||||
* value that's in the log (the most recent log item). any element with
|
||||
* a time sequence lower than min_seq will be ignored.
|
||||
*/
|
||||
static struct tree_mod_elem *
|
||||
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
|
||||
{
|
||||
return __tree_mod_log_search(fs_info, start, min_seq, 0);
|
||||
}
|
||||
|
||||
static inline void
|
||||
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
|
||||
struct extent_buffer *src, unsigned long dst_offset,
|
||||
unsigned long src_offset, int nr_items)
|
||||
{
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
smp_mb();
|
||||
if (list_empty(&fs_info->tree_mod_seq_list))
|
||||
return;
|
||||
|
||||
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
|
||||
return;
|
||||
|
||||
/* speed this up by single seq for all operations? */
|
||||
for (i = 0; i < nr_items; i++) {
|
||||
ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
|
||||
MOD_LOG_KEY_REMOVE);
|
||||
BUG_ON(ret < 0);
|
||||
ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
|
||||
MOD_LOG_KEY_ADD);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
|
||||
int dst_offset, int src_offset, int nr_items)
|
||||
{
|
||||
int ret;
|
||||
ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
|
||||
nr_items, GFP_NOFS);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
|
||||
static inline void
|
||||
tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb,
|
||||
struct btrfs_disk_key *disk_key, int slot, int atomic)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
|
||||
MOD_LOG_KEY_REPLACE,
|
||||
atomic ? GFP_ATOMIC : GFP_NOFS);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
|
||||
static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
|
||||
struct extent_buffer *eb)
|
||||
{
|
||||
int i;
|
||||
int ret;
|
||||
u32 nritems;
|
||||
|
||||
smp_mb();
|
||||
if (list_empty(&fs_info->tree_mod_seq_list))
|
||||
return;
|
||||
|
||||
if (btrfs_header_level(eb) == 0)
|
||||
return;
|
||||
|
||||
nritems = btrfs_header_nritems(eb);
|
||||
for (i = nritems - 1; i >= 0; i--) {
|
||||
ret = tree_mod_log_insert_key(fs_info, eb, i,
|
||||
MOD_LOG_KEY_REMOVE_WHILE_FREEING);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
tree_mod_log_set_root_pointer(struct btrfs_root *root,
|
||||
struct extent_buffer *new_root_node)
|
||||
{
|
||||
int ret;
|
||||
tree_mod_log_free_eb(root->fs_info, root->node);
|
||||
ret = tree_mod_log_insert_root(root->fs_info, root->node,
|
||||
new_root_node, GFP_NOFS);
|
||||
BUG_ON(ret < 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the tree block can be shared by multiple trees
|
||||
*/
|
||||
@ -2271,7 +2678,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
|
||||
(unsigned long)btrfs_header_chunk_tree_uuid(split),
|
||||
BTRFS_UUID_SIZE);
|
||||
|
||||
|
||||
copy_extent_buffer(split, c,
|
||||
btrfs_node_key_ptr_offset(0),
|
||||
btrfs_node_key_ptr_offset(mid),
|
||||
|
@ -3114,4 +3114,9 @@ struct seq_list {
|
||||
u32 flags;
|
||||
};
|
||||
|
||||
/*
 * Register @elem as a tree mod log blocker: it is given a new sequence
 * number and queued on fs_info->tree_mod_seq_list.
 */
void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem);

/*
 * Drop the blocker @elem again and free the log elements that no remaining
 * blocker needs. A no-op if @elem has no sequence number.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem);
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user