bcachefs: Add (partial) support for fixing btree topology

When we walk the btrees during recovery, part of that is checking that
btree topology is correct: for every interior btree node, its child
nodes should exactly span the range the parent node covers.

Previously, we had checks for this, but not repair code. Now that we
have the ability to do btree updates during initial GC, this patch adds
that repair code.
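
As an illustration only (a minimal sketch, not bcachefs code): the check below
uses hypothetical simplified types - plain 64-bit keys and a fixed-size child
array in place of the real bpos/bkey and btree node structures - to show the
invariant the new repair code enforces: children must tile the parent's range
with no gaps or overlaps, ending exactly at the parent's max_key.

/* Illustrative sketch only; not the bcachefs implementation. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct interior_node {
	uint64_t	min_key;	/* range this interior node covers */
	uint64_t	max_key;
	unsigned	nr_children;
	uint64_t	child_min[8];	/* min_key of each child node */
	uint64_t	child_max[8];	/* max_key of each child node */
};

/*
 * Children must exactly span [min_key, max_key]: the first child starts at
 * the parent's min_key, each child starts right after the previous one ends,
 * and the last child ends at the parent's max_key.
 */
static bool check_topology(const struct interior_node *b)
{
	uint64_t expected_start = b->min_key;
	unsigned i;

	for (i = 0; i < b->nr_children; i++) {
		if (b->child_min[i] != expected_start)
			return false;	/* gap or overlap: needs repair */
		expected_start = b->child_max[i] + 1;
	}

	return b->nr_children &&
		b->child_max[b->nr_children - 1] == b->max_key;
}

int main(void)
{
	struct interior_node b = {
		.min_key	= 0,
		.max_key	= 99,
		.nr_children	= 2,
		.child_min	= { 0, 60 },	/* gap: child 0 ends at 49 */
		.child_max	= { 49, 99 },
	};

	printf("topology %s\n", check_topology(&b) ? "ok" : "broken");
	return 0;
}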

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet authored 2021-01-26 20:59:00 -05:00, committed by Kent Overstreet
parent 5b593ee172
commit a0b73c1c53
5 changed files with 158 additions and 46 deletions

fs/bcachefs/bcachefs.h

@@ -509,7 +509,8 @@ enum {
 	BCH_FS_ERRORS_FIXED,
 
 	/* misc: */
-	BCH_FS_FIXED_GENS,
+	BCH_FS_NEED_ANOTHER_GC,
+	BCH_FS_DELETED_NODES,
 	BCH_FS_NEED_ALLOC_WRITE,
 	BCH_FS_REBUILD_REPLICAS,
 	BCH_FS_HOLD_BTREE_WRITES,

fs/bcachefs/btree_cache.c

@@ -7,6 +7,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
+#include "error.h"
 #include "trace.h"
 
 #include <linux/prefetch.h>
@@ -813,9 +814,12 @@ lock_node:
 		return ERR_PTR(-EIO);
 	}
 
-	EBUG_ON(b->c.btree_id != iter->btree_id ||
-		BTREE_NODE_LEVEL(b->data) != level ||
-		bkey_cmp(b->data->max_key, k->k.p));
+	EBUG_ON(b->c.btree_id != iter->btree_id);
+	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+	EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+		bkey_cmp(b->data->min_key,
+			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
 	return b;
 }
@@ -823,7 +827,8 @@ lock_node:
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
 					 const struct bkey_i *k,
 					 enum btree_id btree_id,
-					 unsigned level)
+					 unsigned level,
+					 bool nofill)
 {
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
@@ -838,6 +843,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
 retry:
 	b = btree_cache_find(bc, k);
 	if (unlikely(!b)) {
+		if (nofill)
+			return NULL;
+
 		b = bch2_btree_node_fill(c, NULL, k, btree_id,
 					 level, SIX_LOCK_read, true);
@@ -884,9 +892,12 @@ lock_node:
 		return ERR_PTR(-EIO);
 	}
 
-	EBUG_ON(b->c.btree_id != btree_id ||
-		BTREE_NODE_LEVEL(b->data) != level ||
-		bkey_cmp(b->data->max_key, k->k.p));
+	EBUG_ON(b->c.btree_id != btree_id);
+	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+	EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+	EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+		bkey_cmp(b->data->min_key,
+			 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
 	return b;
 }
@@ -996,8 +1007,22 @@ out:
 		if (sib != btree_prev_sib)
 			swap(n1, n2);
 
-		BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
-				n2->data->min_key));
+		if (bkey_cmp(bkey_successor(n1->key.k.p),
+			     n2->data->min_key)) {
+			char buf1[200], buf2[200];
+
+			bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
+			bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
+
+			bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
+					     "prev: %s\n"
+					     "next: %s\n",
+					     bch2_btree_ids[iter->btree_id], level,
+					     buf1, buf2);
+
+			six_unlock_intent(&ret->c.lock);
+			ret = NULL;
+		}
 	}
 
 	bch2_btree_trans_verify_locks(trans);

fs/bcachefs/btree_cache.h

@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
 				  enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
-					 enum btree_id, unsigned);
+					 enum btree_id, unsigned, bool);
 
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
 					  struct btree *, enum btree_node_sibling);

fs/bcachefs/btree_gc.c

@@ -50,6 +50,10 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
 	__gc_pos_set(c, new_pos);
 }
 
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
 static int bch2_gc_check_topology(struct bch_fs *c,
 				  struct btree *b,
 				  struct bkey_buf *prev,
@@ -62,6 +66,8 @@ static int bch2_gc_check_topology(struct bch_fs *c,
 		? node_start
 		: bkey_successor(prev->k->k.p);
 	char buf1[200], buf2[200];
+	bool update_min = false;
+	bool update_max = false;
 	int ret = 0;
 
 	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
@@ -75,22 +81,79 @@ static int bch2_gc_check_topology(struct bch_fs *c,
 		bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
 
 		if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
-				"btree node with incorrect min_key:\n prev %s\n cur %s",
+				"btree node with incorrect min_key at btree %s level %u:\n"
+				" prev %s\n"
+				" cur %s",
+				bch2_btree_ids[b->c.btree_id], b->c.level,
 				buf1,
-				(bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
-			BUG();
-		}
+				(bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+			update_min = true;
 	}
 
 	if (fsck_err_on(is_last &&
 			bkey_cmp(cur.k->k.p, node_end), c,
-			"btree node with incorrect max_key:\n %s\n expected %s",
+			"btree node with incorrect max_key at btree %s level %u:\n"
+			" %s\n"
+			" expected %s",
+			bch2_btree_ids[b->c.btree_id], b->c.level,
 			(bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-			(bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
-		BUG();
-	}
+			(bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+		update_max = true;
 
 	bch2_bkey_buf_copy(prev, c, cur.k);
+
+	if (update_min || update_max) {
+		struct bkey_i *new;
+		struct bkey_i_btree_ptr_v2 *bp = NULL;
+		struct btree *n;
+
+		if (update_max) {
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur.k->k.p);
+			if (ret)
+				return ret;
+		}
+
+		new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+
+		bkey_copy(new, cur.k);
+
+		if (new->k.type == KEY_TYPE_btree_ptr_v2)
+			bp = bkey_i_to_btree_ptr_v2(new);
+
+		if (update_min)
+			bp->v.min_key = expected_start;
+		if (update_max)
+			new->k.p = node_end;
+		if (bp)
+			SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+
+		ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+		if (ret) {
+			kfree(new);
+			return ret;
+		}
+
+		n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+					       b->c.level - 1, true);
+		if (n) {
+			mutex_lock(&c->btree_cache.lock);
+
+			bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+			bkey_copy(&n->key, new);
+			if (update_min)
+				n->data->min_key = expected_start;
+			if (update_max)
+				n->data->max_key = node_end;
+
+			ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+			BUG_ON(ret);
+			mutex_unlock(&c->btree_cache.lock);
+			six_unlock_read(&n->c.lock);
+		}
+	}
+
 fsck_err:
 	return ret;
 }
@@ -147,12 +210,13 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
 				ptr->dev, PTR_BUCKET_NR(ca, ptr),
 				bch2_data_types[ptr_data_type(k.k, ptr)],
 				ptr->gen, g->mark.gen)) {
+			/* XXX if it's a cached ptr, drop it */
 			g2->_mark.gen = g->_mark.gen = ptr->gen;
 			g2->gen_valid = g->gen_valid = true;
 			g2->_mark.data_type = 0;
 			g2->_mark.dirty_sectors = 0;
 			g2->_mark.cached_sectors = 0;
-			set_bit(BCH_FS_FIXED_GENS, &c->flags);
+			set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
 			set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
 		}
 	}
@@ -298,8 +362,6 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
 			break;
 
 		if (b->c.level) {
-			struct btree *child;
-
 			bch2_bkey_buf_reassemble(&cur, c, k);
 			k = bkey_i_to_s_c(cur.k);
 
@@ -310,26 +372,49 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
 					!bch2_btree_and_journal_iter_peek(&iter).k);
 			if (ret)
 				break;
-
-			if (b->c.level > target_depth) {
-				child = bch2_btree_node_get_noiter(c, cur.k,
-							b->c.btree_id, b->c.level - 1);
-				ret = PTR_ERR_OR_ZERO(child);
-				if (ret)
-					break;
-
-				ret = bch2_gc_btree_init_recurse(c, child,
-								 target_depth);
-				six_unlock_read(&child->c.lock);
-
-				if (ret)
-					break;
-			}
 		} else {
 			bch2_btree_and_journal_iter_advance(&iter);
 		}
 	}
 
+	if (b->c.level > target_depth) {
+		bch2_btree_and_journal_iter_exit(&iter);
+		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+			struct btree *child;
+
+			bch2_bkey_buf_reassemble(&cur, c, k);
+			bch2_btree_and_journal_iter_advance(&iter);
+
+			child = bch2_btree_node_get_noiter(c, cur.k,
+						b->c.btree_id, b->c.level - 1,
+						false);
+			ret = PTR_ERR_OR_ZERO(child);
+
+			if (fsck_err_on(ret == -EIO, c,
+					"unreadable btree node")) {
+				ret = bch2_journal_key_delete(c, b->c.btree_id,
+							      b->c.level, cur.k->k.p);
+				if (ret)
+					return ret;
+
+				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+				continue;
+			}
+
+			if (ret)
+				break;
+
+			ret = bch2_gc_btree_init_recurse(c, child,
+							 target_depth);
+			six_unlock_read(&child->c.lock);
+
+			if (ret)
+				break;
+		}
+	}
+fsck_err:
 	bch2_bkey_buf_exit(&cur, c);
 	bch2_bkey_buf_exit(&prev, c);
 	bch2_btree_and_journal_iter_exit(&iter);
@@ -816,16 +901,15 @@ again:
 	bch2_mark_allocator_buckets(c);
 
 	c->gc_count++;
-out:
-	if (!ret &&
-	    (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-	     (!iter && bch2_test_restart_gc))) {
+
+	if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+	    (!iter && bch2_test_restart_gc)) {
 		/*
 		 * XXX: make sure gens we fixed got saved
 		 */
 		if (iter++ <= 2) {
-			bch_info(c, "Fixed gens, restarting mark and sweep:");
-			clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+			bch_info(c, "Second GC pass needed, restarting:");
+			clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
 			__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
 			percpu_down_write(&c->mark_lock);
@@ -840,7 +924,7 @@ out:
 			bch_info(c, "Unable to fix bucket gens, looping");
 			ret = -EINVAL;
 		}
-
+out:
 	if (!ret) {
 		bch2_journal_block(&c->journal);

fs/bcachefs/recovery.c

@@ -342,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
 		bch2_btree_and_journal_iter_advance(&iter);
 
 		child = bch2_btree_node_get_noiter(c, tmp.k,
-					b->c.btree_id, b->c.level - 1);
+					b->c.btree_id, b->c.level - 1,
+					false);
 		ret = PTR_ERR_OR_ZERO(child);
 		if (ret)
@@ -766,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c,
 	bch2_journal_flush_all_pins(j);
 
 	return bch2_journal_error(j);
 err:
-	bch_err(c, "journal replay: error %d while replaying key", ret);
+	bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+		ret, bch2_btree_ids[i->btree_id], i->level);
 	return ret;
 }