From 5222a4607cd8b9d8882e81796917c10193d10be0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sat, 25 Dec 2021 20:07:00 -0500
Subject: [PATCH] bcachefs: BTREE_ITER_WITH_JOURNAL

This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is
automatically enabled when initializing a btree iterator before journal
replay has completed - it overlays the contents of the journal with the
btree.

This lets us delete bch2_btree_and_journal_walk() and use the normal
btree iterator interface instead, which in turn lets us delete a
significant amount of duplicated code.

Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch -
we're redoing the binary search over keys in the journal every time we
call bch2_btree_iter_peek().

Signed-off-by: Kent Overstreet
---
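At the heart of the new flag is a single merge rule, pulled out of
btree_trans_peek_journal() below as a standalone sketch - overlay_peek()
is illustrative only, not a function this patch adds. Given the next key
in the btree node and the next unreplayed journal key, peek returns
whichever sorts first; the journal key wins a tie, because an unreplayed
journal key is newer than the btree's copy of that position:

static struct bkey_s_c overlay_peek(struct bkey_s_c btree_k,
				    struct bkey_i *journal_k,
				    struct bpos node_end)
{
	/* no btree key left in this node: compare against the node's end */
	if (journal_k &&
	    bpos_cmp(journal_k->k.p,
		     btree_k.k ? btree_k.k->p : node_end) <= 0)
		return bkey_i_to_s_c(journal_k);

	return btree_k;
}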
 fs/bcachefs/alloc_background.c      |  62 ++++-----
 fs/bcachefs/bcachefs.h              |   1 -
 fs/bcachefs/btree_gc.c              | 197 +++++++---------------------
 fs/bcachefs/btree_iter.c            | 194 +++++++++++++++++++++++----
 fs/bcachefs/btree_types.h           |  10 +-
 fs/bcachefs/btree_update_interior.c |   4 +
 fs/bcachefs/btree_update_leaf.c     |   2 +-
 fs/bcachefs/ec.c                    |  72 +++++-----
 fs/bcachefs/recovery.c              | 158 +++++++---------------
 fs/bcachefs/recovery.h              |  10 +-
 10 files changed, 344 insertions(+), 366 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 30bf363d2ff3..cb4b059e796c 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 #undef  x
 }
 
-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct bch_dev *ca;
-	struct bucket *g;
-	struct bkey_alloc_unpacked u;
-
-	if (!bkey_is_alloc(k.k))
-		return 0;
-
-	ca = bch_dev_bkey_exists(c, k.k->p.inode);
-	g = bucket(ca, k.k->p.offset);
-	u = bch2_alloc_unpack(k);
-
-	*bucket_gen(ca, k.k->p.offset) = u.gen;
-	g->_mark.gen		= u.gen;
-	g->_mark.data_type	= u.data_type;
-	g->_mark.dirty_sectors	= u.dirty_sectors;
-	g->_mark.cached_sectors	= u.cached_sectors;
-	g->_mark.stripe		= u.stripe != 0;
-	g->stripe		= u.stripe;
-	g->stripe_redundancy	= u.stripe_redundancy;
-	g->io_time[READ]	= u.read_time;
-	g->io_time[WRITE]	= u.write_time;
-	g->oldest_gen		= u.oldest_gen;
-	g->gen_valid		= 1;
-
-	return 0;
-}
-
 int bch2_alloc_read(struct bch_fs *c)
 {
 	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	struct bucket *g;
+	struct bkey_alloc_unpacked u;
 	int ret;
 
 	bch2_trans_init(&trans, c, 0, 0);
 	down_read(&c->gc_lock);
-	ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (!bkey_is_alloc(k.k))
+			continue;
+
+		ca = bch_dev_bkey_exists(c, k.k->p.inode);
+		g = bucket(ca, k.k->p.offset);
+		u = bch2_alloc_unpack(k);
+
+		*bucket_gen(ca, k.k->p.offset) = u.gen;
+		g->_mark.gen		= u.gen;
+		g->_mark.data_type	= u.data_type;
+		g->_mark.dirty_sectors	= u.dirty_sectors;
+		g->_mark.cached_sectors	= u.cached_sectors;
+		g->_mark.stripe		= u.stripe != 0;
+		g->stripe		= u.stripe;
+		g->stripe_redundancy	= u.stripe_redundancy;
+		g->io_time[READ]	= u.read_time;
+		g->io_time[WRITE]	= u.write_time;
+		g->oldest_gen		= u.oldest_gen;
+		g->gen_valid		= 1;
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
 	up_read(&c->gc_lock);
 	bch2_trans_exit(&trans);
+
 	if (ret) {
 		bch_err(c, "error reading alloc info: %i", ret);
 		return ret;
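The same conversion repeats in btree_gc.c and ec.c below: the read-side
callback formerly passed to bch2_btree_and_journal_walk() becomes the
body of an ordinary for_each_btree_key() loop. A minimal sketch of the
resulting shape, using only APIs visible in this patch - walk_one_btree()
itself is a made-up name:

static int walk_one_btree(struct bch_fs *c, enum btree_id id)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);

	for_each_btree_key(&trans, iter, id, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		/*
		 * Before journal replay completes, k transparently
		 * includes unreplayed journal keys: the iterator was
		 * initialized with BTREE_ITER_WITH_JOURNAL set.
		 */
	}
	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);

	return ret;
}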
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 9452b6cf04a5..431cf25b38db 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -860,7 +860,6 @@ struct bch_fs {
 	mempool_t		bio_bounce_pages;
 	u64			reflink_hint;
 	reflink_gc_table	reflink_gc_table;
 	size_t			reflink_gc_nr;
-	size_t			reflink_gc_idx;
 
 	/* VFS IO PATH - fs-io.c */
 	struct bio_set		writepage_bioset;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 77c30157792b..d7de00af81c9 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c,
 	return 0;
 }
 
-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
-					   struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct reflink_gc *r;
-	const __le64 *refcount = bkey_refcount_c(k);
-	char buf[200];
-	int ret = 0;
-
-	if (!refcount)
-		return 0;
-
-	r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
-	if (!r)
-		return -ENOMEM;
-
-	if (!r ||
-	    r->offset != k.k->p.offset ||
-	    r->size != k.k->size) {
-		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-		return -EINVAL;
-	}
-
-	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
-			"reflink key has wrong refcount:\n"
-			"  %s\n"
-			"  should be %u",
-			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-			r->refcount)) {
-		struct bkey_i *new;
-
-		new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-		if (!new) {
-			ret = -ENOMEM;
-			goto fsck_err;
-		}
-
-		bkey_reassemble(new, k);
-
-		if (!r->refcount) {
-			new->k.type = KEY_TYPE_deleted;
-			new->k.size = 0;
-		} else {
-			*bkey_refcount(new) = cpu_to_le64(r->refcount);
-		}
-
-		ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
-		kfree(new);
-	}
-fsck_err:
-	return ret;
-}
-
 static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 				bool metadata_only)
 {
@@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	if (initial) {
-		c->reflink_gc_idx = 0;
-
-		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
-				bch2_gc_reflink_done_initial_fn);
-		goto out;
-	}
-
 	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		const __le64 *refcount = bkey_refcount_c(k);
@@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 		if (!refcount)
 			continue;
 
-		r = genradix_ptr(&c->reflink_gc_table, idx);
+		r = genradix_ptr(&c->reflink_gc_table, idx++);
 		if (!r ||
 		    r->offset != k.k->p.offset ||
 		    r->size != k.k->size) {
@@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 			else
 				*bkey_refcount(new) = cpu_to_le64(r->refcount);
 
-			ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+			ret = initial
+				? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new)
+				: __bch2_trans_do(&trans, NULL, NULL, 0,
 					__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
 			kfree(new);
 
@@ -1466,64 +1407,21 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 	}
 fsck_err:
 	bch2_trans_iter_exit(&trans, &iter);
-out:
 	c->reflink_gc_nr = 0;
 	bch2_trans_exit(&trans);
 	return ret;
 }
 
-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
-					   struct bkey_s_c k)
-{
-	struct bch_fs *c = trans->c;
-	struct gc_stripe *m;
-	const struct bch_stripe *s;
-	char buf[200];
-	unsigned i;
-	int ret = 0;
-
-	if (k.k->type != KEY_TYPE_stripe)
-		return 0;
-
-	s = bkey_s_c_to_stripe(k).v;
-
-	m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
-
-	for (i = 0; i < s->nr_blocks; i++)
-		if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
-			goto inconsistent;
-	return 0;
-inconsistent:
-	if (fsck_err_on(true, c,
-			"stripe has wrong block sector count %u:\n"
-			"  %s\n"
-			"  should be %u", i,
-			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-			m ? m->block_sectors[i] : 0)) {
-		struct bkey_i_stripe *new;
-
-		new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-		if (!new) {
-			ret = -ENOMEM;
-			goto fsck_err;
-		}
-
-		bkey_reassemble(&new->k_i, k);
-
-		for (i = 0; i < new->v.nr_blocks; i++)
-			stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
-
-		ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
-		kfree(new);
-	}
-fsck_err:
-	return ret;
-}
-
 static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
 				bool metadata_only)
 {
 	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct gc_stripe *m;
+	const struct bch_stripe *s;
+	char buf[200];
+	unsigned i;
 	int ret = 0;
 
 	if (metadata_only)
@@ -1531,39 +1429,52 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	if (initial) {
-		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
-				bch2_gc_stripes_done_initial_fn);
-	} else {
-		BUG();
+	for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (k.k->type != KEY_TYPE_stripe)
+			continue;
+
+		s = bkey_s_c_to_stripe(k).v;
+		m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+		for (i = 0; i < s->nr_blocks; i++)
+			if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+				goto inconsistent;
+		continue;
+inconsistent:
+		if (fsck_err_on(true, c,
+				"stripe has wrong block sector count %u:\n"
+				"  %s\n"
+				"  should be %u", i,
+				(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+				m ? m->block_sectors[i] : 0)) {
+			struct bkey_i_stripe *new;
+
+			new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+			if (!new) {
+				ret = -ENOMEM;
+				break;
+			}
+
+			bkey_reassemble(&new->k_i, k);
+
+			for (i = 0; i < new->v.nr_blocks; i++)
+				stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+			ret = initial
+				? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+				: __bch2_trans_do(&trans, NULL, NULL, 0,
+					__bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i));
+			kfree(new);
+		}
 	}
+fsck_err:
+	bch2_trans_iter_exit(&trans, &iter);
 
 	bch2_trans_exit(&trans);
 	return ret;
 }
 
-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
-					    struct bkey_s_c k)
-{
-
-	struct bch_fs *c = trans->c;
-	struct reflink_gc *r;
-	const __le64 *refcount = bkey_refcount_c(k);
-
-	if (!refcount)
-		return 0;
-
-	r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
-			       GFP_KERNEL);
-	if (!r)
-		return -ENOMEM;
-
-	r->offset	= k.k->p.offset;
-	r->size		= k.k->size;
-	r->refcount	= 0;
-	return 0;
-}
-
 static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
 				 bool metadata_only)
 {
@@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
 	bch2_trans_init(&trans, c, 0, 0);
 	c->reflink_gc_nr = 0;
 
-	if (initial) {
-		ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
-				bch2_gc_reflink_start_initial_fn);
-		goto out;
-	}
-
 	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
 			   BTREE_ITER_PREFETCH, k, ret) {
 		const __le64 *refcount = bkey_refcount_c(k);
@@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
 		r->refcount = 0;
 	}
 	bch2_trans_iter_exit(&trans, &iter);
-out:
+
 	bch2_trans_exit(&trans);
 	return ret;
 }
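Worth noting for the two gc hunks above: repair writes now pick their
destination at runtime. Before journal replay has finished, the btree
must not be updated directly, so fixes are staged in the in-memory
journal keys list; once the filesystem is RW they go through a normal
transaction commit. A condensed sketch of that dispatch, with
gc_repair_insert() as a hypothetical helper name:

static int gc_repair_insert(struct btree_trans *trans, enum btree_id id,
			    struct bkey_i *new, bool initial)
{
	/* staged into journal_keys while still RO; a real commit once RW: */
	return initial
		? bch2_journal_key_insert(trans->c, id, 0, new)
		: __bch2_trans_do(trans, NULL, NULL, 0,
				  __bch2_btree_insert(trans, id, new));
}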
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 0b5bf75fbf89..01c130a3ce8d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -12,6 +12,7 @@
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "subvolume.h"
 #include "trace.h"
@@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
 static void btree_path_verify_new_node(struct btree_trans *trans,
 				       struct btree_path *path, struct btree *b)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_path_level *l;
 	unsigned plevel;
 	bool parent_locked;
@@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
 	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
 		return;
 
+	if (trans->journal_replay_not_finished)
+		return;
+
 	plevel = b->c.level + 1;
 	if (!btree_path_node(path, plevel))
 		return;
@@ -1092,7 +1097,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
 		char buf4[100];
 		struct bkey uk = bkey_unpack_key(b, k);
 
-		bch2_dump_btree_node(trans->c, l->b);
+		bch2_dump_btree_node(c, l->b);
 		bch2_bpos_to_text(&PBUF(buf1), path->pos);
 		bch2_bkey_to_text(&PBUF(buf2), &uk);
 		bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
 	return ret;
 }
 
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+				 struct btree_and_journal_iter *jiter)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	struct bkey_buf tmp;
+	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+		? (path->level > 1 ? 0 :  2)
+		: (path->level > 1 ? 1 : 16);
+	bool was_locked = btree_node_locked(path, path->level);
+	int ret = 0;
+
+	bch2_bkey_buf_init(&tmp);
+
+	while (nr-- && !ret) {
+		if (!bch2_btree_node_relock(trans, path, path->level))
+			break;
+
+		bch2_btree_and_journal_iter_advance(jiter);
+		k = bch2_btree_and_journal_iter_peek(jiter);
+		if (!k.k)
+			break;
+
+		bch2_bkey_buf_reassemble(&tmp, c, k);
+		ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+					       path->level - 1);
+	}
+
+	if (!was_locked)
+		btree_node_unlock(path, path->level);
+
+	bch2_bkey_buf_exit(&tmp, c);
+	return ret;
+}
+
 static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
 					    struct btree_path *path,
 					    unsigned plevel, struct btree *b)
@@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
 	btree_node_unlock(path, plevel);
 }
 
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+						     struct btree_path *path,
+						     unsigned flags,
+						     struct bkey_buf *out)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path_level *l = path_l(path);
+	struct btree_and_journal_iter jiter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+	k = bch2_btree_and_journal_iter_peek(&jiter);
+
+	bch2_bkey_buf_reassemble(out, c, k);
+
+	if (flags & BTREE_ITER_PREFETCH)
+		ret = btree_path_prefetch_j(trans, path, &jiter);
+
+	bch2_btree_and_journal_iter_exit(&jiter);
+	return ret;
+}
+
 static __always_inline int btree_path_down(struct btree_trans *trans,
 					   struct btree_path *path,
 					   unsigned flags,
@@ -1321,8 +1385,21 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	EBUG_ON(!btree_node_locked(path, path->level));
 
 	bch2_bkey_buf_init(&tmp);
-	bch2_bkey_buf_unpack(&tmp, c, l->b,
-			     bch2_btree_node_iter_peek(&l->iter, l->b));
+
+	if (unlikely(trans->journal_replay_not_finished)) {
+		ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+		if (ret)
+			goto err;
+	} else {
+		bch2_bkey_buf_unpack(&tmp, c, l->b,
+				     bch2_btree_node_iter_peek(&l->iter, l->b));
+
+		if (flags & BTREE_ITER_PREFETCH) {
+			ret = btree_path_prefetch(trans, path);
+			if (ret)
+				goto err;
+		}
+	}
 
 	b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
 	ret = PTR_ERR_OR_ZERO(b);
@@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 	mark_btree_node_locked(path, level, lock_type);
 	btree_path_level_init(trans, path, b);
 
-	if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+	if (likely(!trans->journal_replay_not_finished &&
+		   tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
 	    unlikely(b != btree_node_mem_ptr(tmp.k)))
 		btree_node_mem_ptr_set(trans, path, level + 1, b);
 
-	if (flags & BTREE_ITER_PREFETCH)
-		ret = btree_path_prefetch(trans, path);
-
 	if (btree_node_read_locked(path, level + 1))
 		btree_node_unlock(path, level + 1);
 	path->level = level;
@@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
 	return ret;
 }
 
+static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+						 struct btree_path *path)
+{
+	struct journal_keys *keys = &trans->c->journal_keys;
+	size_t idx = bch2_journal_key_search(keys, path->btree_id,
+					     path->level, path->pos);
+
+	while (idx < keys->nr && keys->d[idx].overwritten)
+		idx++;
+
+	return (idx < keys->nr &&
+		keys->d[idx].btree_id == path->btree_id &&
+		keys->d[idx].level == path->level)
+		? keys->d[idx].k
+		: NULL;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+					      struct btree_iter *iter)
+{
+	struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path);
+
+	if (k && !bpos_cmp(k->k.p, iter->pos)) {
+		iter->k = k->k;
+		return bkey_i_to_s_c(k);
+	} else {
+		return bkey_s_c_null;
+	}
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 struct bkey_s_c k)
+{
+	struct bkey_i *next_journal =
+		__btree_trans_peek_journal(trans, iter->path);
+
+	if (next_journal &&
+	    bpos_cmp(next_journal->k.p,
+		     k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+		iter->k = next_journal->k;
+		k = bkey_i_to_s_c(next_journal);
+	}
+
+	return k;
+}
+
 /**
  * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
  * current position
@@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 			goto out;
 		}
 
-		next_update = btree_trans_peek_updates(iter);
 		k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
 
-		/*
-		 * In the btree, deleted keys sort before non deleted:
-		 */
-		if (k.k && bkey_deleted(k.k) &&
-		    (!next_update ||
-		     bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
-			search_key = k.k->p;
-			continue;
-		}
+		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+			k = btree_trans_peek_journal(trans, iter, k);
+
+		next_update = btree_trans_peek_updates(iter);
 
 		if (next_update &&
 		    bpos_cmp(next_update->k.p,
@@ -2159,6 +2279,20 @@
 			k = bkey_i_to_s_c(next_update);
 		}
 
+		if (k.k && bkey_deleted(k.k)) {
+			/*
+			 * If we've got a whiteout, and it's after the search
+			 * key, advance the search key to the whiteout instead
+			 * of just after the whiteout - it might be a btree
+			 * whiteout, with a real key at the same position, since
+			 * in the btree deleted keys sort before non deleted.
+			 */
+			search_key = bpos_cmp(search_key, k.k->p)
+				? k.k->p
+				: bpos_successor(k.k->p);
+			continue;
+		}
+
 		if (likely(k.k)) {
 			/*
 			 * We can never have a key in a leaf node at POS_MAX, so
@@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 
 	EBUG_ON(iter->path->cached || iter->path->level);
 	EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+	if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+		return bkey_s_c_err(-EIO);
+
 	bch2_btree_iter_verify(iter);
 	bch2_btree_iter_verify_entry_exit(iter);
 
@@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 	    !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
 		struct bkey_i *next_update;
 
-		next_update = btree_trans_peek_updates(iter);
-		if (next_update &&
+		if ((next_update = btree_trans_peek_updates(iter)) &&
 		    !bpos_cmp(next_update->k.p, iter->pos)) {
 			iter->k = next_update->k;
 			k = bkey_i_to_s_c(next_update);
-		} else {
-			k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+			goto out;
 		}
 
-		if (!k.k ||
-		    ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
-		     ? bpos_cmp(iter->pos, k.k->p)
-		     : bkey_cmp(iter->pos, k.k->p))) {
-			bkey_init(&iter->k);
-			iter->k.p = iter->pos;
-			k = (struct bkey_s_c) { &iter->k, NULL };
-		}
+		if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+		    (k = btree_trans_peek_slot_journal(trans, iter)).k)
+			goto out;
+
+		k = bch2_btree_path_peek_slot(iter->path, &iter->k);
 	} else {
 		struct bpos next;
@@ -2455,7 +2588,7 @@
 			k = (struct bkey_s_c) { &iter->k, NULL };
 		}
 	}
-
+out:
 	iter->path->should_be_locked = true;
 
 	bch2_btree_iter_verify_entry_exit(iter);
@@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
 	    btree_type_has_snapshots(btree_id))
 		flags |= BTREE_ITER_FILTER_SNAPSHOTS;
 
+	if (trans->journal_replay_not_finished)
+		flags |= BTREE_ITER_WITH_JOURNAL;
+
 	iter->trans	= trans;
 	iter->path	= NULL;
 	iter->btree_id	= btree_id;
@@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 	memset(trans, 0, sizeof(*trans));
 	trans->c		= c;
 	trans->ip		= _RET_IP_;
+	trans->journal_replay_not_finished =
+		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
 
 	bch2_trans_alloc_paths(trans, c);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 2c8b30949e6f..1fd0cebe30ac 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -207,10 +207,11 @@ struct btree_node_iter {
 #define BTREE_ITER_CACHED_NOFILL	(1 << 8)
 #define BTREE_ITER_CACHED_NOCREATE	(1 << 9)
 #define BTREE_ITER_WITH_UPDATES		(1 << 10)
-#define __BTREE_ITER_ALL_SNAPSHOTS	(1 << 11)
-#define BTREE_ITER_ALL_SNAPSHOTS	(1 << 12)
-#define BTREE_ITER_FILTER_SNAPSHOTS	(1 << 13)
-#define BTREE_ITER_NOPRESERVE		(1 << 14)
+#define BTREE_ITER_WITH_JOURNAL		(1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS	(1 << 12)
+#define BTREE_ITER_ALL_SNAPSHOTS	(1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS	(1 << 14)
+#define BTREE_ITER_NOPRESERVE		(1 << 15)
 
 enum btree_path_uptodate {
 	BTREE_ITER_UPTODATE = 0,
@@ -381,6 +382,7 @@ struct btree_trans {
 	bool			restarted:1;
 	bool			paths_sorted:1;
 	bool			journal_transaction_names:1;
+	bool			journal_replay_not_finished:1;
 	/*
	 * For when bch2_trans_update notices we'll be splitting a compressed
	 * extent:
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6ef0711431a1..17111c4228bd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 #include "trace.h"
@@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
 	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
 	       !btree_ptr_sectors_written(insert));
 
+	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
 	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
 		bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
 	if (invalid) {
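Why this hook matters: once an interior-node update lands in the btree
before replay has finished, the corresponding key in the journal keys
list is stale and must not be replayed or returned by the overlay. A
sketch of what bch2_journal_key_overwritten() is expected to do, pieced
together from the recovery.c hunks below - the exact body isn't shown in
this patch, so treat this as an assumption:

static void journal_key_overwritten_sketch(struct journal_keys *keys,
					   enum btree_id btree, unsigned level,
					   struct bpos pos)
{
	size_t idx = bch2_journal_key_search(keys, btree, level, pos);

	/* only flag an exact match; iteration then skips it */
	if (idx < keys->nr &&
	    keys->d[idx].btree_id == btree &&
	    keys->d[idx].level == level &&
	    !bpos_cmp(keys->d[idx].k->k.p, pos))
		keys->d[idx].overwritten = true;
}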
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 8af9ba464b25..e95940ffad6b 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -711,7 +711,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 
 	ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
-	if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+	if (!ret && unlikely(trans->journal_replay_not_finished))
 		bch2_drop_overwrites_from_journal(trans);
 
 	trans_for_each_update(trans, i)
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index e18d2ecf7f07..86421f65d139 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
 			bch2_stripes_heap_insert(c, m, iter.pos);
 }
 
-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
-{
-	const struct bch_stripe *s;
-	struct bch_fs *c = trans->c;
-	struct stripe *m;
-	unsigned i;
-	int ret = 0;
-
-	if (k.k->type != KEY_TYPE_stripe)
-		return 0;
-
-	ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
-	if (ret)
-		return ret;
-
-	s = bkey_s_c_to_stripe(k).v;
-
-	m = genradix_ptr(&c->stripes, k.k->p.offset);
-	m->alive	= true;
-	m->sectors	= le16_to_cpu(s->sectors);
-	m->algorithm	= s->algorithm;
-	m->nr_blocks	= s->nr_blocks;
-	m->nr_redundant	= s->nr_redundant;
-	m->blocks_nonempty = 0;
-
-	for (i = 0; i < s->nr_blocks; i++)
-		m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-
-	spin_lock(&c->ec_stripes_heap_lock);
-	bch2_stripes_heap_update(c, m, k.k->p.offset);
-	spin_unlock(&c->ec_stripes_heap_lock);
-
-	return ret;
-}
-
 int bch2_stripes_read(struct bch_fs *c)
 {
 	struct btree_trans trans;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	const struct bch_stripe *s;
+	struct stripe *m;
+	unsigned i;
 	int ret;
 
 	bch2_trans_init(&trans, c, 0, 0);
-	ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
-					  bch2_stripes_read_fn);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (k.k->type != KEY_TYPE_stripe)
+			continue;
+
+		ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+		if (ret)
+			break;
+
+		s = bkey_s_c_to_stripe(k).v;
+
+		m = genradix_ptr(&c->stripes, k.k->p.offset);
+		m->alive	= true;
+		m->sectors	= le16_to_cpu(s->sectors);
+		m->algorithm	= s->algorithm;
+		m->nr_blocks	= s->nr_blocks;
+		m->nr_redundant	= s->nr_redundant;
+		m->blocks_nonempty = 0;
+
+		for (i = 0; i < s->nr_blocks; i++)
+			m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+
+		spin_lock(&c->ec_stripes_heap_lock);
+		bch2_stripes_heap_update(c, m, k.k->p.offset);
+		spin_unlock(&c->ec_stripes_heap_lock);
+	}
+	bch2_trans_iter_exit(&trans, &iter);
+
 	bch2_trans_exit(&trans);
+
 	if (ret)
 		bch_err(c, "error reading stripes: %i", ret);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 219351654564..57311ad283c7 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
 static int __journal_key_cmp(enum btree_id	l_btree_id,
 			     unsigned		l_level,
 			     struct bpos	l_pos,
-			     struct journal_key *r)
+			     const struct journal_key *r)
 {
 	return (cmp_int(l_btree_id,	r->btree_id) ?:
 		cmp_int(l_level,	r->level) ?:
 		bpos_cmp(l_pos,	r->k->k.p));
 }
 
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
 {
-	return (cmp_int(l->btree_id,	r->btree_id) ?:
-		cmp_int(l->level,	r->level) ?:
-		bpos_cmp(l->k->k.p,	r->k->k.p));
+	return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
 }
 
-static size_t journal_key_search(struct journal_keys *journal_keys,
-				 enum btree_id id, unsigned level,
-				 struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+			       enum btree_id id, unsigned level,
+			       struct bpos pos)
 {
 	size_t l = 0, r = journal_keys->nr, m;
 
@@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 	};
 	struct journal_keys *keys = &c->journal_keys;
 	struct journal_iter *iter;
-	unsigned idx = journal_key_search(keys, id, level, k->k.p);
+	size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
 
 	BUG_ON(test_bit(BCH_FS_RW, &c->flags));
 
@@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 	return 0;
 }
 
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
 int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
 			    unsigned level, struct bkey_i *k)
 {
@@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
 				  unsigned level, struct bpos pos)
 {
 	struct journal_keys *keys = &c->journal_keys;
-	size_t idx = journal_key_search(keys, btree, level, pos);
+	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
 
 	if (idx < keys->nr &&
 	    keys->d[idx].btree_id == btree &&
@@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
 
 static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-	struct journal_key *k = iter->idx - iter->keys->nr
-		? iter->keys->d + iter->idx : NULL;
+	struct journal_key *k = iter->keys->d + iter->idx;
 
-	if (k &&
-	    k->btree_id == iter->btree_id &&
-	    k->level == iter->level)
-		return k->k;
+	while (k < iter->keys->d + iter->keys->nr &&
+	       k->btree_id == iter->btree_id &&
+	       k->level == iter->level) {
+		if (!k->overwritten)
+			return k->k;
+
+		iter->idx++;
+		k = iter->keys->d + iter->idx;
+	}
 
-	iter->idx = iter->keys->nr;
 	return NULL;
 }
 
@@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
 	iter->btree_id	= id;
 	iter->level	= level;
 	iter->keys	= &c->journal_keys;
-	iter->idx	= journal_key_search(&c->journal_keys, id, level, pos);
-	list_add(&iter->list, &c->journal_iters);
+	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 	bch2_journal_iter_exit(&iter->journal);
 }
 
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-						struct bch_fs *c,
-						struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+						  struct bch_fs *c,
+						  struct btree *b,
+						  struct btree_node_iter node_iter,
+						  struct bpos pos)
 {
 	memset(iter, 0, sizeof(*iter));
 
 	iter->b = b;
-	bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
-	bch2_journal_iter_init(c, &iter->journal,
-			       b->c.btree_id, b->c.level, b->data->min_key);
+	iter->node_iter = node_iter;
+	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+	INIT_LIST_HEAD(&iter->journal.list);
 }
 
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
-					    struct btree_and_journal_iter iter)
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+						struct bch_fs *c,
+						struct btree *b)
 {
-	unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
-	struct bkey_s_c k;
-	struct bkey_buf tmp;
+	struct btree_node_iter node_iter;
 
-	BUG_ON(!b->c.level);
-
-	bch2_bkey_buf_init(&tmp);
-
-	while (i < nr &&
-	       (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		bch2_bkey_buf_reassemble(&tmp, c, k);
-
-		bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
-					 b->c.btree_id, b->c.level - 1);
-
-		bch2_btree_and_journal_iter_advance(&iter);
-		i++;
-	}
-
-	bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
-				enum btree_id btree_id,
-				btree_walk_key_fn key_fn)
-{
-	struct bch_fs *c = trans->c;
-	struct btree_and_journal_iter iter;
-	struct bkey_s_c k;
-	struct bkey_buf tmp;
-	struct btree *child;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&tmp);
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
-	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-		if (b->c.level) {
-			bch2_bkey_buf_reassemble(&tmp, c, k);
-
-			child = bch2_btree_node_get_noiter(c, tmp.k,
-						b->c.btree_id, b->c.level - 1,
-						false);
-
-			ret = PTR_ERR_OR_ZERO(child);
-			if (ret)
-				break;
-
-			btree_and_journal_iter_prefetch(c, b, iter);
-
-			ret = bch2_btree_and_journal_walk_recurse(trans, child,
-								  btree_id, key_fn);
-			six_unlock_read(&child->c.lock);
-		} else {
-			ret = key_fn(trans, k);
-		}
-
-		if (ret)
-			break;
-
-		bch2_btree_and_journal_iter_advance(&iter);
-	}
-
-	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_bkey_buf_exit(&tmp, c);
-	return ret;
-}
-
-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
-				btree_walk_key_fn key_fn)
-{
-	struct bch_fs *c = trans->c;
-	struct btree *b = c->btree_roots[btree_id].b;
-	int ret = 0;
-
-	if (btree_node_fake(b))
-		return 0;
-
-	six_lock_read(&b->c.lock, NULL, NULL);
-	ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
-	six_unlock_read(&b->c.lock);
-
-	return ret;
+	bch2_btree_node_iter_init_from_start(&node_iter, b);
+	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+	list_add(&iter->journal.list, &c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 	const struct journal_key *l = _l;
 	const struct journal_key *r = _r;
 
-	return cmp_int(l->btree_id,	r->btree_id) ?:
-		cmp_int(l->level,	r->level) ?:
-		bpos_cmp(l->k->k.p,	r->k->k.p) ?:
+	return journal_key_cmp(l, r) ?:
 		cmp_int(l->journal_seq, r->journal_seq) ?:
 		cmp_int(l->journal_offset, r->journal_offset);
 }
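bch2_journal_key_search() is the binary search the commit message calls
out as being redone on every peek; its body falls mostly outside the
hunk context above. Assuming the usual lower-bound formulation over the
sorted, deduped journal_keys array, it looks roughly like this - a
sketch, not the verbatim function:

size_t journal_key_search_sketch(struct journal_keys *keys,
				 enum btree_id id, unsigned level,
				 struct bpos pos)
{
	size_t l = 0, r = keys->nr;

	/* invariant: d[0..l) < (id, level, pos) <= d[r..nr) */
	while (l < r) {
		size_t m = l + (r - l) / 2;

		if (__journal_key_cmp(id, level, pos, &keys->d[m]) > 0)
			l = m + 1;
		else
			r = m;
	}

	return l;	/* index of first key >= (id, level, pos) */
}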
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a7a9496afb95..21bdad9db249 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,6 +31,9 @@ struct btree_and_journal_iter {
 	}			last;
 };
 
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+			       unsigned, struct bpos);
+
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
 				 unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+				struct bch_fs *, struct btree *,
+				struct btree_node_iter, struct bpos);
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
 				struct bch_fs *, struct btree *);
 
-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-
-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
-
 void bch2_journal_keys_free(struct journal_keys *);
 void bch2_journal_entries_free(struct list_head *);