bcachefs: Interior btree updates are now fully transactional

We now update the alloc info (bucket sector counts) atomically with
journalling the update to the interior btree nodes, and we likewise set
new btree roots atomically with the journalled part of the btree update.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author:    Kent Overstreet
Date:      2020-05-25 14:57:06 -04:00
Committer: Kent Overstreet
Parent:    c823c3390b
Commit:    00b8ccf707
21 changed files with 414 additions and 628 deletions
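
In concrete terms: an interior update no longer sets c->btree_roots_dirty
and relies on an extra journal write; it formats the new root directly
into its own preallocated journal reservation, so the root becomes
visible in the same journal entry as the rest of the update. A minimal
sketch, using names this commit introduces (journal_entry_set(),
as->journal_entries, as->journal_u64s); the real code is in the
suppressed diff below (presumably btree_update_interior.c), and
sketch_journal_new_root() itself is hypothetical:

	/* Record b as the new root of as->btree_id, atomically with the
	 * rest of the interior update's journalled keys: */
	static void sketch_journal_new_root(struct btree_update *as,
					    struct btree *b)
	{
		struct jset_entry *e =
			(void *) &as->journal_entries[as->journal_u64s];

		as->journal_u64s +=
			journal_entry_set(e, BCH_JSET_ENTRY_btree_root,
					  as->btree_id, b->c.level,
					  &b->key, b->key.k.u64s);
		BUG_ON(as->journal_u64s > BTREE_UPDATE_JOURNAL_RES);
	}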


@@ -1461,11 +1461,6 @@ again:
}
rcu_read_unlock();
if (c->btree_roots_dirty) {
bch2_journal_meta(&c->journal);
goto again;
}
return !nodes_unwritten &&
!bch2_btree_interior_updates_nr_pending(c);
}


@@ -603,13 +603,10 @@ struct bch_fs {
struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
mempool_t btree_reserve_pool;
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don't use it, if we free it that space can't be reused until going
@@ -627,6 +624,9 @@ struct bch_fs {
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
mempool_t btree_iters_pool;
struct workqueue_struct *wq;


@@ -466,6 +466,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
#if 0
/* Also see bch2_pending_btree_node_free_insert_done() */
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
@@ -483,6 +484,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
mutex_unlock(&c->btree_interior_update_lock);
}
#endif
static void bch2_mark_allocator_buckets(struct bch_fs *c)
{
@@ -801,6 +803,10 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
trace_gc_start(c);
down_write(&c->gc_lock);
/* flush interior btree updates: */
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
again:
ret = bch2_gc_start(c, metadata_only);
if (ret)
@@ -812,7 +818,9 @@ again:
if (ret)
goto out;
#if 0
bch2_mark_pending_btree_node_frees(c);
#endif
bch2_mark_allocator_buckets(c);
c->gc_count++;
@@ -1037,6 +1045,8 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
btree_node_reset_sib_u64s(n);
bch2_btree_build_aux_trees(n);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
bch2_btree_node_write(c, n, SIX_LOCK_intent);
@@ -1085,7 +1095,7 @@ next:
bch2_btree_iter_node_replace(iter, new_nodes[0]);
for (i = 0; i < nr_new_nodes; i++)
bch2_open_buckets_put(c, &new_nodes[i]->ob);
bch2_btree_update_get_open_buckets(as, new_nodes[i]);
/* Free the old nodes and update our sliding window */
for (i = 0; i < nr_old_nodes; i++) {


@@ -310,6 +310,7 @@ struct btree_trans {
/* update path: */
struct jset_entry *extra_journal_entries;
unsigned extra_journal_entry_u64s;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct journal_preres journal_preres;

File diff suppressed because it is too large


@@ -6,34 +6,13 @@
#include "btree_locking.h"
#include "btree_update.h"
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
#define BTREE_UPDATE_JOURNAL_RES \
((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
@@ -72,9 +51,8 @@ struct btree_update {
unsigned nodes_written:1;
enum btree_id btree_id;
u8 level;
struct btree_reserve *reserve;
struct disk_reservation disk_res;
struct journal_preres journal_preres;
/*
@@ -96,17 +74,28 @@ struct btree_update {
*/
struct journal_entry_pin journal;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* Preallocated nodes we reserve when we start the update: */
struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_prealloc_nodes;
/* Nodes being freed: */
struct keylist old_keys;
u64 _old_keys[BTREE_UPDATE_NODES_MAX *
BKEY_BTREE_PTR_VAL_U64s_MAX];
/* Nodes being added: */
struct keylist new_keys;
u64 _new_keys[BTREE_UPDATE_NODES_MAX *
BKEY_BTREE_PTR_VAL_U64s_MAX];
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
unsigned nr_new_nodes;
u8 open_buckets[BTREE_UPDATE_NODES_MAX *
BCH_REPLICAS_MAX];
u8 nr_open_buckets;
unsigned journal_u64s;
u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
@@ -120,14 +109,12 @@ struct btree_update {
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
@@ -139,6 +126,7 @@ bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *,
@@ -333,6 +321,10 @@ ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, struct jset_entry *);
void bch2_fs_btree_interior_update_exit(struct bch_fs *);
int bch2_fs_btree_interior_update_init(struct bch_fs *);
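
For scale, a worked example of the new reservation macros, assuming the
in-tree values BTREE_MAX_DEPTH == 4 and GC_MERGE_NODES == 4 (check the
current headers before relying on these; the rationale is inferred, not
taken from the commit):

	BTREE_UPDATE_NODES_MAX   = (BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES
	                         = (4 - 2) * 2 + 4
	                         = 8 nodes per update, worst case

	/* one u64 of jset_entry header plus a max-size btree pointer
	 * key for each node key the update journals: */
	BTREE_UPDATE_JOURNAL_RES = BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)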


@@ -414,8 +414,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
}
if (unlikely(trans->extra_journal_entry_u64s)) {
memcpy_u64s_small(bch2_journal_reservation_entry(&c->journal,
&trans->journal_res),
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
trans->extra_journal_entries,
trans->extra_journal_entry_u64s);
@@ -521,6 +520,10 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
i->iter);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq,
trans->journal_pin, NULL);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
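
The new trans->journal_pin hook above is how an interior update gets its
journal_entry_pin attached to whatever sequence number the transaction
actually commits in. A hypothetical caller — as->journal and as->disk_res
are fields shown elsewhere in this commit, but this exact call site is an
assumption, not taken from the diff:

	/* Pin the journal entry this commit lands in, so the interior
	 * update isn't lost to a crash before its new nodes are written: */
	trans->journal_pin = &as->journal;

	ret = bch2_trans_commit(trans, &as->disk_res, NULL,
				BTREE_INSERT_NOFAIL);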


@@ -1180,7 +1180,7 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
int bch2_mark_key_locked(struct bch_fs *c,
static int bch2_mark_key_locked(struct bch_fs *c,
struct bkey_s_c k,
unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,


@@ -259,8 +259,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage_online *,


@@ -958,15 +958,12 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
void bch2_fs_journal_stop(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bch2_journal_flush_all_pins(j);
wait_event(j->wait, journal_entry_close(j));
/* do we need to write another journal entry? */
if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
c->btree_roots_dirty)
if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
bch2_journal_meta(j);
journal_quiesce(j);


@@ -200,25 +200,15 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
}
static inline struct jset_entry *
bch2_journal_reservation_entry(struct journal *j, struct journal_res *res)
journal_res_entry(struct journal *j, struct journal_res *res)
{
return vstruct_idx(j->buf[res->idx].data, res->offset);
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
unsigned type, enum btree_id id,
unsigned level,
static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
struct jset_entry *entry = bch2_journal_reservation_entry(j, res);
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->level = level;
@@ -227,6 +217,23 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
entry->pad[1] = 0;
entry->pad[2] = 0;
memcpy_u64s_small(entry->_data, data, u64s);
return jset_u64s(u64s);
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
unsigned type, enum btree_id id,
unsigned level,
const void *data, unsigned u64s)
{
unsigned actual = journal_entry_set(journal_res_entry(j, res),
type, id, level, data, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,


@@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
@@ -992,8 +993,23 @@ void bch2_journal_write(struct closure *cl)
j->write_start_time = local_clock();
start = vstruct_last(jset);
end = bch2_journal_super_entries_add_common(c, start,
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
*
* But, every journal entry we write has to contain all the btree roots
* (at least for now); so after we copy btree roots to c->btree_roots we
* have to get any missing btree roots and add them to this journal
* entry:
*/
bch2_journal_entries_to_btree_roots(c, jset);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_journal_super_entries_add_common(c, end,
le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
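
The write side of that propagation is shown above; the companion that
copies journalled roots back into c->btree_roots lives in the suppressed
diff (presumably btree_update_interior.c). A sketch of its assumed shape,
using the vstruct iteration helpers:

	/* Assumed shape of bch2_journal_entries_to_btree_roots(): */
	vstruct_for_each(jset, entry)
		if (entry->type == BCH_JSET_ENTRY_btree_root) {
			struct btree_root *r =
				&c->btree_roots[entry->btree_id];

			r->level = entry->level;
			r->alive = true;
			bkey_copy(&r->key, &entry->start[0]);
		}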


@@ -330,7 +330,7 @@ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
__journal_pin_drop(j, pin);
BUG_ON(!atomic_read(&pin_list->count));
BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
atomic_inc(&pin_list->count);
pin->seq = seq;


@@ -38,7 +38,7 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
if (unlikely(!journal_pin_active(pin)))
if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
__bch2_journal_pin_add(j, seq, pin, flush_fn);
}
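
With the added pin->seq > seq check, re-adding an active pin is a no-op
unless the new sequence number is older: a pin can now be dragged
backwards to an earlier journal entry, but never forwards. Illustrated
with hypothetical sequence numbers:

	bch2_journal_pin_add(j, 10, &pin, NULL); /* pin becomes active at seq 10 */
	bch2_journal_pin_add(j, 12, &pin, NULL); /* no-op: already pinned at 10 <= 12 */
	bch2_journal_pin_add(j,  8, &pin, NULL); /* re-pins at seq 8, since 10 > 8 */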


@@ -6,7 +6,7 @@
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
size_t nr_inline_u64s, size_t new_u64s)
{
size_t oldsize = bch_keylist_u64s(l);
size_t oldsize = bch2_keylist_u64s(l);
size_t newsize = oldsize + new_u64s;
u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
u64 *new_keys;
@@ -52,7 +52,7 @@ void bch2_keylist_pop_front(struct keylist *l)
memmove_u64s_down(l->keys,
bkey_next(l->keys),
bch_keylist_u64s(l));
bch2_keylist_u64s(l));
}
#ifdef CONFIG_BCACHEFS_DEBUG


@@ -36,14 +36,14 @@ static inline bool bch2_keylist_empty(struct keylist *l)
return l->top == l->keys;
}
static inline size_t bch_keylist_u64s(struct keylist *l)
static inline size_t bch2_keylist_u64s(struct keylist *l)
{
return l->top_p - l->keys_p;
}
static inline size_t bch2_keylist_bytes(struct keylist *l)
{
return bch_keylist_u64s(l) * sizeof(u64);
return bch2_keylist_u64s(l) * sizeof(u64);
}
static inline struct bkey_i *bch2_keylist_front(struct keylist *l)


@@ -151,15 +151,8 @@ retry:
}
/* flush relevant btree updates */
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (c->btree_roots_dirty)
bch2_journal_meta(&c->journal);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
}
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
ret = 0;
err:


@@ -774,14 +774,8 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
while (1) {
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c) ||
c->btree_roots_dirty);
if (!bch2_btree_interior_updates_nr_pending(c))
break;
bch2_journal_meta(&c->journal);
}
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
ret = bch2_replicas_gc2(c) ?: ret;


@@ -763,6 +763,7 @@ static int verify_superblock_clean(struct bch_fs *c,
"superblock read clock doesn't match journal after clean shutdown");
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
@@ -778,7 +779,11 @@ static int verify_superblock_clean(struct bch_fs *c,
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(k1)) ||
l1 != l2, c,
"superblock btree root doesn't match journal after clean shutdown");
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
}
fsck_err:
return ret;


@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -955,7 +956,6 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
@@ -989,27 +989,8 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
u64 journal_seq)
{
struct btree_root *r;
unsigned i;
mutex_lock(&c->btree_root_lock);
for (r = c->btree_roots;
r < c->btree_roots + BTREE_ID_NR;
r++)
if (r->alive) {
entry_init_u64s(entry, r->key.u64s + 1);
entry->btree_id = r - c->btree_roots;
entry->level = r->level;
entry->type = BCH_JSET_ENTRY_btree_root;
bkey_copy(&entry->start[0], &r->key);
entry = vstruct_next(entry);
}
c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
percpu_down_read(&c->mark_lock);
if (!journal_seq) {
@@ -1111,6 +1092,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,


@@ -227,6 +227,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work);
clean_passes = wrote ? 0 : clean_passes + 1;
} while (clean_passes < 2);
@@ -234,6 +235,10 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "writing alloc info complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
nowrote_alloc:
closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
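
btree_interior_update_work, added to struct bch_fs earlier in this
commit, is the deferred-completion path these flush_work() calls drain.
A rough sketch of its assumed shape — next_completed_update() is a
hypothetical stand-in, and btree_update_nodes_written() is assumed from
the suppressed diff:

	/* Runs from c->btree_interior_update_worker: finish interior
	 * updates whose journal writes have completed, from process
	 * context where it's safe to block and take locks: */
	static void btree_interior_update_work(struct work_struct *work)
	{
		struct bch_fs *c = container_of(work, struct bch_fs,
						btree_interior_update_work);
		struct btree_update *as;

		while ((as = next_completed_update(c)))	/* hypothetical */
			btree_update_nodes_written(as);	/* assumed name */
	}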