diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index a2769a85b029..65eeab56cb4b 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -16,6 +16,7 @@ bcachefs-y := \
 	btree_update_interior.o	\
 	btree_update_leaf.o	\
 	buckets.o		\
+	buckets_waiting_for_journal.o	\
 	chardev.o		\
 	checksum.o		\
 	clock.o			\
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index ab7d972aac3a..bc5053ebe18f 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -9,6 +9,7 @@
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "ec.h"
@@ -561,8 +562,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
 	 * keys when there's only a small difference, so that we can
 	 * keep sequential buckets together:
 	 */
-	return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
-		(bucket_gc_gen(g) >> 4);
+	return bucket_gc_gen(g) >> 4;
 	}
 }
@@ -611,6 +611,14 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 		if (!bch2_can_invalidate_bucket(ca, b, m))
 			continue;
 
+		if (!m.data_type &&
+		    bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+						     last_seq_ondisk,
+						     ca->dev_idx, b)) {
+			ca->buckets_waiting_on_journal++;
+			continue;
+		}
+
 		if (e.nr && e.bucket + e.nr == b && e.key == key) {
 			e.nr++;
 		} else {
@@ -647,6 +655,7 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 
 	ca->inc_gen_needs_gc		= 0;
 	ca->inc_gen_really_needs_gc	= 0;
+	ca->buckets_waiting_on_journal	= 0;
 
 	find_reclaimable_buckets_lru(c, ca);
 
@@ -658,28 +667,6 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 	return nr;
 }
 
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
-	if (m.journal_seq_valid) {
-		u64 journal_seq = atomic64_read(&c->journal.seq);
-		u64 bucket_seq	= journal_seq;
-
-		bucket_seq &= ~((u64) U16_MAX);
-		bucket_seq |= m.journal_seq;
-
-		if (bucket_seq > journal_seq)
-			bucket_seq -= 1 << 16;
-
-		return bucket_seq;
-	} else {
-		return 0;
-	}
-}
-
 static int bucket_invalidate_btree(struct btree_trans *trans,
 				   struct bch_dev *ca, u64 b)
 {
@@ -745,9 +732,10 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 	 * gen in memory here, the incremented gen will be updated in the btree
 	 * by bch2_trans_mark_pointer():
 	 */
-	if (!m.cached_sectors &&
-	    !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
-		BUG_ON(m.data_type);
+	if (!m.data_type &&
+	    !bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+					      c->journal.flushed_seq_ondisk,
+					      ca->dev_idx, b)) {
 		bucket_cmpxchg(g, m, m.gen++);
 		*bucket_gen(ca, b) = m.gen;
 		percpu_up_read(&c->mark_lock);
@@ -781,13 +769,6 @@ out:
 
 		if (!top->nr)
 			heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-
-		/*
-		 * Make sure we flush the last journal entry that updated this
-		 * bucket (i.e. deleting the last reference) before writing to
-		 * this bucket again:
-		 */
-		*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
 	} else {
 		size_t b2;
@@ -954,8 +935,14 @@ static int bch2_allocator_thread(void *arg)
 			gc_count = c->gc_count;
 			nr = find_reclaimable_buckets(c, ca);
 
-			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-					 ca->inc_gen_really_needs_gc);
+			if (!nr && ca->buckets_waiting_on_journal) {
+				ret = bch2_journal_flush(&c->journal);
+				if (ret)
+					goto stop;
+			} else if (nr < (ca->mi.nbuckets >> 6) &&
+				   ca->buckets_waiting_on_journal >= nr / 2) {
+				bch2_journal_flush_async(&c->journal, NULL);
+			}
 
 			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
 			     ca->inc_gen_really_needs_gc) &&
@@ -963,6 +950,9 @@ static int bch2_allocator_thread(void *arg)
 				atomic_inc(&c->kick_gc);
 				wake_up_process(c->gc_thread);
 			}
+
+			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+					 ca->inc_gen_really_needs_gc);
 		}
 
 		ret = bch2_invalidate_buckets(c, ca);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 4ebaefd408a4..3d1a6773393c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -355,6 +355,7 @@ enum bch_time_stats {
 #include "alloc_types.h"
 #include "btree_types.h"
 #include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
@@ -482,6 +483,7 @@ struct bch_dev {
 
 	size_t			inc_gen_needs_gc;
 	size_t			inc_gen_really_needs_gc;
+	size_t			buckets_waiting_on_journal;
 
 	enum allocator_states	allocator_state;
 
@@ -777,6 +779,8 @@ struct bch_fs {
 	struct mutex		write_points_hash_lock;
 	unsigned		write_points_nr;
 
+	struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
 	/* GARBAGE COLLECTION */
 	struct task_struct	*gc_thread;
 	atomic_t		kick_gc;
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index de33491f2535..24de8604740c 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -396,10 +396,11 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
 	}
 }
 
-static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_insert_entry *i;
+	int ret = 0;
 
 	trans_for_each_update(trans, i) {
 		/*
@@ -408,10 +409,15 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
 		 */
 		BUG_ON(i->cached || i->level);
 
-		if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b)))
-			bch2_mark_update(trans, i->path, i->k,
-					 i->flags|BTREE_TRIGGER_GC);
+		if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+			ret = bch2_mark_update(trans, i->path, i->k,
+					       i->flags|BTREE_TRIGGER_GC);
+			if (ret)
+				break;
+		}
 	}
+
+	return ret;
 }
 
 static inline int
@@ -510,11 +516,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 			return BTREE_INSERT_NEED_MARK_REPLICAS;
 
 	trans_for_each_update(trans, i)
-		if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
-			bch2_mark_update(trans, i->path, i->k, i->flags);
+		if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+			ret = bch2_mark_update(trans, i->path, i->k, i->flags);
+			if (ret)
+				return ret;
+		}
 
-	if (unlikely(c->gc_pos.phase))
-		bch2_trans_mark_gc(trans);
+	if (unlikely(c->gc_pos.phase)) {
+		ret = bch2_trans_mark_gc(trans);
+		if (ret)
+			return ret;
+	}
 
 	trans_for_each_update(trans, i)
 		do_btree_insert_one(trans, i);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index b80ab1ed22f7..f7a750aff03f 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -11,6 +11,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "ec.h"
 #include "error.h"
 #include "inode.h"
@@ -43,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
 	}
 }
 
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
-	u64 journal_seq = atomic64_read(&c->journal.seq);
-	u16 last_seq_ondisk = c->journal.flushed_seq_ondisk;
-	struct bch_dev *ca;
-	struct bucket_array *buckets;
-	struct bucket *g;
-	struct bucket_mark m;
-	unsigned i;
-
-	if (journal_seq - c->last_bucket_seq_cleanup <
-	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
-		return;
-
-	c->last_bucket_seq_cleanup = journal_seq;
-
-	for_each_member_device(ca, c, i) {
-		down_read(&ca->bucket_lock);
-		buckets = bucket_array(ca);
-
-		for_each_bucket(g, buckets) {
-			bucket_cmpxchg(g, m, ({
-				if (!m.journal_seq_valid ||
-				    bucket_needs_journal_commit(m, last_seq_ondisk))
-					break;
-
-				m.journal_seq_valid = 0;
-			}));
-		}
-		up_read(&ca->bucket_lock);
-	}
-}
-
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
 	struct bch_fs_usage *usage;
@@ -576,16 +540,28 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 		v->journal_seq = cpu_to_le64(new_u.journal_seq);
 	}
 
-	ca = bch_dev_bkey_exists(c, new.k->p.inode);
+	if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+		ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+				c->journal.flushed_seq_ondisk,
+				new_u.dev, new_u.bucket,
+				new_u.journal_seq);
+		if (ret) {
+			bch2_fs_fatal_error(c,
+				"error setting bucket_needs_journal_commit: %i", ret);
+			return ret;
+		}
+	}
 
-	if (new.k->p.offset >= ca->mi.nbuckets)
+	ca = bch_dev_bkey_exists(c, new_u.dev);
+
+	if (new_u.bucket >= ca->mi.nbuckets)
 		return 0;
 
 	percpu_down_read(&c->mark_lock);
 	if (!gc && new_u.gen != old_u.gen)
-		*bucket_gen(ca, new.k->p.offset) = new_u.gen;
+		*bucket_gen(ca, new_u.bucket) = new_u.gen;
 
-	g = __bucket(ca, new.k->p.offset, gc);
+	g = __bucket(ca, new_u.bucket, gc);
 
 	old_m = bucket_cmpxchg(g, m, ({
 		m.gen			= new_u.gen;
@@ -593,11 +569,6 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 		m.dirty_sectors		= new_u.dirty_sectors;
 		m.cached_sectors	= new_u.cached_sectors;
 		m.stripe		= new_u.stripe != 0;
-
-		if (journal_seq) {
-			m.journal_seq_valid	= 1;
-			m.journal_seq		= journal_seq;
-		}
 	}));
 
 	bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
@@ -625,7 +596,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
 			return ret;
 		}
 
-		trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+		trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
 				 old_m.cached_sectors);
 	}
 
@@ -775,9 +746,10 @@ static int check_bucket_ref(struct bch_fs *c,
 static int mark_stripe_bucket(struct btree_trans *trans,
 			      struct bkey_s_c k,
 			      unsigned ptr_idx,
-			      u64 journal_seq, unsigned flags)
+			      unsigned flags)
 {
 	struct bch_fs *c = trans->c;
+	u64 journal_seq = trans->journal_res.seq;
 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 	unsigned nr_data = s->nr_blocks - s->nr_redundant;
 	bool parity = ptr_idx >= nr_data;
@@ -818,11 +790,6 @@ static int mark_stripe_bucket(struct btree_trans *trans,
 		if (data_type)
 			new.data_type		= data_type;
-
-		if (journal_seq) {
-			new.journal_seq_valid	= 1;
-			new.journal_seq		= journal_seq;
-		}
-
 		new.stripe = true;
 	}));
@@ -894,11 +861,6 @@ static int bch2_mark_pointer(struct btree_trans *trans,
 		new.data_type = bucket_data_type;
 
-		if (journal_seq) {
-			new.journal_seq_valid = 1;
-			new.journal_seq = journal_seq;
-		}
-
 		if (flags & BTREE_TRIGGER_NOATOMIC) {
 			g->_mark = new;
 			break;
 		}
@@ -1119,7 +1081,7 @@ static int bch2_mark_stripe(struct btree_trans *trans,
 		memset(m->block_sectors, 0, sizeof(m->block_sectors));
 
 		for (i = 0; i < new_s->nr_blocks; i++) {
-			ret = mark_stripe_bucket(trans, new, i, journal_seq, flags);
+			ret = mark_stripe_bucket(trans, new, i, flags);
 			if (ret)
 				return ret;
 		}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6eeb95068b3b..4b5376684d2c 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -159,13 +159,6 @@ static inline bool is_available_bucket(struct bucket_mark mark)
 	return !mark.dirty_sectors && !mark.stripe;
 }
 
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
-					       u16 last_seq_ondisk)
-{
-	return m.journal_seq_valid &&
-		((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
 /* Device usage: */
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
@@ -240,7 +233,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 /* key/bucket marking: */
 
-void bch2_bucket_seq_cleanup(struct bch_fs *);
 void bch2_fs_usage_initialize(struct bch_fs *);
 
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 18bca269b750..24139831226d 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -15,18 +15,9 @@ struct bucket_mark {
 			u8		gen;
 			u8		data_type:3,
 					owned_by_allocator:1,
-					journal_seq_valid:1,
 					stripe:1;
 			u16		dirty_sectors;
 			u16		cached_sectors;
-
-			/*
-			 * low bits of journal sequence number when this bucket was most
-			 * recently modified: if journal_seq_valid is set, this bucket can't be
-			 * reused until the journal sequence number written to disk is >= the
-			 * bucket's journal sequence number:
-			 */
-			u16		journal_seq;
 		};
 	};
 };
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
new file mode 100644
index 000000000000..f3774e30b5cd
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/hash.h>
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+	    unsigned hash_seed_idx, u64 dev_bucket)
+{
+	return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits);
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits)
+{
+	unsigned i;
+
+	t->bits = bits;
+	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+		get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+	memset(t->d, 0, sizeof(t->d[0]) << t->bits);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+				      u64 flushed_seq,
+				      unsigned dev, u64 bucket)
+{
+	struct buckets_waiting_for_journal_table *t;
+	u64 dev_bucket = (u64) dev << 56 | bucket;
+	bool ret = false;
+	unsigned i;
+
+	mutex_lock(&b->lock);
+	t = b->t;
+
+	for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+		struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+		if (h->dev_bucket == dev_bucket) {
+			ret = h->journal_seq > flushed_seq;
+			break;
+		}
+	}
+
+	mutex_unlock(&b->lock);
+
+	return ret;
+}
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+				struct bucket_hashed *new,
+				u64 flushed_seq)
+{
+	struct bucket_hashed *last_evicted = NULL;
+	unsigned tries, i;
+
+	for (tries = 0; tries < 10; tries++) {
+		struct bucket_hashed *old, *victim = NULL;
+
+		for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+			old = bucket_hash(t, i, new->dev_bucket);
+
+			if (old->dev_bucket == new->dev_bucket ||
+			    old->journal_seq <= flushed_seq) {
+				*old = *new;
+				return true;
+			}
+
+			if (last_evicted != old)
+				victim = old;
+		}
+
+		/* hashed to same slot 3 times: */
+		if (!victim)
+			break;
+
+		/* Failed to find an empty slot: */
+		swap(*new, *victim);
+		last_evicted = victim;
+	}
+
+	return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+					 u64 flushed_seq,
+					 unsigned dev, u64 bucket,
+					 u64 journal_seq)
+{
+	struct buckets_waiting_for_journal_table *t, *n;
+	struct bucket_hashed tmp, new = {
+		.dev_bucket	= (u64) dev << 56 | bucket,
+		.journal_seq	= journal_seq,
+	};
+	size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0;
+	int ret = 0;
+
+	mutex_lock(&b->lock);
+
+	if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+		goto out;
+
+	t = b->t;
+	size = 1UL << t->bits;
+	for (i = 0; i < size; i++)
+		nr_elements += t->d[i].journal_seq > flushed_seq;
+
+	new_bits = t->bits + (nr_elements * 3 > size);
+
+	n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
+	if (!n) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+retry_rehash:
+	nr_rehashes++;
+	bucket_table_init(n, new_bits);
+
+	tmp = new;
+	BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+	for (i = 0; i < 1UL << t->bits; i++) {
+		if (t->d[i].journal_seq <= flushed_seq)
+			continue;
+
+		tmp = t->d[i];
+		if (!bucket_table_insert(n, &tmp, flushed_seq))
+			goto retry_rehash;
+	}
+
+	b->t = n;
+	kvfree(t);
+
+	pr_debug("took %zu rehashes, table at %zu/%zu elements",
+		 nr_rehashes, nr_elements, 1UL << b->t->bits);
+out:
+	mutex_unlock(&b->lock);
+
+	return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	kvfree(b->t);
+}
+
+#define INITIAL_TABLE_BITS	3
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+	mutex_init(&b->lock);
+
+	b->t = kvmalloc(sizeof(*b->t) +
+			(sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL);
+	if (!b->t)
+		return -ENOMEM;
+
+	bucket_table_init(b->t, INITIAL_TABLE_BITS);
+	return 0;
+}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
new file mode 100644
index 000000000000..d2ae19cbe18c
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+				      u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+					 u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h
new file mode 100644
index 000000000000..e593db061d81
--- /dev/null
+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/mutex.h>
+
+struct bucket_hashed {
+	u64			dev_bucket;
+	u64			journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+	unsigned		bits;
+	u64			hash_seeds[3];
+	struct bucket_hashed	d[];
+};
+
+struct buckets_waiting_for_journal {
+	struct mutex		lock;
+	struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 4602f581198e..815310e2426f 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -1666,13 +1666,9 @@ retry_alloc:
 		}
 	}
 
-	bch2_bucket_seq_cleanup(c);
-
 	continue_at(cl, do_journal_write, c->io_complete_wq);
 	return;
 no_io:
-	bch2_bucket_seq_cleanup(c);
-
 	continue_at(cl, journal_write_done, c->io_complete_wq);
 	return;
 err:
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 55bb263a0906..3094eb1e3406 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -16,6 +16,7 @@
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
 #include "chardev.h"
 #include "checksum.h"
 #include "clock.h"
@@ -475,6 +476,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	bch2_fs_ec_exit(c);
 	bch2_fs_encryption_exit(c);
 	bch2_fs_io_exit(c);
+	bch2_fs_buckets_waiting_for_journal_exit(c);
 	bch2_fs_btree_interior_update_exit(c);
 	bch2_fs_btree_iter_exit(c);
 	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -818,6 +820,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
 	    bch2_fs_btree_iter_init(c) ?:
 	    bch2_fs_btree_interior_update_init(c) ?:
+	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
 	    bch2_fs_subvolumes_init(c) ?:
 	    bch2_fs_io_init(c) ?:
 	    bch2_fs_encryption_init(c) ?:
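
Note (not part of the patch): the new buckets_waiting_for_journal table replaces the per-bucket journal_seq/journal_seq_valid fields with a small hash table keyed by (device, bucket), probed with three hash seeds and bounded eviction before falling back to a rehash into a larger table. The standalone userspace sketch below illustrates that insert/lookup scheme under stated assumptions: hash_64(), get_random_bytes() and the kernel's swap() are replaced by stand-ins, and the seed constants, table size, and sequence numbers are made-up example values.

/*
 * Illustrative sketch only, not the kernel code: multi-seed, eviction-based
 * ("cuckoo style") hash table of buckets waiting on a journal flush.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_SEEDS	3
#define TABLE_BITS	4	/* 16 slots, far smaller than the real table */

struct entry {
	uint64_t	dev_bucket;	/* (dev << 56) | bucket; 0 = empty slot */
	uint64_t	journal_seq;	/* last journal seq that emptied the bucket */
};

static struct entry table[1UL << TABLE_BITS];
static const uint64_t seeds[NR_SEEDS] = {
	0x9e3779b97f4a7c15ULL, 0xc2b2ae3d27d4eb4fULL, 0x165667b19e3779f9ULL,
};

/* stand-in for hash_64(): multiplicative hash, keep the top TABLE_BITS bits */
static size_t slot(unsigned seed_idx, uint64_t dev_bucket)
{
	return ((dev_bucket ^ seeds[seed_idx]) * 0x61c8864680b583ebULL)
		>> (64 - TABLE_BITS);
}

/* mirrors bch2_bucket_needs_journal_commit(): probe each candidate slot */
static bool needs_journal_commit(uint64_t dev_bucket, uint64_t flushed_seq)
{
	for (unsigned i = 0; i < NR_SEEDS; i++) {
		struct entry *e = &table[slot(i, dev_bucket)];

		if (e->dev_bucket == dev_bucket)
			return e->journal_seq > flushed_seq;
	}
	return false;
}

/*
 * mirrors bucket_table_insert(): reuse a candidate slot whose entry is
 * already flushed, otherwise evict a victim and try to re-place it, giving
 * up after a bounded number of rounds.
 */
static bool insert(struct entry new, uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;

	for (unsigned tries = 0; tries < 10; tries++) {
		struct entry *victim = NULL;

		for (unsigned i = 0; i < NR_SEEDS; i++) {
			struct entry *old = &table[slot(i, new.dev_bucket)];

			if (old->dev_bucket == new.dev_bucket ||
			    old->journal_seq <= flushed_seq) {
				*old = new;
				return true;
			}
			if (old != last_evicted)
				victim = old;
		}

		if (!victim)	/* every candidate slot was the entry just evicted */
			break;

		struct entry tmp = *victim;	/* kick the victim out ... */
		*victim = new;
		new = tmp;			/* ... and re-place it next round */
		last_evicted = victim;
	}

	return false;
}

int main(void)
{
	uint64_t flushed_seq = 100;

	/* bucket 7 was emptied by a journal entry with sequence number 123 */
	if (!insert((struct entry) { .dev_bucket = 7, .journal_seq = 123 },
		    flushed_seq))
		printf("no slot found; the kernel code would rehash here\n");

	/* prints "1 0": seq 123 not yet flushed, then flushed up to 130 */
	printf("%d %d\n", needs_journal_commit(7, 100),
			  needs_journal_commit(7, 130));
	return 0;
}

Where the sketch gives up after ten eviction rounds, bch2_set_bucket_needs_journal_commit() instead allocates a table (twice as large once the live-element count exceeds a third of the slots), reinserts every entry that is still unflushed, and retries, so stale entries are garbage collected for free on rehash.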