bcachefs updates for 6.9

 - Subvolume children btree; this is needed for providing a userspace
   interface for walking subvolumes, which will come later
 - Lots of improvements to directory structure checking
 - Improved journal pipelining, significantly improving performance on
   high iodepth write workloads
 - Discard path improvements: the discard path is more efficient, and no
   longer flushes the journal unnecessarily
 - Buffered write path can now avoid taking the inode lock
 - New mm helper: memalloc_flags_{save|restore}
 - mempool now does kvmalloc mempools
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmXycEcACgkQE6szbY3K
 bnYUTg/+K4Nv2EdAqOCyHRTKaF2OgJDUb25ZDmbGpfT1XyPrNB7/+CxHqSdEP7/e
 FVuhtP61vnQAImDv82u9iZiab/TnuCZPUrjSobFEvrWYoGRtP9Bm9MyYB28NzmMa
 AXGmS4yJGVwtxxrFNxZP98IbiHYiHSoYbkqxX2E5VgLag8Ru8peb7oD0Ro3zw0rb
 z+6UM/seJ7on5i/9IJEMKKXFVEoZC2J5DAVoe1TghG2kgOw3cKu5OUdltLPOY5jL
 jkm5J5wa6Ep46nufHat92yiMxXIQrf4U9LkXxzTi5ThoSmt+Af2qXcBjqTTVqd2D
 1dGxj+UG8iu4DCCbQC6EA7J5EMvxfJM0+9lk1ULUgxUs3X69co6nlI6XH1fwEMqk
 KpIqd35+Y/IYgogt9ioXI0dtXyL7dbaTVt6NZhc9SaPGPX+C2V0+l4bqToFdNaPH
 0KATjjyQaJRE4ZFIjr6GliYOtKWDLi/HPEyoBivniUn7cF5vjSvti+cSQwNDSPpa
 6jOd5Y923Iq9ZqDAPM3+mvTH8nNaaf2T2fmbPNrc5pdWbha9bGwOU71zvKHNFGm/
 66ZsnwhKSk+uwglTMZHPKSkJJXUYAHESw3slQtEWHZVlliArc55+pBHwE00bvRt7
 KHUUqkqXBUPzbp/kdZGylMAdH9+8j9TE5QJ2RaoryFm/eCfexmI=
 =6xnj
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs updates from Kent Overstreet:

 - Subvolume children btree; this is needed for providing a userspace
   interface for walking subvolumes, which will come later

 - Lots of improvements to directory structure checking

 - Improved journal pipelining, significantly improving performance on
   high iodepth write workloads

 - Discard path improvements: the discard path is more efficient, and no
   longer flushes the journal unnecessarily

 - Buffered write path can now avoid taking the inode lock

 - New mm helper: memalloc_flags_{save|restore} (see the sketch after this list)

 - mempool now does kvmalloc mempools
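
Since memalloc_flags_{save|restore} is new in this cycle, a minimal usage
sketch may help. The surrounding function and the choice of PF_MEMALLOC_NOFS
are illustrative only, not taken from this pull:

    #include <linux/sched/mm.h>
    #include <linux/slab.h>

    /* sketch: scoped task-flag save/restore, in the style of
     * memalloc_nofs_save()/memalloc_nofs_restore() */
    static void *alloc_in_nofs_scope(size_t size)
    {
            /* sets PF_MEMALLOC_NOFS; returns only the bits newly set */
            unsigned flags = memalloc_flags_save(PF_MEMALLOC_NOFS);

            /* allocations in this scope implicitly behave as GFP_NOFS */
            void *p = kmalloc(size, GFP_KERNEL);

            memalloc_flags_restore(flags);  /* clears only what we set */
            return p;
    }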

* tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs: (128 commits)
  bcachefs: time_stats: shrink time_stat_buffer for better alignment
  bcachefs: time_stats: split stats-with-quantiles into a separate structure
  bcachefs: mean_and_variance: put struct mean_and_variance_weighted on a diet
  bcachefs: time_stats: add larger units
  bcachefs: pull out time_stats.[ch]
  bcachefs: reconstruct_alloc cleanup
  bcachefs: fix bch_folio_sector padding
  bcachefs: Fix btree key cache coherency during replay
  bcachefs: Always flush write buffer in delete_dead_inodes()
  bcachefs: Fix order of gc_done passes
  bcachefs: fix deletion of indirect extents in btree_gc
  bcachefs: Prefer struct_size over open coded arithmetic
  bcachefs: Kill unused flags argument to btree_split()
  bcachefs: Check for writing superblocks with nonsense member seq fields
  bcachefs: fix bch2_journal_buf_to_text()
  lib/generic-radix-tree.c: Make nodes more reasonably sized
  bcachefs: copy_(to|from)_user_errcode()
  bcachefs: Split out bkey_types.h
  bcachefs: fix lost journal buf wakeup due to improved pipelining
  bcachefs: intercept mountoption value for bool type
  ...
Committed by Linus Torvalds on 2024-03-15 09:00:09 -07:00 (commit 32a50540c3);
95 changed files with 3797 additions and 2280 deletions.

Documentation/filesystems/bcachefs/errorcodes.rst (new file)

@ -0,0 +1,30 @@
.. SPDX-License-Identifier: GPL-2.0
bcachefs private error codes
----------------------------
In bcachefs, as a hard rule we do not throw or directly use standard error
codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed
in fs/bcachefs/errcode.h.
This gives us much better error messages and makes debugging much easier. Any
direct uses of standard error codes you see in the source code are simply old
code that has yet to be converted - feel free to clean it up!
Private error codes may subtype another error code; this allows for grouping of
related errors that should be handled similarly (e.g. transaction restart
errors), as well as specifying which standard error code should be returned at
the bcachefs module boundary.
At the module boundary, we use bch2_err_class() to convert to a standard error
code; this also emits a trace event so that the original error code can be
recovered even if it wasn't logged.
Do not reuse error codes! Generally speaking, a private error code should only
be thrown in one place. That means that when it appears in a log message we can
tell, unambiguously, exactly which file and line number it was returned from.
Try to give error codes names that are as reasonably descriptive of the error
as possible. Frequently, the error will be logged at a place far removed from
where the error was generated; good names for error codes mean much more
descriptive and useful error messages.
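
To make the scheme concrete, here is a hedged sketch of the three pieces
described above. BCH_ERR_ENOMEM_journal_key_insert is a real code (it appears
later in this diff); the surrounding lines are schematic:

    /* fs/bcachefs/errcode.h: each entry pairs the standard parent class
     * with the private code; the parent is what callers outside the
     * module eventually see: */
    x(ENOMEM,       ENOMEM_journal_key_insert)

    /* throw site - the private code, thrown in exactly one place: */
    return -BCH_ERR_ENOMEM_journal_key_insert;

    /* module boundary - convert back to the parent class, which also
     * emits a trace event recording the private code: */
    ret = bch2_err_class(ret);      /* -> -ENOMEM */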

MAINTAINERS

@ -3555,6 +3555,7 @@ R: Brian Foster <bfoster@redhat.com>
L: linux-bcachefs@vger.kernel.org
S: Supported
C: irc://irc.oftc.net/bcache
T: git https://evilpiepirate.org/git/bcachefs.git
F: fs/bcachefs/
BDISP ST MEDIA DRIVER

fs/bcachefs/Makefile

@ -82,6 +82,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
time_stats.o \
thread_with_file.o \
trace.o \
two_state_shared_lock.o \
@ -90,3 +91,6 @@ bcachefs-y := \
xattr.o
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
# Silence "note: xyz changed in GCC X.X" messages
subdir-ccflags-y += $(call cc-disable-warning, psabi)

fs/bcachefs/alloc_background.c

@ -29,6 +29,8 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@ -860,23 +862,28 @@ int bch2_trigger_alloc(struct btree_trans *trans,
*bucket_gen(ca, new.k->p.offset) = new_a->gen;
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
percpu_up_read(&c->mark_lock);
if (new_a->data_type == BCH_DATA_free &&
(!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
if (statechange(a->data_type == BCH_DATA_free) &&
bucket_flushed(new_a))
closure_wake_up(&c->freelist_wait);
if (new_a->data_type == BCH_DATA_need_discard &&
(!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
bch2_do_discards(c);
if (statechange(a->data_type == BCH_DATA_need_discard) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
bucket_flushed(new_a))
bch2_discard_one_bucket_fast(c, new.k->p);
if (old_a->data_type != BCH_DATA_cached &&
new_a->data_type == BCH_DATA_cached &&
if (statechange(a->data_type == BCH_DATA_cached) &&
!bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
bch2_do_invalidates(c);
if (new_a->data_type == BCH_DATA_need_gc_gens)
if (statechange(a->data_type == BCH_DATA_need_gc_gens))
bch2_do_gc_gens(c);
percpu_up_read(&c->mark_lock);
}
if ((flags & BTREE_TRIGGER_GC) &&
@ -1045,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (k.k->type != discard_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(k.k->type != discard_key_type,
c, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@ -1076,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (k.k->type != freespace_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(k.k->type != freespace_key_type,
c, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@ -1108,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (ret)
goto err;
if (a->gen != alloc_gen(k, gens_offset) &&
(c->opts.reconstruct_alloc ||
fsck_err(c, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
c, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_bucket_gens *g =
bch2_trans_kmalloc(trans, sizeof(*g));
@ -1167,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
*end = bkey_min(k.k->p, *end);
if (k.k->type != KEY_TYPE_set &&
(c->opts.reconstruct_alloc ||
fsck_err(c, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
end->offset))) {
if (fsck_err_on(k.k->type != KEY_TYPE_set,
c, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
end->offset)) {
struct bkey_i *update =
bch2_trans_kmalloc(trans, sizeof(*update));
@ -1604,6 +1607,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
return ret;
}
static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
{
int ret;
mutex_lock(&c->discard_buckets_in_flight_lock);
darray_for_each(c->discard_buckets_in_flight, i)
if (bkey_eq(*i, bucket)) {
ret = -EEXIST;
goto out;
}
ret = darray_push(&c->discard_buckets_in_flight, bucket);
out:
mutex_unlock(&c->discard_buckets_in_flight_lock);
return ret;
}
static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
{
mutex_lock(&c->discard_buckets_in_flight_lock);
darray_for_each(c->discard_buckets_in_flight, i)
if (bkey_eq(*i, bucket)) {
darray_remove_item(&c->discard_buckets_in_flight, i);
goto found;
}
BUG();
found:
mutex_unlock(&c->discard_buckets_in_flight_lock);
}
struct discard_buckets_state {
u64 seen;
u64 open;
@ -1642,6 +1675,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
bool discard_locked = false;
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
@ -1709,6 +1743,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
goto out;
discard_locked = true;
if (!bkey_eq(*discard_pos_done, iter.pos) &&
ca->mi.discard && !c->opts.nochanges) {
/*
@ -1740,6 +1779,8 @@ write:
count_event(c, bucket_discard);
s->discarded++;
out:
if (discard_locked)
discard_in_flight_remove(c, iter.pos);
s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
@ -1779,6 +1820,93 @@ void bch2_do_discards(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
int ret = bkey_err(k);
if (ret)
goto err;
struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
a->v.data_type = alloc_data_type(a->v, a->v.data_type);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static void bch2_do_discards_fast_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
while (1) {
bool got_bucket = false;
struct bpos bucket;
struct bch_dev *ca;
mutex_lock(&c->discard_buckets_in_flight_lock);
darray_for_each(c->discard_buckets_in_flight, i) {
if (i->snapshot)
continue;
ca = bch_dev_bkey_exists(c, i->inode);
if (!percpu_ref_tryget(&ca->io_ref)) {
darray_remove_item(&c->discard_buckets_in_flight, i);
continue;
}
got_bucket = true;
bucket = *i;
i->snapshot = true;
break;
}
mutex_unlock(&c->discard_buckets_in_flight_lock);
if (!got_bucket)
break;
if (ca->mi.discard && !c->opts.nochanges)
blkdev_issue_discard(ca->disk_sb.bdev,
bucket.offset * ca->mi.bucket_size,
ca->mi.bucket_size,
GFP_KERNEL);
int ret = bch2_trans_do(c, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc,
bch2_clear_bucket_needs_discard(trans, bucket));
bch_err_fn(c, ret);
percpu_ref_put(&ca->io_ref);
discard_in_flight_remove(c, bucket);
if (ret)
break;
}
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
if (!percpu_ref_is_dying(&ca->io_ref) &&
!discard_in_flight_add(c, bucket) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
!queue_work(c->write_ref_wq, &c->discard_fast_work))
bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bucket(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
@ -2210,9 +2338,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
void bch2_fs_allocator_background_exit(struct bch_fs *c)
{
darray_exit(&c->discard_buckets_in_flight);
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
mutex_init(&c->discard_buckets_in_flight_lock);
INIT_WORK(&c->discard_work, bch2_do_discards_work);
INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}

fs/bcachefs/alloc_background.h

@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_fs_allocator_background_exit(struct bch_fs *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */

fs/bcachefs/alloc_foreground.c

@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
&c->blocked_allocate_open_bucket, true);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
&c->blocked_allocate_open_bucket, false);
track_event_change(&c->times[BCH_TIME_blocked_allocate],
&c->blocked_allocate, false);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
spin_unlock(&c->freelist_lock);
return ob;
@ -555,8 +551,7 @@ again:
goto again;
}
track_event_change(&c->times[BCH_TIME_blocked_allocate],
&c->blocked_allocate, true);
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;

fs/bcachefs/backpointers.c

@ -131,8 +131,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
printbuf_exit(&buf);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
bch2_inconsistent_error(c);
return -EIO;
return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
} else {
return 0;
}
@ -478,8 +477,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
if (c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@ -555,60 +553,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
};
}
static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
u64 mem_bytes;
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
u64 mem_bytes = i.totalram * i.mem_unit;
return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
}
static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
{
return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
unsigned btree_leaf_mask,
unsigned btree_interior_mask,
u64 btree_leaf_mask,
u64 btree_interior_mask,
struct bbpos start, struct bbpos *end)
{
struct btree_iter iter;
struct bkey_s_c k;
size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
enum btree_id btree;
struct bch_fs *c = trans->c;
s64 mem_may_pin = mem_may_pin_bytes(c);
int ret = 0;
for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
btree_interior_mask |= btree_leaf_mask;
c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask;
c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask;
c->btree_cache.pinned_nodes_start = start;
c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX;
for (enum btree_id btree = start.btree;
btree < BTREE_ID_NR && !ret;
btree++) {
unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
struct btree_iter iter;
struct btree *b;
if (!((1U << btree) & btree_leaf_mask) &&
!((1U << btree) & btree_interior_mask))
continue;
bch2_trans_node_iter_init(trans, &iter, btree,
btree == start.btree ? start.pos : POS_MIN,
0, depth, 0);
/*
* for_each_btree_key_continue() doesn't check the return value
* from bch2_btree_iter_advance(), which is needed when
* iterating over interior nodes where we'll see keys at
* SPOS_MAX:
*/
do {
k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
ret = bkey_err(k);
if (!k.k || ret)
break;
--btree_nodes;
if (!btree_nodes) {
*end = BBPOS(btree, k.k->p);
__for_each_btree_node(trans, iter, btree,
btree == start.btree ? start.pos : POS_MIN,
0, depth, BTREE_ITER_PREFETCH, b, ret) {
mem_may_pin -= btree_buf_bytes(b);
if (mem_may_pin <= 0) {
c->btree_cache.pinned_nodes_end = *end =
BBPOS(btree, b->key.k.p);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
} while (bch2_btree_iter_advance(&iter));
}
bch2_trans_iter_exit(trans, &iter);
}
*end = BBPOS_MAX;
return ret;
}
@ -666,62 +665,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
return 0;
}
static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
struct bpos bucket)
{
return bch2_dev_exists2(c, bucket.inode)
? bucket_pos_to_bp(c, bucket, 0)
: bucket;
}
static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
struct bpos start, struct bpos *end)
{
struct btree_iter alloc_iter;
struct btree_iter bp_iter;
struct bkey_s_c alloc_k, bp_k;
size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
bool alloc_end = false, bp_end = false;
int ret = 0;
bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
start, 0, 1, 0);
bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
while (1) {
alloc_k = !alloc_end
? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
: bkey_s_c_null;
bp_k = !bp_end
? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
: bkey_s_c_null;
ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
if ((!alloc_k.k && !bp_k.k) || ret) {
*end = SPOS_MAX;
break;
}
--btree_nodes;
if (!btree_nodes) {
*end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
break;
}
if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
if (!bch2_btree_iter_advance(&alloc_iter))
alloc_end = true;
} else {
if (!bch2_btree_iter_advance(&bp_iter))
bp_end = true;
}
}
bch2_trans_iter_exit(trans, &bp_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
return ret;
}
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
@ -732,10 +675,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bkey_init(&s.last_flushed.k->k);
while (1) {
ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
struct bbpos end;
ret = bch2_get_btree_in_memory_pos(trans,
BIT_ULL(BTREE_ID_backpointers),
BIT_ULL(BTREE_ID_backpointers),
BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
if (ret)
break;
s.bucket_end = end.pos;
if ( bpos_eq(s.bucket_start, POS_MIN) &&
!bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@ -763,6 +712,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch_err_fn(c, ret);
return ret;
}
@ -868,6 +820,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
c->btree_cache.pinned_nodes_leaf_mask = 0;
c->btree_cache.pinned_nodes_interior_mask = 0;
bch_err_fn(c, ret);
return ret;
}

fs/bcachefs/bbpos_types.h

@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */

fs/bcachefs/bcachefs.h

@ -212,6 +212,7 @@
#include "recovery_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "time_stats.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
@ -265,6 +266,9 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
__printf(2, 3)
void __bch2_print(struct bch_fs *c, const char *fmt, ...);
@ -504,6 +508,7 @@ enum gc_phase {
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
GC_PHASE_BTREE_subvolume_children,
GC_PHASE_PENDING_DELETE,
};
@ -593,7 +598,7 @@ struct bch_dev {
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
struct bch2_time_stats io_latency[2];
struct bch2_time_stats_quantiles io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
@ -663,6 +668,8 @@ struct journal_seq_blacklist_table {
};
struct journal_keys {
/* must match layout in darray_types.h */
size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
@ -671,15 +678,13 @@ struct journal_keys {
bool allocated;
bool overwritten;
struct bkey_i *k;
} *d;
} *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
size_t nr;
size_t size;
atomic_t ref;
bool initial_ref_held;
};
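
The gap-buffer comment above is the key idea behind journal key insertion. A
standalone sketch of the move operation, written against plain C arrays rather
than the bcachefs darray helpers (the in-tree move_gap() takes the journal_keys
itself; the semantics assumed here just follow the comment):

    /* d[] holds nr elements in a buffer of 'size' slots; the size - nr
     * unused slots sit contiguously at index *gap. Moving the gap to the
     * insertion point first makes a run of sequential inserts O(n) total
     * instead of O(n^2). memmove() is from <linux/string.h>. */
    static void move_gap_sketch(struct journal_key *d, size_t nr,
                                size_t size, size_t *gap, size_t new_gap)
    {
            size_t gap_len = size - nr;

            if (*gap < new_gap)     /* slide keys left, across the gap */
                    memmove(d + *gap, d + *gap + gap_len,
                            (new_gap - *gap) * sizeof(*d));
            else                    /* slide keys right, across the gap */
                    memmove(d + new_gap + gap_len, d + new_gap,
                            (*gap - new_gap) * sizeof(*d));
            *gap = new_gap;
    }
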
@ -703,6 +708,7 @@ struct btree_trans_buf {
x(reflink) \
x(fallocate) \
x(discard) \
x(discard_fast) \
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
@ -919,8 +925,6 @@ struct bch_fs {
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
u64 blocked_allocate;
u64 blocked_allocate_open_bucket;
open_bucket_idx_t open_buckets_freelist;
open_bucket_idx_t open_buckets_nr_free;
@ -940,8 +944,11 @@ struct bch_fs {
unsigned write_points_nr;
struct buckets_waiting_for_journal buckets_waiting_for_journal;
struct work_struct discard_work;
struct work_struct invalidate_work;
struct work_struct discard_work;
struct mutex discard_buckets_in_flight_lock;
DARRAY(struct bpos) discard_buckets_in_flight;
struct work_struct discard_fast_work;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;

fs/bcachefs/bcachefs_format.h

@ -189,7 +189,11 @@ struct bversion {
__u32 hi;
__u64 lo;
#endif
} __packed __aligned(4);
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;
struct bkey {
/* Size of combined key and value, in u64s */
@ -222,7 +226,36 @@ struct bkey {
__u8 pad[1];
#endif
} __packed __aligned(8);
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
/*
* The big-endian version of bkey can't be compiled by rustc with the "aligned"
* attr since it doesn't allow types to have both "packed" and "aligned" attrs.
* So for Rust compatibility, don't include this. It can be included in the LE
* version because the "packed" attr is redundant in that case.
*
* History: (quoting Kent)
*
 * Specifically, when I was designing bkey, I wanted the header to be no
 * bigger than necessary so that bkey_packed could use the rest. That means that
 * decently often extent keys will fit into only 8 bytes, instead of spilling over
* to 16.
*
* But packed_bkey treats the part after the header - the packed section -
* as a single multi word, variable length integer. And bkey, the unpacked
* version, is just a special case version of a bkey_packed; all the packed
* bkey code will work on keys in any packed format, the in-memory
* representation of an unpacked key also is just one type of packed key...
*
 * So that constrains the key part of a big endian bkey to start right
* after the header.
*
 * If we ever do a bkey_v2 and need to expand the header by another byte for
* some reason - that will clean up this wart.
*/
__aligned(8)
#endif
;
struct bkey_packed {
__u64 _data[0];
@ -840,7 +873,9 @@ struct bch_sb_field_downgrade {
x(snapshot_skiplists, BCH_VERSION(1, 1)) \
x(deleted_inodes, BCH_VERSION(1, 2)) \
x(rebalance_work, BCH_VERSION(1, 3)) \
x(member_seq, BCH_VERSION(1, 4))
x(member_seq, BCH_VERSION(1, 4)) \
x(subvolume_fs_parent, BCH_VERSION(1, 5)) \
x(btree_subvolume_children, BCH_VERSION(1, 6))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -1275,7 +1310,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10) \
x(write_buffer_keys, 11)
x(write_buffer_keys, 11) \
x(datetime, 12)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -1376,6 +1412,11 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);
struct jset_entry_datetime {
struct jset_entry entry;
__le64 seconds;
} __packed __aligned(8);
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@ -1482,7 +1523,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,

fs/bcachefs/bkey.h

@ -4,7 +4,7 @@
#include <linux/bug.h>
#include "bcachefs_format.h"
#include "bkey_types.h"
#include "btree_types.h"
#include "util.h"
#include "vstructs.h"
@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
const struct bkey_packed *);
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
const struct bch_val *v;
};
/* bkey with split value */
struct bkey_s {
union {
struct {
struct bkey *k;
struct bch_val *v;
};
struct bkey_s_c s_c;
};
};
#define bkey_p_next(_k) vstruct_next(_k)
static inline struct bkey_i *bkey_next(struct bkey_i *k)
{
return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
{
return bkey_val_u64s(k) * sizeof(u64);
}
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
unsigned u64s = BKEY_U64s + val_u64s;
BUG_ON(u64s > U8_MAX);
k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
EBUG_ON(k->u64s < ret);
return ret;
return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
#define bkey_s_null ((struct bkey_s) { .k = NULL })
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
static inline struct bkey_s bkey_to_s(struct bkey *k)
{
return (struct bkey_s) { .k = k, .v = NULL };
}
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
{
return (struct bkey_s_c) { .k = k, .v = NULL };
}
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
{
return (struct bkey_s) { .k = &k->k, .v = &k->v };
}
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
{
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
}
/*
* For a given type of value (e.g. struct bch_extent), generates the types for
* bkey + bch_extent - inline, split, split const - and also all the conversion
* functions, which also check that the value is of the correct type.
*
* We use anonymous unions for upcasting - e.g. converting from e.g. a
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
#define x(name, ...) \
struct bkey_i_##name { \
union { \
struct bkey k; \
struct bkey_i k_i; \
}; \
struct bch_##name v; \
}; \
\
struct bkey_s_c_##name { \
union { \
struct { \
const struct bkey *k; \
const struct bch_##name *v; \
}; \
struct bkey_s_c s_c; \
}; \
}; \
\
struct bkey_s_##name { \
union { \
struct { \
struct bkey *k; \
struct bch_##name *v; \
}; \
struct bkey_s_c_##name c; \
struct bkey_s s; \
struct bkey_s_c s_c; \
}; \
}; \
\
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline const struct bkey_i_##name * \
bkey_i_to_##name##_c(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
{ \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_c_##name \
name##_i_to_s_c(const struct bkey_i_##name *k) \
{ \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name \
bkey_i_to_s_c_##name(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
{ \
struct bkey_i_##name *k = \
container_of(&_k->k, struct bkey_i_##name, k); \
\
bkey_init(&k->k); \
memset(&k->v, 0, sizeof(k->v)); \
k->k.type = KEY_TYPE_##name; \
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
\
return k; \
}
BCH_BKEY_TYPES();
#undef x
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__

fs/bcachefs/bkey_types.h (new file, 213 lines)

@ -0,0 +1,213 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_TYPES_H
#define _BCACHEFS_BKEY_TYPES_H
#include "bcachefs_format.h"
/*
* bkey_i - bkey with inline value
* bkey_s - bkey with split value
* bkey_s_c - bkey with split value, const
*/
#define bkey_p_next(_k) vstruct_next(_k)
static inline struct bkey_i *bkey_next(struct bkey_i *k)
{
return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
}
#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
static inline size_t bkey_val_bytes(const struct bkey *k)
{
return bkey_val_u64s(k) * sizeof(u64);
}
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
unsigned u64s = BKEY_U64s + val_u64s;
BUG_ON(u64s > U8_MAX);
k->u64s = u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
}
#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
const struct bch_val *v;
};
/* bkey with split value */
struct bkey_s {
union {
struct {
struct bkey *k;
struct bch_val *v;
};
struct bkey_s_c s_c;
};
};
#define bkey_s_null ((struct bkey_s) { .k = NULL })
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
static inline struct bkey_s bkey_to_s(struct bkey *k)
{
return (struct bkey_s) { .k = k, .v = NULL };
}
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
{
return (struct bkey_s_c) { .k = k, .v = NULL };
}
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
{
return (struct bkey_s) { .k = &k->k, .v = &k->v };
}
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
{
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
}
/*
* For a given type of value (e.g. struct bch_extent), generates the types for
* bkey + bch_extent - inline, split, split const - and also all the conversion
* functions, which also check that the value is of the correct type.
*
* We use anonymous unions for upcasting - e.g. converting from e.g. a
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
#define x(name, ...) \
struct bkey_i_##name { \
union { \
struct bkey k; \
struct bkey_i k_i; \
}; \
struct bch_##name v; \
}; \
\
struct bkey_s_c_##name { \
union { \
struct { \
const struct bkey *k; \
const struct bch_##name *v; \
}; \
struct bkey_s_c s_c; \
}; \
}; \
\
struct bkey_s_##name { \
union { \
struct { \
struct bkey *k; \
struct bch_##name *v; \
}; \
struct bkey_s_c_##name c; \
struct bkey_s s; \
struct bkey_s_c s_c; \
}; \
}; \
\
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline const struct bkey_i_##name * \
bkey_i_to_##name##_c(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
{ \
EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
{ \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_c_##name \
name##_i_to_s_c(const struct bkey_i_##name *k) \
{ \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name \
bkey_i_to_s_c_##name(const struct bkey_i *k) \
{ \
EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
{ \
struct bkey_i_##name *k = \
container_of(&_k->k, struct bkey_i_##name, k); \
\
bkey_init(&k->k); \
memset(&k->v, 0, sizeof(k->v)); \
k->k.type = KEY_TYPE_##name; \
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
\
return k; \
}
BCH_BKEY_TYPES();
#undef x
#endif /* _BCACHEFS_BKEY_TYPES_H */
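
For orientation, this is roughly what the x-macro above generates for a single
BCH_BKEY_TYPES() entry - take x(cookie) - trimmed to the two most-used pieces
(a sketch of the expansion, not extra code in the file):

    struct bkey_i_cookie {                  /* bkey + inline value */
            union {
                    struct bkey     k;
                    struct bkey_i   k_i;
            };
            struct bch_cookie v;
    };

    /* checked downcast; EBUG_ON fires on debug builds if the key is not
     * actually a cookie */
    static inline struct bkey_s_c_cookie bkey_s_c_to_cookie(struct bkey_s_c k)
    {
            EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_cookie);
            return (struct bkey_s_c_cookie) {
                    .k = k.k,
                    .v = container_of(k.v, struct bch_cookie, v),
            };
    }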

fs/bcachefs/btree_cache.c

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
kvpfree(b->data, btree_buf_bytes(b));
kvfree(b->data);
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
b->data = kvpmalloc(btree_buf_bytes(b), gfp);
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
kvpfree(b->data, btree_buf_bytes(b));
kvfree(b->data);
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
int ret = 0;
lockdep_assert_held(&bc->lock);
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
u64 mask = b->c.level
? bc->pinned_nodes_interior_mask
: bc->pinned_nodes_leaf_mask;
if ((mask & BIT_ULL(b->c.btree_id)) &&
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
return -BCH_ERR_ENOMEM_btree_node_reclaim;
wait_on_io:
if (b->flags & ((1U << BTREE_NODE_dirty)|
(1U << BTREE_NODE_read_in_flight)|
@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
kvpfree(c->verify_ondisk, c->opts.btree_node_size);
kvfree(c->verify_ondisk);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@ -711,6 +724,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b = bch2_btree_node_mem_alloc(trans, level != 0);
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
if (!path)
return b;
trans->memory_allocation_failure = true;
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@ -760,8 +776,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
}
if (!six_relock_type(&b->c.lock, lock_type, seq)) {
if (path)
trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
BUG_ON(!path);
trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
}
@ -901,7 +918,7 @@ retry:
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
return ERR_PTR(-EIO);
return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@ -992,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->c.lock, lock_type);
return ERR_PTR(-EIO);
return ERR_PTR(-BCH_ERR_btree_node_read_error);
}
EBUG_ON(b->c.btree_id != path->btree_id);
@ -1075,7 +1092,7 @@ lock_node:
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
b = ERR_PTR(-EIO);
b = ERR_PTR(-BCH_ERR_btree_node_read_error);
goto out;
}
@ -1096,7 +1113,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
BUG_ON(trans && !btree_node_locked(path, level + 1));
BUG_ON(path && !btree_node_locked(path, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = btree_cache_find(bc, k);

fs/bcachefs/btree_gc.c

@ -389,7 +389,8 @@ again:
have_child = dropped_children = false;
bch2_bkey_buf_init(&prev_k);
bch2_bkey_buf_init(&cur_k);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@ -406,7 +407,7 @@ again:
printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(ret == -EIO, c,
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
@ -478,7 +479,8 @@ again:
goto err;
bch2_btree_and_journal_iter_exit(&iter);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_bkey_buf_reassemble(&cur_k, c, k);
@ -591,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
if (!g->gen_valid &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (fsck_err_on(!g->gen_valid,
c, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@ -609,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (!p.ptr.cached) {
g->gen_valid = true;
g->gen = p.ptr.gen;
@ -631,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
}
}
if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
(c->opts.reconstruct_alloc ||
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
c, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
do_update = true;
if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
@ -931,7 +929,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct printbuf buf = PRINTBUF;
int ret = 0;
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
bkey_init(&prev.k->k);
@ -963,7 +961,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (b->c.level > target_depth) {
bch2_btree_and_journal_iter_exit(&iter);
bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
iter.prefetch = true;
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
struct btree *child;
@ -976,7 +975,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
false);
ret = PTR_ERR_OR_ZERO(child);
if (ret == -EIO) {
if (bch2_err_matches(ret, EIO)) {
bch2_topology_error(c);
if (__fsck_err(c,
@ -1190,9 +1189,7 @@ static void bch2_gc_free(struct bch_fs *c)
genradix_free(&c->gc_stripes);
for_each_member_device(c, ca) {
kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
ca->buckets_gc = NULL;
free_percpu(ca->usage_gc);
@ -1365,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
struct bucket gc, *b;
struct bucket old_gc, gc, *b;
struct bkey_i_alloc_v4 *a;
struct bch_alloc_v4 old_convert, new;
const struct bch_alloc_v4 *old;
enum bch_data_type type;
int ret;
old = bch2_alloc_to_v4(k, &old_convert);
@ -1377,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
b = gc_bucket(ca, iter->pos.offset);
old_gc = *b;
if ((old->data_type == BCH_DATA_sb ||
old->data_type == BCH_DATA_journal) &&
!bch2_dev_is_online(ca)) {
b->data_type = old->data_type;
b->dirty_sectors = old->dirty_sectors;
}
/*
* b->data_type doesn't yet include need_discard & need_gc_gen states -
* fix that here:
*/
type = __alloc_data_type(b->dirty_sectors,
b->cached_sectors,
b->stripe,
*old,
b->data_type);
if (b->data_type != type) {
struct bch_dev_usage *u;
preempt_disable();
u = this_cpu_ptr(ca->usage_gc);
u->d[b->data_type].buckets--;
b->data_type = type;
u->d[b->data_type].buckets++;
preempt_enable();
}
b->data_type = __alloc_data_type(b->dirty_sectors,
b->cached_sectors,
b->stripe,
*old,
b->data_type);
gc = *b;
percpu_up_read(&c->mark_lock);
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors)
bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
if (metadata_only &&
gc.data_type != BCH_DATA_sb &&
gc.data_type != BCH_DATA_journal &&
@ -1410,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (gen_after(old->gen, gc.gen))
return 0;
if (c->opts.reconstruct_alloc ||
fsck_err_on(new.data_type != gc.data_type, c,
if (fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
@ -1422,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
if (c->opts.reconstruct_alloc || \
fsck_err_on(new._f != gc._f, c, _errtype, \
if (fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@ -1491,7 +1486,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
for_each_member_device(c, ca) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
@ -1585,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
" should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@ -1595,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
new->k.type = KEY_TYPE_deleted;
else
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
ret = bch2_trans_update(trans, iter, new, 0);
}
fsck_err:
printbuf_exit(&buf);
@ -1817,10 +1812,10 @@ out:
if (!ret) {
bch2_journal_block(&c->journal);
ret = bch2_gc_stripes_done(c, metadata_only) ?:
bch2_gc_reflink_done(c, metadata_only) ?:
bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only);
ret = bch2_gc_alloc_done(c, metadata_only) ?:
bch2_gc_done(c, initial, metadata_only) ?:
bch2_gc_stripes_done(c, metadata_only) ?:
bch2_gc_reflink_done(c, metadata_only);
bch2_journal_unblock(&c->journal);
}

fs/bcachefs/btree_io.c

@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
if (used_mempool)
mempool_free(p, &c->btree_bounce_pool);
else
vpfree(p, size);
kvfree(p);
}
static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
if (!p) {
*used_mempool = true;
p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@ -581,8 +581,7 @@ static int __btree_err(int ret,
break;
case -BCH_ERR_btree_node_read_err_bad_node:
bch2_print_string_as_lines(KERN_ERR, out.buf);
bch2_topology_error(c);
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
ret = bch2_topology_error(c);
break;
case -BCH_ERR_btree_node_read_err_incompatible:
bch2_print_string_as_lines(KERN_ERR, out.buf);
@ -840,6 +839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
if (k->format > KEY_FORMAT_CURRENT)
return false;
if (k->u64s < bkeyp_key_u64s(&b->format, k))
return false;
struct printbuf buf = PRINTBUF;
struct bkey tmp;
struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@ -881,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
"invalid bkey format %u", k->format))
goto drop_this_key;
/* XXX: validate k->u64s */
if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_u64s,
"k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
goto drop_this_key;
if (!write)
bch2_bkey_compat(b->c.level, b->c.btree_id, version,
BSET_BIG_ENDIAN(i), write,
@ -1737,7 +1745,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
list_move(&b->list, &c->btree_cache.freeable);
mutex_unlock(&c->btree_cache.lock);
ret = -EIO;
ret = -BCH_ERR_btree_node_read_error;
goto err;
}
@ -1841,7 +1849,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
ret = -BCH_ERR_btree_write_all_failed;
ret = -BCH_ERR_btree_node_write_all_failed;
goto err;
}

fs/bcachefs/btree_iter.c

@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct bkey_s_c k;
int ret = 0;
__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
path = &trans->paths[path_idx];
if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
goto out_uptodate;
path->level = btree_path_up_until_good_node(trans, path, 0);
@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
}
out_uptodate:
path->uptodate = BTREE_ITER_UPTODATE;
out:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
nr * sizeof(btree_path_idx_t) + 8 +
@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
btree_path_set_should_be_locked(trans->paths + iter->path);
struct btree_path *path = btree_iter_path(trans, iter);
if (btree_path_node(path, path->level))
btree_path_set_should_be_locked(path);
return 0;
}
@ -2305,7 +2307,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
return bkey_s_c_err(-EIO);
return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@ -2503,6 +2505,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = bch2_btree_iter_peek_upto(&iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
iter->k = iter2.k;
k.k = &iter->k;
}
@ -2762,6 +2765,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
struct btree_trans *trans = src->trans;
*dst = *src;
#ifdef TRACK_PATH_ALLOCATED
dst->ip_allocated = _RET_IP_;
#endif
if (src->path)
__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
@ -3085,7 +3091,7 @@ void bch2_trans_put(struct btree_trans *trans)
trans->paths = NULL;
if (paths_allocated != trans->_paths_allocated)
kfree_rcu_mightsleep(paths_allocated);
kvfree_rcu_mightsleep(paths_allocated);
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);

fs/bcachefs/btree_journal_iter.c

@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "journal_io.h"
@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
{
return keys->d + idx_to_pos(keys, idx);
return keys->data + idx_to_pos(keys, idx);
}
static size_t __bch2_journal_key_search(struct journal_keys *keys,
@ -180,10 +182,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
if (keys->d[idx].allocated)
kfree(keys->d[idx].k);
keys->d[idx] = n;
journal_key_cmp(&n, &keys->data[idx]) == 0) {
if (keys->data[idx].allocated)
kfree(keys->data[idx].k);
keys->data[idx] = n;
return 0;
}
@ -196,17 +198,17 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.size = max_t(size_t, keys->size, 8) * 2,
};
new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
if (!new_keys.d) {
new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
return -BCH_ERR_ENOMEM_journal_key_insert;
}
/* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
keys->d = new_keys.d;
memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
kvfree(keys->data);
keys->data = new_keys.data;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
@ -216,11 +218,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
journal_iters_move_gap(c, keys->gap, idx);
move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
keys->gap = idx;
move_gap(keys, idx);
keys->nr++;
keys->d[keys->gap++] = n;
keys->data[keys->gap++] = n;
journal_iters_fix(c);
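The journal keys array is maintained as a gap buffer: live entries occupy both ends of the allocation, and the unused middle region is slid to the insertion point, so runs of nearby inserts stay cheap after the first. A stand-alone sketch of the idea - field and function names here are illustrative, not the bcachefs helpers:

	struct gap_buf {
		struct journal_key	*data;
		size_t			nr;	/* live entries */
		size_t			size;	/* allocated entries */
		size_t			gap;	/* start of the unused middle */
	};

	static void move_gap_sketch(struct gap_buf *b, size_t dst)
	{
		size_t gap_len = b->size - b->nr;

		if (dst < b->gap)	/* slide entries right, across the gap */
			memmove(b->data + dst + gap_len, b->data + dst,
				(b->gap - dst) * sizeof(*b->data));
		else			/* slide entries left, into the gap */
			memmove(b->data + b->gap, b->data + b->gap + gap_len,
				(dst - b->gap) * sizeof(*b->data));
		b->gap = dst;
	}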
@ -267,10 +268,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->size &&
keys->d[idx].btree_id == btree &&
keys->d[idx].level == level &&
bpos_eq(keys->d[idx].k->k.p, pos))
keys->d[idx].overwritten = true;
keys->data[idx].btree_id == btree &&
keys->data[idx].level == level &&
bpos_eq(keys->data[idx].k->k.p, pos))
keys->data[idx].overwritten = true;
}
static void bch2_journal_iter_advance(struct journal_iter *iter)
@ -284,16 +285,16 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct journal_key *k = iter->keys->d + iter->idx;
struct journal_key *k = iter->keys->data + iter->idx;
while (k < iter->keys->d + iter->keys->size &&
while (k < iter->keys->data + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
k = iter->keys->d + iter->idx;
k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
@ -334,9 +335,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
iter->pos = bpos_successor(iter->pos);
}
static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
{
struct btree_and_journal_iter iter = *_iter;
struct bch_fs *c = iter.trans->c;
unsigned level = iter.journal.level;
struct bkey_buf tmp;
unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (level > 1 ? 0 : 2)
: (level > 1 ? 1 : 16);
iter.prefetch = false;
bch2_bkey_buf_init(&tmp);
while (nr--) {
bch2_btree_and_journal_iter_advance(&iter);
struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
if (!k.k)
break;
bch2_bkey_buf_reassemble(&tmp, c, k);
bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
}
bch2_bkey_buf_exit(&tmp, c);
}
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
struct bkey_s_c btree_k, journal_k, ret;
if (iter->prefetch && iter->journal.level)
btree_and_journal_iter_prefetch(iter);
again:
if (iter->at_end)
return bkey_s_c_null;
@ -376,17 +406,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c,
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b,
struct btree_node_iter node_iter,
struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
@ -396,15 +427,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter
* this version is used by btree_gc before the filesystem has gone RW and
* multithreaded, so it uses the journal_iters list:
*/
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
struct bch_fs *c,
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
struct btree_and_journal_iter *iter,
struct btree *b)
{
struct btree_node_iter node_iter;
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
list_add(&iter->journal.list, &c->journal_iters);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@ -415,9 +446,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
struct genradix_iter iter;
genradix_for_each(&c->journal_entries, iter, i)
if (*i)
kvpfree(*i, offsetof(struct journal_replay, j) +
vstruct_bytes(&(*i)->j));
kvfree(*i);
genradix_free(&c->journal_entries);
}
@ -437,22 +466,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
move_gap(keys, keys->nr);
for (i = keys->d; i < keys->d + keys->nr; i++)
darray_for_each(*keys, i)
if (i->allocated)
kfree(i->k);
kvfree(keys->d);
keys->d = NULL;
kvfree(keys->data);
keys->data = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
@ -460,83 +487,38 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
struct journal_key *src, *dst;
sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
struct journal_key *dst = keys->data;
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
!journal_key_cmp(src, src + 1))
src++;
darray_for_each(*keys, src) {
if (src + 1 < &darray_top(*keys) &&
!journal_key_cmp(src, src + 1))
continue;
*dst++ = *src++;
*dst++ = *src;
}
keys->nr = dst - keys->d;
keys->nr = dst - keys->data;
}
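The compaction above keeps the last element of each run of equal keys; assuming journal_sort_key_cmp() breaks ties by journal sequence (oldest first), that is exactly the newest version of each key. Reduced to a self-contained sketch:

	/* dedup sorted input, keeping the last (newest) of each run: */
	static size_t dedup_keep_newest(int *v, size_t nr)
	{
		size_t dst = 0;

		for (size_t src = 0; src < nr; src++) {
			if (src + 1 < nr && v[src] == v[src + 1])
				continue;	/* a newer duplicate follows */
			v[dst++] = v[src];
		}
		return dst;
	}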
int bch2_journal_keys_sort(struct bch_fs *c)
{
struct genradix_iter iter;
struct journal_replay *i, **_i;
struct jset_entry *entry;
struct bkey_i *k;
struct journal_keys *keys = &c->journal_keys;
size_t nr_keys = 0, nr_read = 0;
size_t nr_read = 0;
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
continue;
for_each_jset_key(k, entry, &i->j)
nr_keys++;
}
if (!nr_keys)
return 0;
keys->size = roundup_pow_of_two(nr_keys);
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
if (!keys->d) {
bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
nr_keys);
do {
keys->size >>= 1;
keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
} while (!keys->d && keys->size > nr_keys / 8);
if (!keys->d) {
bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
keys->size);
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
}
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
cond_resched();
for_each_jset_key(k, entry, &i->j) {
if (keys->nr == keys->size) {
__journal_keys_sort(keys);
if (keys->nr > keys->size * 7 / 8) {
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
keys->nr, keys->size, nr_read, nr_keys);
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
}
keys->d[keys->nr++] = (struct journal_key) {
struct journal_key n = (struct journal_key) {
.btree_id = entry->btree_id,
.level = entry->level,
.k = k,
@ -544,6 +526,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
.journal_offset = k->_data - i->j._data,
};
if (darray_push(keys, n)) {
__journal_keys_sort(keys);
if (keys->nr * 8 > keys->size * 7) {
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
return -BCH_ERR_ENOMEM_journal_keys_sort;
}
BUG_ON(darray_push(keys, n));
}
nr_read++;
}
}
@ -551,6 +545,6 @@ int bch2_journal_keys_sort(struct bch_fs *c)
__journal_keys_sort(keys);
keys->gap = keys->nr;
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
return 0;
}

View File

@ -15,6 +15,7 @@ struct journal_iter {
*/
struct btree_and_journal_iter {
struct btree_trans *trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey unpacked;
@ -22,6 +23,7 @@ struct btree_and_journal_iter {
struct journal_iter journal;
struct bpos pos;
bool at_end;
bool prefetch;
};
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);
int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
struct btree_and_journal_iter *);
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *, struct btree *,
void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
struct btree_and_journal_iter *, struct btree *,
struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
struct btree_and_journal_iter *, struct btree *);
void bch2_journal_keys_put(struct bch_fs *);

View File

@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
struct bkey_i *new_k = NULL;
int ret;
k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
BTREE_ITER_KEY_CACHE_FILL|
BTREE_ITER_CACHED_NOFILL);
bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
BTREE_ITER_KEY_CACHE_FILL|
BTREE_ITER_CACHED_NOFILL);
iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;

View File

@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans)
return;
trans_for_each_path(trans, path, i)
bch2_btree_path_downgrade(trans, path);
if (path->ref)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)

View File

@ -5,6 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
#include "bbpos_types.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
@ -173,6 +174,11 @@ struct btree_cache {
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
struct bbpos pinned_nodes_start;
struct bbpos pinned_nodes_end;
u64 pinned_nodes_leaf_mask;
u64 pinned_nodes_interior_mask;
};
struct btree_node_iter {
@ -654,6 +660,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_subvolumes)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
@ -727,7 +734,7 @@ struct btree_root {
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
s8 error;
s16 error;
};
enum btree_gc_coalesce_fail_reason {

View File

@ -452,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
* the key cache - but the key has to exist in the btree for that to
* work:
*/
if (path->cached && bkey_deleted(&i->old_k))
if (path->cached && !i->old_btree_u64s)
return flush_new_cached_update(trans, i, flags, ip);
return 0;
@ -787,6 +787,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
int ret = PTR_ERR_OR_ZERO(k);
if (ret)
return ret;
bkey_init(&k->k);
k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
k->k.p = pos;
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_trans_update(trans, &iter, k, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
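bch2_btree_bit_mod() treats a btree as a bitmap: set inserts a KEY_TYPE_set key at the position, clear inserts a deletion. A hypothetical caller (this helper does not exist in the tree) flagging an extent in the rebalance_work btree would reduce to:

	static int mark_rebalance_work(struct btree_trans *trans,
				       struct bpos pos, bool set)
	{
		/* _buffered defers the update through the btree write buffer */
		return bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
						   pos, set);
	}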
int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
struct bkey_i k;

View File

@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, unsigned, u64 *);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
enum btree_id btree, struct bpos pos)
{
return bch2_btree_bit_mod(trans, btree, pos, false);
return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,

View File

@ -25,8 +25,7 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *,
struct keylist *, unsigned);
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
BUG_ON(btree_node_root(c, b) &&
(b->c.level < btree_node_root(c, b)->c.level ||
!btree_node_dying(btree_node_root(c, b))));
bch2_btree_id_root(c, b->c.btree_id)->b = b;
mutex_unlock(&c->btree_root_lock);
@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as,
static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path, struct btree *b,
struct keylist *keys, unsigned flags)
struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree *parent = btree_node_parent(trans->paths + path, b);
@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (parent) {
/* Split a non root node */
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err;
} else if (n3) {
@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
* @flags: transaction commit flags
*
* Returns: 0 on success, typically transaction restart error on failure
*
@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx, struct btree *b,
struct keylist *keys, unsigned flags)
struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
@ -1739,7 +1733,7 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
return btree_split(as, trans, path_idx, b, keys, flags);
return btree_split(as, trans, path_idx, b, keys);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
unsigned flags)
{
/* btree_split & merge may both cause paths array to be reallocated */
struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
if (IS_ERR(as))
return PTR_ERR(as);
ret = btree_split(as, trans, path, b, NULL, flags);
ret = btree_split(as, trans, path, b, NULL);
if (ret) {
bch2_btree_update_free(as, trans);
return ret;
@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
return ret;
}
static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
btree_path_idx_t path_idx)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
BUG_ON(!btree_node_locked(path, b->c.level));
n = __btree_root_alloc(as, trans, b->c.level + 1);
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
path->locks_want++;
BUG_ON(btree_node_locked(path, n->c.level));
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
bch2_btree_path_level_init(trans, path, n);
n->sib_u64s[0] = U16_MAX;
n->sib_u64s[1] = U16_MAX;
bch2_keylist_add(&as->parent_keys, &b->key);
btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
bch2_btree_set_root(as, trans, path, n);
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
bch2_trans_node_add(trans, path, n);
six_unlock_intent(&n->c.lock);
mutex_lock(&c->btree_cache.lock);
list_add_tail(&b->list, &c->btree_cache.live);
mutex_unlock(&c->btree_cache.lock);
bch2_trans_verify_locks(trans);
}
int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
struct btree_update *as =
bch2_btree_update_start(trans, trans->paths + path,
b->c.level, true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
__btree_increase_depth(as, trans, path);
bch2_btree_update_done(as, trans);
return 0;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
btree_path_idx_t path,
unsigned level,
@ -1845,8 +1892,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
__func__, buf1.buf, buf2.buf);
printbuf_exit(&buf1);
printbuf_exit(&buf2);
bch2_topology_error(c);
ret = -EIO;
ret = bch2_topology_error(c);
goto err;
}
@ -1916,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_trans_verify_paths(trans);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
if (ret)
goto err_free_update;
@ -1987,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
ret = bch2_btree_insert_node(as, trans, iter->path,
parent, &as->parent_keys, flags);
ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
if (ret)
goto err;
} else {
@ -2485,7 +2530,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;

View File

@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);

View File

@ -574,8 +574,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
struct jset_entry *entry;
struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
@ -590,7 +588,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
entry->type = BCH_JSET_ENTRY_btree_keys;
}
spin_lock(&c->journal.lock);
buf->need_flush_to_write_buffer = false;
spin_unlock(&c->journal.lock);
out:
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;

View File

@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
new.k->p, mod > 0);
if (ret)
return ret;
}
@ -1335,7 +1336,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
struct bucket_gens *buckets =
container_of(rcu, struct bucket_gens, rcu);
kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
kvfree(buckets);
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@ -1345,16 +1346,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bool resize = ca->bucket_gens != NULL;
int ret;
if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
GFP_KERNEL|__GFP_ZERO))) {
ret = -BCH_ERR_ENOMEM_bucket_gens;
goto err;
}
if ((c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)))) {
!(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)))) {
ret = -BCH_ERR_ENOMEM_buckets_nouse;
goto err;
}
@ -1397,8 +1398,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0;
err:
kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvfree(buckets_nouse);
if (bucket_gens)
call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
@ -1407,27 +1407,21 @@ err:
void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
kvfree(ca->buckets_nouse);
kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
sizeof(struct bucket_gens) + ca->mi.nbuckets);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -BCH_ERR_ENOMEM_usage_init;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -BCH_ERR_ENOMEM_usage_init;

View File

@ -22,12 +22,6 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
__must_check
static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
{
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@ -155,19 +149,35 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(void *arg)
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
if (!thr->thr.thr.ret)
bch2_fs_stop(c);
if (IS_ERR(c))
return PTR_ERR(c);
thread_with_stdio_done(&thr->thr);
return 0;
int ret = 0;
if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
ret |= 1;
}
if (test_bit(BCH_FS_error, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
ret |= 4;
}
/* c->name is freed by bch2_fs_stop(), so print before stopping: */
bch2_fs_stop(c);
return ret;
}
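The return value mirrors the conventional fsck(8) exit-status bits, so the fsck wrapper can pass it straight through to userspace; illustratively (these defines are not from this series):

	#define FSCK_EXIT_ERRORS_CORRECTED	1	/* errors were fixed */
	#define FSCK_EXIT_ERRORS_UNCORRECTED	4	/* errors remain */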
static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
.exit = bch2_fsck_thread_exit,
.fn = bch2_fsck_offline_thread_fn,
};
static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
@ -220,9 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
ret = bch2_run_thread_with_stdio(&thr->thr,
bch2_fsck_thread_exit,
bch2_fsck_offline_thread_fn);
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
err:
if (ret < 0) {
if (thr)
@ -763,9 +771,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
static int bch2_fsck_online_thread_fn(void *arg)
static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = thr->c;
c->stdio_filter = current;
@ -793,13 +801,16 @@ static int bch2_fsck_online_thread_fn(void *arg)
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
thread_with_stdio_done(&thr->thr);
up(&c->online_fsck_mutex);
bch2_ro_ref_put(c);
return 0;
return ret;
}
static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
.exit = bch2_fsck_thread_exit,
.fn = bch2_fsck_online_thread_fn,
};
static long bch2_ioctl_fsck_online(struct bch_fs *c,
struct bch_ioctl_fsck_online arg)
{
@ -840,9 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
goto err;
}
ret = bch2_run_thread_with_stdio(&thr->thr,
bch2_fsck_thread_exit,
bch2_fsck_online_thread_fn);
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
err:
if (ret < 0) {
bch_err_fn(c, ret);

View File

@ -558,7 +558,7 @@ got_key:
return 0;
}
#include "../crypto.h"
#include "crypto.h"
#endif
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)

View File

@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
return 0;
if (!mempool_initialized(&c->compression_bounce[READ]) &&
mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_read_init;
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
return -BCH_ERR_ENOMEM_compression_bounce_write_init;
for (i = compression_types;
@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
if (mempool_init_kvpmalloc_pool(
if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
return -BCH_ERR_ENOMEM_compression_workspace_init;
}
if (!mempool_initialized(&c->decompress_workspace) &&
mempool_init_kvpmalloc_pool(&c->decompress_workspace,
1, decompress_workspace_size))
mempool_init_kvmalloc_pool(&c->decompress_workspace,
1, decompress_workspace_size))
return -BCH_ERR_ENOMEM_decompression_workspace_init;
return 0;

View File

@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
kvpfree(n_ondisk, btree_buf_bytes(b));
kvfree(n_ondisk);
percpu_ref_put(&ca->io_ref);
}

View File

@ -144,19 +144,21 @@ fsck_err:
return ret;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
prt_printf(out, "%.*s -> %llu type %s",
d_name.len,
d_name.name,
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
bch2_d_type_str(d.v->d_type));
prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
if (d.v->d_type != DT_SUBVOL)
prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
else
prt_printf(out, "%u -> %u",
le32_to_cpu(d.v->d_parent_subvol),
le32_to_cpu(d.v->d_child_subvol));
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
int bch2_dirent_create_snapshot(struct btree_trans *trans,
u64 dir, u32 snapshot,
u32 dir_subvol, u64 dir, u32 snapshot,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *dir_offset,
bch_str_hash_flags_t str_hash_flags)
{
subvol_inum zero_inum = { 0 };
subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
struct bkey_i_dirent *dirent;
int ret;
dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.inode = dir;
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
zero_inum, snapshot,
&dirent->k_i, str_hash_flags,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
dir_inum, snapshot,
&dirent->k_i, str_hash_flags,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
*dir_offset = dirent->k.p.offset;
return ret;
@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
unsigned src_update_flags = 0;
bool delete_src, delete_dst;
int ret = 0;
if (src_dir.subvol != dst_dir.subvol)
return -EXDEV;
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
src_type = bkey_s_c_to_dirent(old_src).v->d_type;
if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
return -EOPNOTSUPP;
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
if (dst_type == DT_SUBVOL)
return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
if (new_dst->v.d_type == DT_SUBVOL)
new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
if ((mode == BCH_RENAME_EXCHANGE) &&
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
out_set_src:
/*
* If we're deleting a subvolume, we need to really delete the dirent,
* not just emit a whiteout in the current snapshot:
* If we're deleting a subvolume, we need to really delete the dirent,
* not just emit a whiteout in the current snapshot - there can only be a
* single dirent that points to a given subvolume.
*
* IOW, we don't maintain multiple versions in different snapshots of
* dirents that point to subvolumes - dirents that point to subvolumes
* are only visible in one particular subvolume so it's not necessary,
* and it would be particularly confusing for fsck to have to deal with.
*/
if (src_type == DT_SUBVOL) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter);
delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
new_src->k.p.snapshot != old_src.k->p.snapshot;
delete_dst = old_dst.k &&
bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
new_dst->k.p.snapshot != old_dst.k->p.snapshot;
if (!delete_src || !bkey_deleted(&new_src->k)) {
ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
new_src->k.p = src_iter.pos;
src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
}
ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (ret)
goto out;
if (delete_src) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter) ?:
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
if (ret)
goto out;
}
if (delete_dst) {
bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
ret = bch2_btree_iter_traverse(&dst_iter) ?:
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
if (ret)
goto out;
}
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
@ -456,41 +472,29 @@ out:
return ret;
}
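The delete_src and delete_dst paths above both reduce to the same pattern: reposition the iterator at the snapshot the dirent actually lives in, then delete it for real rather than whiteout it. As a sketch, assuming BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE makes the update apply in the key's own snapshot:

	static int dirent_delete_in_own_snapshot(struct btree_trans *trans,
						 struct btree_iter *iter,
						 struct bkey_s_c old)
	{
		/* jump from the current snapshot to the one the key exists in */
		bch2_btree_iter_set_snapshot(iter, old.k->p.snapshot);

		return  bch2_btree_iter_traverse(iter) ?:
			bch2_btree_delete_at(trans, iter,
					     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
	}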
int __bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
int bch2_dirent_lookup_trans(struct btree_trans *trans,
struct btree_iter *iter,
subvol_inum dir,
const struct bch_hash_info *hash_info,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
struct bkey_s_c k;
struct bkey_s_c_dirent d;
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
if (ret)
return ret;
ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
hash_info, dir, name, flags);
if (ret)
return ret;
k = bch2_btree_iter_peek_slot(iter);
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
goto err;
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, dir, d, inum);
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
if (ret > 0)
ret = -ENOENT;
err:
if (ret)
bch2_trans_iter_exit(trans, iter);
return ret;
}
@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
struct btree_iter iter = { NULL };
int ret = lockrestart_do(trans,
__bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
SPOS(dir, 0, snapshot),
POS(dir, U64_MAX), 0, k, ret)
if (k.k->type == KEY_TYPE_dirent) {
ret = -ENOTEMPTY;
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
continue;
ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
break;
}
bch2_trans_iter_exit(trans, &iter);
@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
}
int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)

View File

@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
bch_str_hash_flags_t);
@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
subvol_inum, const struct bch_hash_info *,
const struct qstr *, subvol_inum *, unsigned);
u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
const struct bch_hash_info *,
const struct qstr *, subvol_inum *);
int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);

View File

@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
unsigned i;
for (i = 0; i < s->v.nr_blocks; i++) {
kvpfree(buf->data[i], buf->size << 9);
kvfree(buf->data[i]);
buf->data[i] = NULL;
}
}
@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
memset(buf->valid, 0xFF, sizeof(buf->valid));
for (i = 0; i < v->nr_blocks; i++) {
buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
if (!buf->data[i])
goto err;
}

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "errcode.h"
#include "trace.h"
#include <linux/errname.h>
@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
return err == class;
}
int __bch2_err_class(int err)
int __bch2_err_class(int bch_err)
{
err = -err;
BUG_ON((unsigned) err >= BCH_ERR_MAX);
int std_err = -bch_err;
BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
err = bch2_errcode_parents[err - BCH_ERR_START];
while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
return -err;
trace_error_downcast(bch_err, std_err, _RET_IP_);
return -std_err;
}
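The parent walk depends on a lookup table generated from the same BCH_ERRCODES() x-macro that defines the codes - roughly this shape (a sketch, not the exact source):

	static const unsigned bch2_errcode_parents[BCH_ERR_MAX - BCH_ERR_START] = {
	#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
		BCH_ERRCODES()
	#undef x
	};

Each private code maps to its parent class, so following the table repeatedly bottoms out at a standard errno that can be handed to the VFS.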
const char *bch2_blk_status_to_str(blk_status_t status)

View File

@ -5,6 +5,10 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
x(EINVAL, mount_option) \
x(BCH_ERR_mount_option, option_name) \
x(BCH_ERR_mount_option, option_value) \
x(BCH_ERR_mount_option, option_not_bool) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@ -78,6 +82,7 @@
x(ENOMEM, ENOMEM_fs_name_alloc) \
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
x(ENOMEM, ENOMEM_disk_accounting) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@ -109,6 +114,8 @@
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(0, open_buckets_empty) \
x(0, freelist_empty) \
x(BCH_ERR_freelist_empty, no_buckets_found) \
@ -176,6 +183,9 @@
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
x(EINVAL, opt_parse_error) \
x(EINVAL, remove_with_metadata_missing_unimplemented)\
x(EINVAL, remove_would_lose_data) \
x(EINVAL, btree_iter_with_journal_not_supported) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@ -225,7 +235,10 @@
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
x(EIO, sb_not_downgraded) \
x(EIO, btree_write_all_failed) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@ -238,7 +251,8 @@
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem)
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, need_inode_lock)
enum bch_errcode {
BCH_ERR_START = 2048,

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
#include "recovery.h"
#include "super.h"
#include "thread_with_file.h"
@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
void bch2_topology_error(struct bch_fs *c)
int bch2_topology_error(struct bch_fs *c)
{
set_bit(BCH_FS_topology_error, &c->flags);
if (!test_bit(BCH_FS_fsck_running, &c->flags))
if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
bch2_inconsistent_error(c);
return -BCH_ERR_btree_need_topology_repair;
} else {
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
-BCH_ERR_btree_node_read_validate_error;
}
}
void bch2_fatal_error(struct bch_fs *c)

View File

@ -30,7 +30,7 @@ struct work_struct;
bool bch2_inconsistent_error(struct bch_fs *);
void bch2_topology_error(struct bch_fs *);
int bch2_topology_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
({ \

View File

@ -43,6 +43,11 @@ enum bkey_invalid_flags;
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
#define extent_entry_next_safe(_entry, _end) \
(likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \
? extent_entry_next(_entry) \
: _end)
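The _safe variant exists because these iterators also run over extents that haven't been validated yet: if an entry's type field is corrupt, extent_entry_bytes() can't be trusted to compute the offset of the next entry, so iteration jumps straight to _end instead of walking off into garbage. The switch below likewise moves to __extent_entry_type() - presumably the non-asserting variant - so an out-of-range type falls through to the default case rather than tripping an assertion before validation can reject the key.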
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
for ((_entry) = (_start); \
(_entry) < (_end); \
(_entry) = extent_entry_next(_entry))
(_entry) = extent_entry_next_safe(_entry, _end))
#define __bkey_ptr_next(_ptr, _end) \
({ \
@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
(_ptr).has_ec = false; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (extent_entry_type(_entry)) { \
switch (__extent_entry_type(_entry)) { \
case BCH_EXTENT_ENTRY_ptr: \
(_ptr).ptr = _entry->ptr; \
goto out; \
@ -344,7 +349,7 @@ out: \
for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
(_entry) = _start; \
__bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
(_entry) = extent_entry_next(_entry))
(_entry) = extent_entry_next_safe(_entry, _end))
#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \

View File

@ -24,12 +24,12 @@ struct { \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
(fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
(fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
kvpfree((fifo)->data, fifo_buf_size(fifo)); \
kvfree((fifo)->data); \
(fifo)->data = NULL; \
} while (0)

View File

@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans,
u32 new_subvol, dir_snapshot;
ret = bch2_subvolume_create(trans, new_inode->bi_inum,
dir.subvol,
snapshot_src.subvol,
&new_subvol, &snapshot,
(flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u,
const struct qstr *name,
bool deleting_snapshot)
bool deleting_subvol)
{
struct bch_fs *c = trans->c;
struct btree_iter dir_iter = { NULL };
@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_hash = bch2_hash_info_init(c, dir_u);
ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_INTENT);
ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
name, &inum, BTREE_ITER_INTENT);
if (ret)
goto err;
@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, inum);
if (ret)
goto err;
}
if (deleting_snapshot && !inode_u->bi_subvol) {
if (deleting_subvol && !inode_u->bi_subvol) {
ret = -BCH_ERR_ENOENT_not_subvol;
goto err;
}
if (deleting_snapshot || inode_u->bi_subvol) {
if (inode_u->bi_subvol) {
/* Recursive subvolume destroy not allowed (yet?) */
ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
if (ret)
goto err;
}
if (deleting_subvol || inode_u->bi_subvol) {
ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
if (ret)
goto err;
@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
return ret;
}
static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
{
struct btree_iter iter;
struct bkey_i_subvolume *s =
bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, subvol),
BTREE_ITER_CACHED, subvolume);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
s->v.fs_path_parent = cpu_to_le32(new_parent);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
int bch2_rename_trans(struct btree_trans *trans,
subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
if (src_inode_u->bi_subvol &&
dst_dir.subvol != src_inode_u->bi_parent_subvol) {
ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
if (ret)
goto err;
}
if (mode == BCH_RENAME_EXCHANGE &&
dst_inode_u->bi_subvol &&
src_dir.subvol != dst_inode_u->bi_parent_subvol) {
ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
if (ret)
goto err;
}
/* Can't move across subvolumes, unless it's a subvolume root: */
if (src_dir.subvol != dst_dir.subvol &&
(!src_inode_u->bi_subvol ||
(dst_inum.inum && !dst_inode_u->bi_subvol))) {
ret = -EXDEV;
goto err;
}
if (src_inode_u->bi_parent_subvol)
src_inode_u->bi_parent_subvol = dst_dir.subvol;
if ((mode == BCH_RENAME_EXCHANGE) &&
dst_inode_u->bi_parent_subvol)
dst_inode_u->bi_parent_subvol = src_dir.subvol;
src_inode_u->bi_dir = dst_dir_u->bi_inum;
src_inode_u->bi_dir_offset = dst_offset;
@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
if (S_ISDIR(dst_inode_u->bi_mode) &&
bch2_empty_dir_trans(trans, dst_inum)) {
ret = -ENOTEMPTY;
goto err;
if (S_ISDIR(dst_inode_u->bi_mode)) {
ret = bch2_empty_dir_trans(trans, dst_inum);
if (ret)
goto err;
}
}

View File

@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
loff_t pos, unsigned len)
loff_t pos, unsigned len,
bool inode_locked)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
BUG_ON(!fs.nr);
/*
* If we're not using the inode lock, we need to lock all the folios for
* atomicity of writes vs. other writes:
*/
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
ret = -BCH_ERR_need_inode_lock;
goto out;
}
f = darray_first(fs);
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
ret = bch2_read_single_folio(f, mapping);
@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
end = pos + copied;
spin_lock(&inode->v.i_lock);
if (end > inode->v.i_size)
if (end > inode->v.i_size) {
BUG_ON(!inode_locked);
i_size_write(&inode->v, end);
}
spin_unlock(&inode->v.i_lock);
f_pos = pos;
@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;
loff_t pos;
bool inode_locked = false;
ssize_t written = 0, written2 = 0, ret = 0;
/*
* We don't take the inode lock unless i_size will be changing. Folio
* locks provide exclusion with other writes, and the pagecache add lock
* provides exclusion with truncate and hole punching.
*
* There is one nasty corner case where atomicity would be broken
* without great care: when copying data from userspace to the page
* cache, we do that with page faults disabled - a page fault would
* recurse back into the filesystem, taking filesystem locks again, and
* deadlock; so we fault in the user buffer only when we aren't holding
* locks.
*
* If we do part of the write but then race, and the pages backing the
* userspace buffer have been evicted and are no longer resident, we have
* to drop our folio locks to re-fault them in, breaking write atomicity.
*
* To fix this, we restart the write from the start if we weren't
* holding the inode lock.
*
* There is another wrinkle after that; if we restart the write from the
* start and then hit an unrecoverable error, we _cannot_ report to
* userspace that we wrote less data than we actually did - so we must
* track, in written2, the most we ever wrote.
*/
if ((iocb->ki_flags & IOCB_APPEND) ||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
inode_lock(&inode->v);
inode_locked = true;
}
ret = generic_write_checks(iocb, iter);
if (ret <= 0)
goto unlock;
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
if (ret) {
if (!inode_locked) {
inode_lock(&inode->v);
inode_locked = true;
ret = file_remove_privs_flags(file, 0);
}
if (ret)
goto unlock;
}
ret = file_update_time(file);
if (ret)
goto unlock;
pos = iocb->ki_pos;
bch2_pagecache_add_get(inode);
if (!inode_locked &&
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
goto get_inode_lock;
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = iov_iter_count(iter);
@ -1004,12 +1072,17 @@ again:
}
}
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
goto get_inode_lock;
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
if (ret == -BCH_ERR_need_inode_lock)
goto get_inode_lock;
if (unlikely(ret < 0))
break;
@ -1030,50 +1103,46 @@ again:
}
pos += ret;
written += ret;
written2 = max(written, written2);
if (ret != bytes && !inode_locked)
goto get_inode_lock;
ret = 0;
balance_dirty_pages_ratelimited(mapping);
if (0) {
get_inode_lock:
bch2_pagecache_add_put(inode);
inode_lock(&inode->v);
inode_locked = true;
bch2_pagecache_add_get(inode);
iov_iter_revert(iter, written);
pos -= written;
written = 0;
ret = 0;
}
} while (iov_iter_count(iter));
bch2_pagecache_add_put(inode);
return written ? written : ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
ssize_t ret;
if (iocb->ki_flags & IOCB_DIRECT) {
ret = bch2_direct_write(iocb, from);
goto out;
}
inode_lock(&inode->v);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto unlock;
ret = file_remove_privs(file);
if (ret)
goto unlock;
ret = file_update_time(file);
if (ret)
goto unlock;
ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
unlock:
inode_unlock(&inode->v);
if (inode_locked)
inode_unlock(&inode->v);
iocb->ki_pos += written;
ret = max(written, written2) ?: ret;
if (ret > 0)
ret = generic_write_sync(iocb, ret);
out:
return ret;
}
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, iter)
: bch2_buffered_write(iocb, iter);
return bch2_err_class(ret);
}
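Stripped of the filesystem details, the retry dance described in the comment at the top of bch2_buffered_write() has this control flow - a sketch only, with try_write standing in for the folio-level write loop and -EAGAIN for -BCH_ERR_need_inode_lock:

	static ssize_t restarting_write(struct inode *inode, struct iov_iter *iter,
					ssize_t (*try_write)(struct iov_iter *, bool))
	{
		bool locked = false;
		ssize_t written = 0, written2 = 0, ret = 0;

		while (iov_iter_count(iter)) {
			ret = try_write(iter, locked);
			if (ret == -EAGAIN && !locked) {
				/* atomicity would break: revert, lock, restart */
				inode_lock(inode);
				locked = true;
				iov_iter_revert(iter, written);
				written2 = max(written, written2);
				written = 0;
				continue;
			}
			if (ret <= 0)
				break;
			written += ret;
			written2 = max(written, written2);
		}
		if (locked)
			inode_unlock(inode);

		/* never report less than the furthest point actually written */
		return max(written, written2) ?: ret;
	}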

View File

@ -51,13 +51,10 @@ enum bch_folio_sector_state {
struct bch_folio_sector {
/* Uncompressed, fully allocated replicas (or on disk reservation): */
unsigned nr_replicas:4;
u8 nr_replicas:4,
/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
unsigned replicas_reserved:4;
/* i_sectors: */
enum bch_folio_sector_state state:8;
replicas_reserved:4;
u8 state;
};
struct bch_folio {

View File

@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
}
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
{
struct bch_inode_unpacked inode_u;
struct bch_inode_info *inode;
struct btree_trans *trans;
struct bch_subvolume subvol;
int ret;
subvol_inum inum = inode_inum(inode);
struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
BUG_ON(!old);
inode = to_bch_ei(iget5_locked(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
bch2_iget5_set,
&inum));
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->v.i_state & I_NEW))
return &inode->v;
trans = bch2_trans_get(c);
ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
if (!ret)
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
bch2_trans_put(trans);
if (ret) {
iget_failed(&inode->v);
return ERR_PTR(bch2_err_class(ret));
if (unlikely(old != inode)) {
discard_new_inode(&inode->v);
inode = old;
} else {
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
/*
* we really don't want insert_inode_locked2() to be setting
* I_NEW...
*/
unlock_new_inode(&inode->v);
}
mutex_lock(&c->vfs_inodes_lock);
list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
mutex_unlock(&c->vfs_inodes_lock);
return inode;
}
unlock_new_inode(&inode->v);
#define memalloc_flags_do(_flags, _do) \
({ \
unsigned _saved_flags = memalloc_flags_save(_flags); \
typeof(_do) _ret = _do; \
memalloc_flags_restore(_saved_flags); \
_ret; \
})
return &inode->v;
/*
* Allocate a new inode, dropping/retaking btree locks if necessary:
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct bch_inode_info *inode =
memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
to_bch_ei(new_inode(c->vfs_sb)));
if (unlikely(!inode)) {
int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
if (ret && inode)
discard_new_inode(&inode->v);
if (ret)
return ERR_PTR(ret);
}
return inode;
}
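memalloc_flags_do() masks the given PF flags into current->flags around a single expression, so the GFP_KERNEL allocation inside new_inode() becomes non-blocking: PF_MEMALLOC_NORECLAIM forbids direct reclaim (which could recurse into the filesystem while btree locks are held) and PF_MEMALLOC_NOWARN suppresses the failure warning. Only when that fast path fails do we pay for dropping and retaking btree locks. The pattern, with a hypothetical caller:

	struct foo *p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
					  kzalloc(sizeof(*p), GFP_KERNEL));
	if (!p)
		p = alloc_after_dropping_locks();	/* hypothetical slow path */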
struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
{
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode)
return &inode->v;
struct btree_trans *trans = bch2_trans_get(c);
struct bch_inode_unpacked inode_u;
struct bch_subvolume subvol;
int ret = lockrestart_do(trans,
bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (!ret) {
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
}
bch2_trans_put(trans);
return ret ? ERR_PTR(ret) : &inode->v;
}
struct bch_inode_info *
@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap,
struct bch_fs *c = dir->v.i_sb->s_fs_info;
struct btree_trans *trans;
struct bch_inode_unpacked dir_u;
struct bch_inode_info *inode, *old;
struct bch_inode_info *inode;
struct bch_inode_unpacked inode_u;
struct posix_acl *default_acl = NULL, *acl = NULL;
subvol_inum inum;
@ -293,7 +336,6 @@ err_before_quota:
mutex_unlock(&dir->ei_update_lock);
}
bch2_iget5_set(&inode->v, &inum);
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@ -304,36 +346,7 @@ err_before_quota:
* bch2_trans_exit() and dropping locks, else we could race with another
* thread pulling the inode in and modifying it:
*/
inode = bch2_inode_insert(c, inode);
bch2_trans_put(trans);
err:
posix_acl_release(default_acl);
@ -352,23 +365,78 @@ err_trans:
/* methods */
static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
subvol_inum dir, struct bch_hash_info *dir_hash_info,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
dir_hash_info, dir, name, 0);
if (ret)
return ERR_PTR(ret);
struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
goto err;
struct bch_inode_info *inode =
to_bch_ei(ilookup5_nowait(c->vfs_sb,
bch2_inode_hash(inum),
bch2_iget5_test,
&inum));
if (inode)
goto out;
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
if (bch2_err_matches(ret, ENOENT)) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "%s points to missing inode", buf.buf);
printbuf_exit(&buf);
}
if (ret)
goto err;
bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
inode = bch2_inode_insert(c, inode);
out:
bch2_trans_iter_exit(trans, &dirent_iter);
return inode;
err:
inode = ERR_PTR(ret);
goto out;
}
static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
unsigned int flags)
{
struct bch_fs *c = vdir->i_sb->s_fs_info;
struct bch_inode_info *dir = to_bch_ei(vdir);
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct bch_inode_info *inode;
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
&hash, &dentry->d_name)));
if (IS_ERR(inode))
inode = NULL;
return d_splice_alias(&inode->v, dentry);
}
static int bch2_mknod(struct mnt_idmap *idmap,
@ -1372,6 +1440,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
struct bch_inode_unpacked *bi,
struct bch_subvolume *subvol)
{
bch2_iget5_set(&inode->v, &inum);
bch2_inode_update_after_write(trans, inode, bi, ~0);
if (BCH_SUBVOLUME_SNAP(subvol))
@ -1572,7 +1641,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
* number:
*/
u64 avail_inodes = ((usage.capacity - usage.used) << 3);
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_bsize = sb->s_blocksize;
@ -1583,10 +1651,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = usage.nr_inodes + avail_inodes;
buf->f_ffree = avail_inodes;
buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b);
buf->f_namelen = BCH_NAME_MAX;
return 0;
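uuid_to_fsid() folds the 16-byte filesystem UUID down to the two 32-bit f_fsid words; as of recent kernels it lives in include/linux/statfs.h and is roughly equivalent to this sketch (mirroring the open-coded fold it replaces, not the verbatim header):
u64 fsid = le64_to_cpup((void *) uuid) ^
le64_to_cpup((void *) (uuid + sizeof(u64)));
buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;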
@ -1805,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
ret = bch2_parse_mount_opts(NULL, &opts, data);
if (ret) {
ret = bch2_err_class(ret);
return ERR_PTR(ret);
}
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
@ -1882,6 +1949,7 @@ got_sb:
sb->s_time_gran = c->sb.nsec_per_time_unit;
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
sb->s_uuid = c->sb.user_uuid;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));

File diff suppressed because it is too large

View File

@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
int bch2_check_subvolume_structure(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);

View File

@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
return bch2_inode_unpack_slowpath(k, unpacked);
}
int bch2_inode_peek_nowarn(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
subvol_inum inum, unsigned flags)
@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
int __bch2_fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct bkey_inode_buf *inode_p =
bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
bch2_inode_pack(inode_p, inode);
inode_p->inode.k.p.snapshot = snapshot;
return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
&inode_p->inode.k_i,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
}
int bch2_fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_fsck_write_inode(trans, inode, snapshot));
bch_err_fn(trans->c, ret);
return ret;
}
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
struct bch_inode_unpacked u;
@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
bool old_deleted = bkey_is_deleted_inode(old);
bool new_deleted = bkey_is_deleted_inode(new.s_c);
if (old_deleted != new_deleted) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
new.k->p, new_deleted);
if (ret)
return ret;
}
@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
c, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
@ -1141,7 +1171,7 @@ fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bool need_another_pass;
int ret;
again:
/*
* if we ran check_inodes() unlinked inodes will have already been
* cleaned up but the write buffer will be out of sync; therefore we
* always need a write buffer flush
*/
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
need_another_pass = false;
/*
@ -1183,12 +1222,8 @@ again:
ret;
}));
if (!ret && need_another_pass)
goto again;
err:
bch2_trans_put(trans);
return ret;

View File

@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
return bch2_inode_write_flags(trans, iter, inode, 0);
}
int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
}
static inline u32 bch2_inode_flags(struct bkey_s_c k)
{
switch (k.k->type) {
case KEY_TYPE_inode:
return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
case KEY_TYPE_inode_v2:
return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
case KEY_TYPE_inode_v3:
return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
default:
return 0;
}
}
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)

View File

@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
if (!op) {
ret = -BCH_ERR_nopromote_enomem;
goto err;
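struct_size(op, bi_inline_vecs, pages) from include/linux/overflow.h computes sizeof(*op) plus `pages` trailing flexible-array elements, saturating to SIZE_MAX on overflow so a wrapped multiply becomes a failed allocation rather than a short buffer. The two forms are equivalent when nothing overflows:
/* open-coded (old): the multiply can wrap silently */
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
/* struct_size() (new): saturates on overflow, so kzalloc() simply fails */
op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);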

View File

@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
bch2_congested_acct(ca, io_latency, now, rw);
__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
#endif
@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
"write error while doing btree update: %s",
"%s write error while doing btree update: %s",
op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@ -1067,7 +1068,8 @@ do_write:
*_dst = dst;
return more;
csum_err:
bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
op->flags & BCH_WRITE_MOVE ? "move" : "user");
ret = -EIO;
err:
if (to_wbio(dst)->bounce)
@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
bch_err_inum_offset_ratelimited(c,
insert->k.p.inode, insert->k.p.offset << 9,
"write error while doing btree update: %s",
"%s write error while doing btree update: %s",
op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
}
@ -1449,7 +1452,9 @@ err:
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
"%s(): error: %s", __func__, bch2_err_str(ret));
"%s(): %s error: %s", __func__,
op->flags & BCH_WRITE_MOVE ? "move" : "user",
bch2_err_str(ret));
op->error = ret;
break;
}
@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write)
bch_err_inum_offset_ratelimited(c,
op->pos.inode,
op->pos.offset << 9,
"misaligned write");
"%s write error: misaligned write",
op->flags & BCH_WRITE_MOVE ? "move" : "user");
op->error = -EIO;
goto err;
}

View File

@ -27,47 +27,6 @@ static const char * const bch2_journal_errors[] = {
NULL
};
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@ -88,6 +47,66 @@ static bool journal_entry_is_open(struct journal *j)
return __journal_entry_is_open(j->reservations);
}
static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
{
union journal_res_state s = READ_ONCE(j->reservations);
unsigned i = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + i;
prt_str(out, "seq:");
prt_tab(out);
prt_printf(out, "%llu", seq);
prt_newline(out);
printbuf_indent_add(out, 2);
prt_str(out, "refcount:");
prt_tab(out);
prt_printf(out, "%u", journal_state_count(s, i));
prt_newline(out);
prt_str(out, "size:");
prt_tab(out);
prt_human_readable_u64(out, vstruct_bytes(buf->data));
prt_newline(out);
prt_str(out, "expires:");
prt_tab(out);
prt_printf(out, "%li jiffies", buf->expires - jiffies);
prt_newline(out);
prt_str(out, "flags:");
prt_tab(out);
if (buf->noflush)
prt_str(out, "noflush ");
if (buf->must_flush)
prt_str(out, "must_flush ");
if (buf->separate_flush)
prt_str(out, "separate_flush ");
if (buf->need_flush_to_write_buffer)
prt_str(out, "need_flush_to_write_buffer ");
if (buf->write_started)
prt_str(out, "write_started ");
if (buf->write_allocated)
prt_str(out, "write_allocated ");
if (buf->write_done)
prt_str(out, "write_done");
prt_newline(out);
printbuf_indent_sub(out, 2);
}
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
for (u64 seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++)
bch2_journal_buf_to_text(out, j, seq);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
}
static inline struct journal_buf *
journal_seq_to_buf(struct journal *j, u64 seq)
{
@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
void bch2_journal_do_writes(struct journal *j)
{
for (u64 seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
unsigned idx = seq & JOURNAL_BUF_MASK;
struct journal_buf *w = j->buf + idx;
if (w->write_started && !w->write_allocated)
break;
if (w->write_started)
continue;
if (!journal_state_count(j->reservations, idx)) {
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
break;
}
}
/*
* Final processing when the last reference of a journal buffer has been
* dropped. Drop the pin list reference acquired at journal entry open and write
* the buffer, if requested.
*/
void bch2_journal_buf_put_final(struct journal *j, u64 seq)
{
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
bch2_journal_do_writes(j);
}
/*
@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j)
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
bkey_extent_init(&buf->key);
buf->noflush = false;
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
buf->write_started = false;
buf->write_allocated = false;
buf->write_done = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
if (nr_unwritten_journal_entries(j) == 1)
mod_delayed_work(j->wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
journal_wake(j);
if (j->early_journal_entries.nr)
@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j)
static void journal_write_work(struct work_struct *work)
{
struct journal *j = container_of(work, struct journal, write_work.work);
spin_lock(&j->lock);
if (__journal_entry_is_open(j->reservations)) {
long delta = journal_cur_buf(j)->expires - jiffies;
if (delta > 0)
mod_delayed_work(j->wq, &j->write_work, delta);
else
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
}
spin_unlock(&j->lock);
}
@ -473,33 +511,32 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
ret = JOURNAL_ERR_journal_full;
can_discard = j->can_discard;
goto out;
}
if (j->blocked)
return -BCH_ERR_journal_res_get_blocked;
if (bch2_journal_error(j))
return -BCH_ERR_erofs_journal_err;
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
ret = JOURNAL_ERR_max_in_flight;
goto out;
}
spin_lock(&j->lock);
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
* unnecessarily
*/
if (journal_res_get_fast(j, res, flags)) {
ret = 0;
goto unlock;
}
@ -515,30 +552,30 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
out:
if (ret == JOURNAL_ERR_retry)
goto retry;
if (!ret)
return 0;
if (journal_error_check_stuck(j, ret, flags))
ret = -BCH_ERR_journal_res_get_blocked;
if (ret == JOURNAL_ERR_max_in_flight &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
struct printbuf buf = PRINTBUF;
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
}
/*
* Journal is full - can't rely on reclaim from work item due to
* freezing:
@ -674,7 +711,7 @@ recheck_need_open:
return ret;
seq = res.seq;
buf = journal_seq_to_buf(j, seq);
buf->must_flush = true;
if (!buf->flush_time) {
@ -692,8 +729,8 @@ recheck_need_open:
}
/*
* if write was kicked off without a flush, or if we promised it
* wouldn't be a flush, flush the next sequence number instead
*/
buf = journal_seq_to_buf(j, seq);
if (buf->noflush) {
@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
unwritten_seq++) {
struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
/* journal flush already in flight, or flush requested */
if (buf->must_flush)
goto out;
buf->noflush = true;
@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
u64 last_seq = cur_seq, nr, seq;
genradix_for_each_reverse(&c->journal_entries, iter, _i) {
i = *_i;
if (journal_replay_ignore(i))
continue;
last_seq = le64_to_cpu(i->j.last_seq);
@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
darray_for_each(i->ptrs, ptr)
bch2_dev_list_add_dev(&p->devs, ptr->dev);
had_entries = true;
}
@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
void bch2_dev_journal_exit(struct bch_dev *ca)
{
struct journal_device *ja = &ca->journal;
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
kfree(ja->bio[i]);
ja->bio[i] = NULL;
}
kfree(ja->buckets);
kfree(ja->bucket_seq);
ja->buckets = NULL;
ja->bucket_seq = NULL;
}
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
bch2_sb_field_get(sb, journal);
struct bch_sb_field_journal_v2 *journal_buckets_v2 =
bch2_sb_field_get(sb, journal_v2);
ja->nr = 0;
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
for (unsigned i = 0; i < nr; i++)
ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
} else if (journal_buckets) {
ja->nr = bch2_nr_journal_buckets(journal_buckets);
@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (!ja->bucket_seq)
return -BCH_ERR_ENOMEM_dev_journal_init;
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
return -BCH_ERR_ENOMEM_dev_journal_init;
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
}
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
unsigned dst = 0;
for (unsigned i = 0; i < nr; i++)
for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
ja->buckets[dst++] =
le64_to_cpu(journal_buckets_v2->d[i].start) + j;
} else if (journal_buckets) {
for (unsigned i = 0; i < ja->nr; i++)
ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
}
@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
void bch2_fs_journal_exit(struct journal *j)
{
if (j->wq)
destroy_workqueue(j->wq);
darray_exit(&j->early_journal_entries);
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
free_fifo(&j->pin);
}
int bch2_fs_journal_init(struct journal *j)
{
static struct lock_class_key res_key;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j)
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
return -BCH_ERR_ENOMEM_journal_pin_fifo;
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
if (!j->buf[i].data)
return -BCH_ERR_ENOMEM_journal_buf;
j->buf[i].idx = i;
}
j->pin.front = j->pin.back = 1;
j->wq = alloc_workqueue("bcachefs_journal",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
if (!j->wq)
return -BCH_ERR_ENOMEM_fs_other_alloc;
return 0;
}
@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
prt_printf(out, "blocked:\t\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *pin;
spin_lock(&j->lock);
*seq = max(*seq, j->pin.front);
@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_newline(out);
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
list_for_each_entry(pin, &pin_list->list[i], list) {
prt_printf(out, "\t%px %ps", pin, pin->flush);
prt_newline(out);

View File

@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
}
bool bch2_journal_entry_close(struct journal *);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);
static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
{
@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx))
bch2_journal_buf_put_final(j, seq);
}
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
s = journal_state_buf_put(j, idx);
if (!journal_state_count(s, idx)) {
spin_lock(&j->lock);
bch2_journal_buf_put_final(j, seq);
spin_unlock(&j->lock);
}
}

View File

@ -17,6 +17,37 @@
#include "sb-clean.h"
#include "trace.h"
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
u64 offset;
div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
if (i != j->ptrs.data)
prt_printf(out, " ");
prt_printf(out, "%u:%u:%u (sector %llu)",
i->dev, i->bucket, i->bucket_offset, i->sector);
}
}
static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
struct journal_replay *j)
{
prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
bch2_journal_ptrs_to_text(out, c, j);
for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
break;
}
}
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c,
BUG_ON(*p != i);
*p = NULL;
kvfree(i);
}
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
if (blacklisted)
i->ignore_blacklisted = true;
else
i->ignore_not_dirty = true;
if (!c->opts.read_entire_journal)
__journal_replay_free(c, i);
@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
{
struct genradix_iter iter;
struct journal_replay **_i, *i, *dup;
size_t bytes = vstruct_bytes(j);
u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
/* Is this entry older than the range we need? */
@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
if (journal_replay_ignore(i))
continue;
if (le64_to_cpu(i->j.seq) >= last_seq)
break;
journal_replay_free(c, i);
journal_replay_free(c, i, false);
}
}
@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
*/
dup = *_i;
if (dup) {
bool identical = bytes == vstruct_bytes(&dup->j) &&
!memcmp(j, &dup->j, bytes);
bool not_identical = !identical &&
entry_ptr.csum_good &&
dup->csum_good;
bool same_device = false;
darray_for_each(dup->ptrs, ptr)
if (ptr->dev == ca->dev_idx)
same_device = true;
ret = darray_push(&dup->ptrs, entry_ptr);
if (ret)
goto out;
bch2_journal_replay_to_text(&buf, c, dup);
fsck_err_on(same_device,
c, journal_entry_dup_same_device,
"duplicate journal entry on same device\n %s",
buf.buf);
fsck_err_on(not_identical,
c, journal_entry_replicas_data_mismatch,
"found duplicate but non identical journal entries\n %s",
buf.buf);
if (entry_ptr.csum_good && !identical)
goto replace;
goto out;
}
replace:
i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
return -BCH_ERR_ENOMEM_journal_entry_add;
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
i->ignore_blacklisted = false;
i->ignore_not_dirty = false;
unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
if (dup) {
/* The first ptr should represent the jset we kept: */
darray_for_each(dup->ptrs, ptr)
darray_push(&i->ptrs, *ptr);
__journal_replay_free(c, dup);
} else {
darray_push(&i->ptrs, entry_ptr);
}
*_i = i;
out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
@ -374,7 +398,6 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
bool first = true;
jset_entry_for_each_key(entry, k) {
@ -741,6 +764,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
journal_entry_btree_keys_to_text(out, c, entry);
}
static int journal_entry_datetime_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
enum bkey_invalid_flags flags)
{
unsigned bytes = vstruct_bytes(entry);
unsigned expected = 16;
int ret = 0;
if (journal_entry_err_on(vstruct_bytes(entry) < expected,
c, version, jset, entry,
journal_entry_dev_usage_bad_size,
"bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@ -913,11 +967,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
kvfree(b->data);
b->data = n;
b->size = new_size;
return 0;
@ -1102,16 +1156,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!r)
continue;
darray_for_each(r->ptrs, i)
if (i->dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, i->sector) +
vstruct_sectors(&r->j, c->block_bits);
ja->cur_idx = i->bucket;
ja->sectors_free = ca->mi.bucket_size - wrote;
goto found;
}
}
found:
mutex_unlock(&jlist->lock);
@ -1144,7 +1197,7 @@ found:
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvfree(buf.data);
percpu_ref_put(&ca->io_ref);
closure_return(cl);
return;
@ -1155,27 +1208,6 @@ err:
goto out;
}
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
@ -1228,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c,
i = *_i;
if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
i->ignore_blacklisted = true;
continue;
}
if (!last_write_torn && !i->csum_good) {
last_write_torn = true;
i->ignore_blacklisted = true;
continue;
}
@ -1280,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
if (journal_replay_ignore(i))
continue;
seq = le64_to_cpu(i->j.seq);
if (seq < *last_seq) {
journal_replay_free(c, i, false);
continue;
}
@ -1293,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c,
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
i->ignore_blacklisted = true;
}
}
@ -1302,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c,
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
if (journal_replay_ignore(i))
continue;
BUG_ON(seq > le64_to_cpu(i->j.seq));
@ -1353,32 +1385,31 @@ int bch2_journal_read(struct bch_fs *c,
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
};
i = *_i;
if (journal_replay_ignore(i))
continue;
darray_for_each(i->ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ptr->csum_good)
bch_err_dev_offset(ca, ptr->sector,
"invalid journal checksum, seq %llu%s",
le64_to_cpu(i->j.seq),
i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
&i->j,
i->ptrs.data[0].sector,
READ);
if (ret)
goto err;
darray_for_each(i->ptrs, ptr)
replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
bch2_replicas_entry_sort(&replicas.e);
@ -1547,7 +1578,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@ -1558,7 +1589,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
swap(buf->buf_size, new_size);
spin_unlock(&j->lock);
kvfree(new_buf);
}
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@ -1568,12 +1599,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq = le64_to_cpu(w->data->seq);
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@ -1593,63 +1624,68 @@ static CLOSURE_CALLBACK(journal_write_done)
if (err)
bch2_fatal_error(c);
closure_debug_destroy(cl);
spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
w->write_done = true;
bool completed = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
w = j->buf + (seq & JOURNAL_BUF_MASK);
if (!w->write_done)
break;
if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
}
j->seq_ondisk = seq;
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
BUG_ON(journal_state_count(new, new.unwritten_idx));
BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
new.unwritten_idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
closure_wake_up(&w->wait);
completed = true;
}
if (completed) {
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
journal_wake(j);
}
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
/*
* We don't close a journal entry to write it while there's
* previous entries still in flight - the current journal entry
* might want to be written now:
*/
mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
}
spin_unlock(&j->lock);
}
static void journal_write_endio(struct bio *bio)
{
struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
struct bch_dev *ca = jbio->ca;
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
closure_put(&j->io);
closure_put(&w->io);
percpu_ref_put(&ca->io_ref);
}
static CLOSURE_CALLBACK(do_journal_write)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct journal_device *ja = &ca->journal;
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
@ -1708,7 +1744,7 @@ static CLOSURE_CALLBACK(do_journal_write)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@ -1727,11 +1763,10 @@ static CLOSURE_CALLBACK(do_journal_write)
trace_and_count(c, journal_write, bio);
closure_bio_submit(bio, cl);
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
continue_at(cl, journal_write_done, j->wq);
}
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@ -1782,7 +1817,6 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
@ -1798,12 +1832,20 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
spin_lock(&c->journal.lock);
w->need_flush_to_write_buffer = false;
spin_unlock(&c->journal.lock);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
struct jset_entry_datetime *d =
container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
d->entry.type = BCH_JSET_ENTRY_datetime;
d->seconds = cpu_to_le64(ktime_get_real_seconds());
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1893,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
j->nr_noflush_writes++;
} else {
w->must_flush = true;
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
@ -1903,20 +1946,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
CLOSURE_CALLBACK(bch2_journal_write)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_replicas_padded replicas;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned nr_rw_members = 0;
int ret;
for_each_rw_member(c, ca)
nr_rw_members++;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(!w->write_started);
BUG_ON(w->write_allocated);
BUG_ON(w->write_done);
j->write_start_time = local_clock();
spin_lock(&j->lock);
if (nr_rw_members > 1)
w->separate_flush = true;
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
@ -1956,12 +2007,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
/*
* journal entry has been compacted and allocated, recalculate space
* available:
*/
bch2_journal_space_available(j);
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@ -1969,12 +2022,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@ -1985,25 +2032,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
if (!JSET_NO_FLUSH(w->data))
closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
bio_reset(bio, ca->disk_sb.bdev,
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
}
}
continue_at(cl, do_journal_write, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);
return;
err:
bch2_fatal_error(c);
continue_at(cl, journal_write_done, j->wq);
}

View File

@ -2,26 +2,35 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
#include "darray.h"
struct journal_ptr {
bool csum_good;
u8 dev;
u32 bucket;
u32 bucket_offset;
u64 sector;
};
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
bool csum_good;
bool ignore_blacklisted;
bool ignore_not_dirty;
/* must be last: */
struct jset j;
};
static inline bool journal_replay_ignore(struct journal_replay *i)
{
return !i || i->ignore_blacklisted || i->ignore_not_dirty;
}
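journal_replay now carries its pointers in a darray with eight preallocated slots instead of a fixed ptrs[BCH_REPLICAS_MAX] array, so the old nr_ptrs bookkeeping and the "too many copies" clamp disappear. A hedged sketch of the access pattern as used in this diff (the helper name is hypothetical):
static void sketch_print_ptrs(struct journal_replay *r)
{
/* darray_for_each() iterates by element pointer: */
darray_for_each(r->ptrs, ptr)
printk(KERN_INFO "journal copy on dev %u at sector %llu\n",
ptr->dev, (unsigned long long) ptr->sector);
/* darray_push() may allocate and can fail with -ENOMEM, which is
* why journal_entry_add() checks its return value. */
}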
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
}
#define for_each_jset_entry_type(entry, jset, type) \
for (struct jset_entry *entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define jset_entry_for_each_key(_e, _k) \
for (struct bkey_i *_k = (_e)->start; \
_k < vstruct_last(_e); \
_k = bkey_next(_k))
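Both macros now declare their cursor in the for-initializer, which is why callers elsewhere in this diff drop their local struct jset_entry *entry / struct bkey_i *k declarations; the variable exists only inside the loop:
/* process() is a placeholder for the caller's per-key work: */
jset_entry_for_each_key(entry, k)
process(k); /* k is scoped to this loop only */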
@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
CLOSURE_CALLBACK(bch2_journal_write);
static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = cpu_to_le16(u64s - 1);
*end = vstruct_next(*end);
return entry;
}
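A worked case, matching the datetime entry written in bch2_journal_write_prep(): sizeof(struct jset_entry_datetime) is 16 bytes, so u64s = DIV_ROUND_UP(16, 8) = 2, the stored entry->u64s is 1 (header word excluded), and vstruct_bytes() yields 16 -- the same "expected = 16" that journal_entry_datetime_validate() checks:
struct jset_entry_datetime *d =
container_of(jset_entry_init(&end, sizeof(*d)),
struct jset_entry_datetime, entry);
d->entry.type = BCH_JSET_ENTRY_datetime;
d->seconds = cpu_to_le64(ktime_get_real_seconds());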
#endif /* _BCACHEFS_JOURNAL_IO_H */

View File

@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j)
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j,
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
u64 seq = READ_ONCE(src->seq);
@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j,
return;
}
bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
if (seq == journal_last_seq(j))
journal_wake(j);
spin_unlock(&j->lock);
}
void bch2_journal_pin_set(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
BUG_ON(seq < journal_last_seq(j));
bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
if (seq == journal_last_seq(j))
journal_wake(j);
spin_unlock(&j->lock);
}
/**

View File

@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
unsigned i = 0, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
nr = blacklist_nr_entries(bl);
while (i < nr) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
if (end < le64_to_cpu(e->start))
break;
if (start > le64_to_cpu(e->end)) {
i++;
continue;
}
/*
* Entry is contiguous or overlapping with new entry: merge it
* with new entry, and delete:
*/
start = min(start, le64_to_cpu(e->start));
end = max(end, le64_to_cpu(e->end));
array_remove_item(bl->start, nr, i);
}
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
goto out;
}
array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
.start = cpu_to_le64(start),
.end = cpu_to_le64(end),
}));
c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
ret = bch2_write_super(c);
@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
if (!bl)
return 0;
t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
return -BCH_ERR_ENOMEM_blacklist_table_init;

View File

@ -18,6 +18,7 @@
* the journal that are being staged or in flight.
*/
struct journal_buf {
struct closure io;
struct jset *data;
__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@ -33,10 +34,14 @@ struct journal_buf {
unsigned disk_sectors; /* maximum size entry could have been, if
buf_size was bigger */
unsigned u64s_reserved;
bool noflush:1; /* write has already been kicked off, and was noflush */
bool must_flush:1; /* something wants a flush */
bool separate_flush:1;
bool need_flush_to_write_buffer:1;
bool write_started:1;
bool write_allocated:1;
bool write_done:1;
u8 idx;
};
/*
@ -134,6 +139,7 @@ enum journal_flags {
/* Reasons we may fail to get a journal reservation: */
#define JOURNAL_ERRORS() \
x(ok) \
x(retry) \
x(blocked) \
x(max_in_flight) \
x(journal_full) \
@ -149,6 +155,13 @@ enum journal_errors {
typedef DARRAY(u64) darray_u64;
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
struct bio bio;
};
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
@ -203,8 +216,8 @@ struct journal {
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct delayed_work write_work;
struct workqueue_struct *wq;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@ -274,11 +287,6 @@ struct journal {
u64 nr_noflush_writes;
u64 entry_bytes_written;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
struct bch2_time_stats *flush_seq_time;
@ -313,7 +321,7 @@ struct journal_device {
u64 *buckets;
/* Bio for journal reads/writes to this device */
struct journal_bio *bio[JOURNAL_BUF_NR];
/* for bch_journal_read_device */
struct closure read;

View File

@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
u64 dev_bucket, u64 time, bool set)
{
return time
? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
lru_pos(lru_id, dev_bucket, time), set)
: 0;
}
@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
goto out;
}
if (fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",

fs/bcachefs/mean_and_variance.c

@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
* @s: mean and variance number of samples and their sums
* @x: new value to include in the &mean_and_variance_weighted
* @initted: caller must track whether this is the first use or not
* @weight: ewma weight
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
s64 x, bool initted, u8 weight)
{
// previous weighted variance.
u8 w = s->weight;
u8 w = weight;
u64 var_w0 = s->variance;
// new value weighted.
s64 x_w = x << w;
@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
// new mean weighted.
s64 u_w1 = s->mean + diff;
if (!s->init) {
if (!initted) {
s->mean = x_w;
s->variance = 0;
} else {
s->mean = u_w1;
s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
}
s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
* @s: mean and variance number of samples and their sums
* @weight: ewma weight
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
u8 weight)
{
return fast_divpow2(s.mean, s.weight);
return fast_divpow2(s.mean, weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
* @s: mean and variance number of samples and their sums
* @weight: ewma weight
*/
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
u8 weight)
{
// always positive don't need fast divpow2
return s.variance >> s.weight;
return s.variance >> weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
* @s: mean and variance number of samples and their sums
* @weight: ewma weight
*/
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
u8 weight)
{
return int_sqrt64(mean_and_variance_weighted_get_variance(s));
return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
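(With the weight and the init flag moved out of the struct, a caller-side sketch of the reworked API; ewma_example() and its arguments are invented for illustration, and the kunit tests in this series exercise the same pattern:)

	static void ewma_example(const s64 *samples, unsigned nr)
	{
		struct mean_and_variance_weighted s = {};
		bool initted = false;

		for (unsigned i = 0; i < nr; i++) {
			/* weight 3 gives alpha = 2^-3; the caller now owns both
			 * the weight and the first-sample flag: */
			mean_and_variance_weighted_update(&s, samples[i], initted, 3);
			initted = true;
		}

		/* the same weight must be passed back when reading results: */
		pr_info("mean %lli stddev %u\n",
			mean_and_variance_weighted_get_mean(s, 3),
			mean_and_variance_weighted_get_stddev(s, 3));
	}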

fs/bcachefs/mean_and_variance.h

@ -154,8 +154,6 @@ struct mean_and_variance {
/* expontentially weighted variant */
struct mean_and_variance_weighted {
bool init;
u8 weight; /* base 2 logarithim */
s64 mean;
u64 variance;
};
@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
u64 mean_and_variance_get_variance(struct mean_and_variance s1);
u32 mean_and_variance_get_stddev(struct mean_and_variance s);
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
s64 v, bool initted, u8 weight);
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
u8 weight);
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
u8 weight);
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
u8 weight);
#endif // MEAN_AND_VAIRANCE_H_

fs/bcachefs/mean_and_variance_test.c

@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
static void mean_and_variance_weighted_test(struct kunit *test)
{
struct mean_and_variance_weighted s = { .weight = 2 };
struct mean_and_variance_weighted s = { };
mean_and_variance_weighted_update(&s, 10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
mean_and_variance_weighted_update(&s, 10, false, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
mean_and_variance_weighted_update(&s, 20);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
mean_and_variance_weighted_update(&s, 20, true, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
mean_and_variance_weighted_update(&s, 30);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
mean_and_variance_weighted_update(&s, 30, true, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
s = (struct mean_and_variance_weighted) { .weight = 2 };
s = (struct mean_and_variance_weighted) { };
mean_and_variance_weighted_update(&s, -10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
mean_and_variance_weighted_update(&s, -10, false, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
mean_and_variance_weighted_update(&s, -20);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
mean_and_variance_weighted_update(&s, -20, true, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
mean_and_variance_weighted_update(&s, -30);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
mean_and_variance_weighted_update(&s, -30, true, 2);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
}
static void mean_and_variance_weighted_advanced_test(struct kunit *test)
{
struct mean_and_variance_weighted s = { .weight = 8 };
struct mean_and_variance_weighted s = { };
bool initted = false;
s64 i;
for (i = 10; i <= 100; i += 10)
mean_and_variance_weighted_update(&s, i);
for (i = 10; i <= 100; i += 10) {
mean_and_variance_weighted_update(&s, i, initted, 8);
initted = true;
}
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
s = (struct mean_and_variance_weighted) { .weight = 8 };
s = (struct mean_and_variance_weighted) { };
initted = false;
for (i = -10; i >= -100; i -= 10)
mean_and_variance_weighted_update(&s, i);
for (i = -10; i >= -100; i -= 10) {
mean_and_variance_weighted_update(&s, i, initted, 8);
initted = true;
}
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
}
static void do_mean_and_variance_test(struct kunit *test,
@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
s64 *weighted_stddev)
{
struct mean_and_variance mv = {};
struct mean_and_variance_weighted vw = { .weight = weight };
struct mean_and_variance_weighted vw = { };
for (unsigned i = 0; i < initial_n; i++) {
mean_and_variance_update(&mv, initial_value);
mean_and_variance_weighted_update(&vw, initial_value);
mean_and_variance_weighted_update(&vw, initial_value, false, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
}
for (unsigned i = 0; i < n; i++) {
mean_and_variance_update(&mv, data[i]);
mean_and_variance_weighted_update(&vw, data[i]);
mean_and_variance_weighted_update(&vw, data[i], true, weight);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]);
KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
}
KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);

fs/bcachefs/migrate.c

@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return -BCH_ERR_remove_would_lose_data;
return 0;
}
@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
return -BCH_ERR_remove_with_metadata_missing_unimplemented;
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@ -132,10 +132,8 @@ retry:
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
if (ret)
break;
}
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {

fs/bcachefs/opts.c

@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c,
if (ret < 0 || (*res != 0 && *res != 1)) {
if (err)
prt_printf(err, "%s: must be bool", opt->attr.name);
return ret;
return ret < 0 ? ret : -BCH_ERR_option_not_bool;
}
break;
case BCH_OPT_UINT:
@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts = kstrdup(options, GFP_KERNEL);
if (!copied_opts)
return -1;
return -ENOMEM;
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
bad_opt:
pr_err("Bad mount option %s", name);
ret = -1;
ret = -BCH_ERR_option_name;
goto out;
bad_val:
pr_err("Invalid mount option %s", err.buf);
ret = -1;
ret = -BCH_ERR_option_value;
goto out;
out:
kfree(copied_opts_start);
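(Mount option failures now return named private error codes instead of a bare -1. A hypothetical caller-side sketch; bch2_err_str() appears elsewhere in this series, and private codes resolve to a standard errno class before reaching userspace:)

	int ret = bch2_parse_mount_opts(c, &opts, options);
	if (ret)
		pr_err("error parsing mount options: %s", bch2_err_str(ret));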

fs/bcachefs/opts.h

@ -290,6 +290,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allow mounting in when data will be missing") \
x(no_splitbrain_check, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
x(discard, u8, \
OPT_FS|OPT_MOUNT|OPT_DEVICE, \
OPT_BOOL(), \
@ -332,6 +337,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Run fsck on mount") \
x(fsck_memory_usage_percent, u8, \
OPT_FS|OPT_MOUNT, \
OPT_UINT(20, 70), \
BCH2_NO_SB_OPT, 50, \
NULL, "Maximum percentage of system ram fsck is allowed to pin")\
x(fix_errors, u8, \
OPT_FS|OPT_MOUNT, \
OPT_FN(bch2_opt_fix_errors), \

fs/bcachefs/rebalance.c

@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
prt_str(out, "io wait remaining: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
prt_str(out, "duration waited: ");
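(The shifts added above convert the io clock's counts, which are kept in 512-byte sectors, into bytes before pretty-printing: x << 9 == x * 512.)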

fs/bcachefs/recovery.c

@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id)
}
/* for -o reconstruct_alloc: */
static void drop_alloc_keys(struct journal_keys *keys)
static void do_reconstruct_alloc(struct bch_fs *c)
{
bch2_journal_log_msg(c, "dropping alloc info");
bch_info(c, "dropping and reconstructing all alloc info");
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
struct journal_keys *keys = &c->journal_keys;
size_t src, dst;
for (src = 0, dst = 0; src < keys->nr; src++)
if (!btree_id_is_alloc(keys->d[src].btree_id))
keys->d[dst++] = keys->d[src];
if (!btree_id_is_alloc(keys->data[src].btree_id))
keys->data[dst++] = keys->data[src];
keys->nr = dst;
}
@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
*/
static void zero_out_btree_mem_ptr(struct journal_keys *keys)
{
struct journal_key *i;
for (i = keys->d; i < keys->d + keys->nr; i++)
darray_for_each(*keys, i)
if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
}
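(darray_for_each(*keys, i) walks i over keys->data .. keys->data + keys->nr, the same traversal as the open-coded pointer loop it replaces, now that journal keys are kept in a darray.)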
@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (ret)
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
ret = bch2_btree_iter_traverse(&iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
}
/* Must be checked with btree locked: */
if (k->overwritten)
goto out;
@ -166,11 +208,9 @@ static int bch2_journal_replay(struct bch_fs *c)
* efficient - better locality of btree access - but some might fail if
* that would cause a journal deadlock.
*/
for (size_t i = 0; i < keys->nr; i++) {
darray_for_each(*keys, k) {
cond_resched();
struct journal_key *k = keys->d + i;
/* Skip fastpath if we're low on space in the journal */
ret = c->journal.watermark ? -1 :
commit_do(trans, NULL, NULL,
@ -264,7 +304,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
r->error = -EIO;
r->error = -BCH_ERR_btree_node_read_error;
}
r->alive = true;
break;
@ -359,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c,
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
if (!i || i->ignore)
if (journal_replay_ignore(i))
continue;
vstruct_for_each(&i->j, entry) {
@ -388,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c)
if (!r->alive)
continue;
if (btree_id_is_alloc(i) &&
c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
continue;
}
if (r->error) {
__fsck_err(c,
@ -524,8 +561,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
* setting journal_key->overwritten: it will be accessed by multiple
* threads
*/
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
move_gap(keys, keys->nr);
set_bit(BCH_FS_may_go_rw, &c->flags);
@ -862,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto out;
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i && !(*i)->ignore) {
if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
@ -887,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c)
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (*i) {
last_journal_entry = &(*i)->j;
(*i)->ignore = false;
(*i)->ignore_blacklisted = false;
(*i)->ignore_not_dirty = false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
@ -923,10 +960,8 @@ use_clean:
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
if (c->opts.reconstruct_alloc) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
drop_alloc_keys(&c->journal_keys);
}
if (c->opts.reconstruct_alloc)
do_reconstruct_alloc(c);
zero_out_btree_mem_ptr(&c->journal_keys);
@ -950,7 +985,7 @@ use_clean:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
}
}
@ -961,9 +996,6 @@ use_clean:
if (ret)
goto err;
if (c->opts.reconstruct_alloc)
bch2_journal_log_msg(c, "dropping alloc info");
/*
* Skip past versions that might have possibly been used (as nonces),
* but hadn't had their pointers written:

fs/bcachefs/recovery_types.h

@ -34,6 +34,7 @@
x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
@ -43,6 +44,7 @@
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \

fs/bcachefs/sb-clean.c

@ -171,22 +171,6 @@ fsck_err:
return ERR_PTR(ret);
}
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = cpu_to_le16(u64s - 1);
*end = vstruct_next(*end);
return entry;
}
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)

fs/bcachefs/sb-downgrade.c

@ -45,7 +45,13 @@
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
x(rebalance_work, \
BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
x(subvolume_fs_parent, \
BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
x(btree_subvolume_children, \
BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
BCH_FSCK_ERR_subvol_children_not_set)
#define DOWNGRADE_TABLE()
@ -253,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
if (e < BCH_SB_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
__set_bit_le64(e, ext->errors_silent);
}
}
}
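(For reference, a sketch of the helper the hunk above switches to, assuming it matches the open-coded line it replaces:)

	static inline void __set_bit_le64(size_t bit, __le64 *addr)
	{
		addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
	}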

fs/bcachefs/sb-errors_types.h

@ -231,7 +231,7 @@
x(dirent_name_dot_or_dotdot, 223) \
x(dirent_name_has_slash, 224) \
x(dirent_d_type_wrong, 225) \
x(dirent_d_parent_subvol_wrong, 226) \
x(inode_bi_parent_wrong, 226) \
x(dirent_in_missing_dir_inode, 227) \
x(dirent_in_non_dir_inode, 228) \
x(dirent_to_missing_inode, 229) \
@ -250,7 +250,22 @@
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244) \
x(reflink_p_front_pad_bad, 245)
x(reflink_p_front_pad_bad, 245) \
x(journal_entry_dup_same_device, 246) \
x(inode_bi_subvol_missing, 247) \
x(inode_bi_subvol_wrong, 248) \
x(inode_points_to_missing_dirent, 249) \
x(inode_points_to_wrong_dirent, 250) \
x(inode_bi_parent_nonzero, 251) \
x(dirent_to_missing_parent_subvol, 252) \
x(dirent_not_visible_in_parent_subvol, 253) \
x(subvol_fs_path_parent_wrong, 254) \
x(subvol_root_fs_path_parent_nonzero, 255) \
x(subvol_children_not_set, 256) \
x(subvol_children_bad, 257) \
x(subvol_loop, 258) \
x(subvol_unreachable, 259) \
x(btree_node_bkey_bad_u64s, 260)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

fs/bcachefs/str_hash.h

@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
int bch2_hash_set_snapshot(struct btree_trans *trans,
int bch2_hash_set_in_snapshot(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
struct bkey_i *insert,
bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
if (ret)
return ret;
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
snapshot, insert, str_hash_flags, 0);
u32 snapshot;
return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
bch2_hash_set_in_snapshot(trans, desc, info, inum,
snapshot, insert, str_hash_flags, 0);
}
static __always_inline

fs/bcachefs/subvolume.c

@ -13,13 +13,26 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
static struct bpos subvolume_children_pos(struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_subvolume)
return POS_MIN;
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
if (!s.v->fs_path_parent)
return POS_MIN;
return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
}
static int check_subvol(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_subvolume subvol;
struct btree_iter subvol_children_iter = {};
struct bch_snapshot snapshot;
struct printbuf buf = PRINTBUF;
unsigned snapid;
int ret = 0;
@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
return ret ?: -BCH_ERR_transaction_restart_nested;
}
if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
subvol.v->fs_path_parent,
c, subvol_root_fs_path_parent_nonzero,
"root subvolume has nonzero fs_path_parent\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct bkey_i_subvolume *n =
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(n);
if (ret)
goto err;
n->v.fs_path_parent = 0;
}
if (subvol.v->fs_path_parent) {
struct bpos pos = subvolume_children_pos(k);
struct bkey_s_c subvol_children_k =
bch2_bkey_get_iter(trans, &subvol_children_iter,
BTREE_ID_subvolume_children, pos, 0);
ret = bkey_err(subvol_children_k);
if (ret)
goto err;
if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
c, subvol_children_not_set,
"subvolume not set in subvolume_children btree at %llu:%llu\n%s",
pos.inode, pos.offset,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
if (ret)
goto err;
}
}
struct bch_inode_unpacked inode;
struct btree_iter inode_iter = {};
ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
0);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
ret = bch2_subvolume_delete(trans, iter->pos.offset);
bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
c, subvol_root_wrong_bi_subvol,
"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
inode.bi_inum, inode_iter.k.p.snapshot,
inode.bi_subvol, subvol.k->p.offset)) {
inode.bi_subvol = subvol.k->p.offset;
ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
if (ret)
goto err;
}
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
u32 snapshot_tree;
@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans,
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
}
err:
fsck_err:
bch2_trans_iter_exit(trans, &subvol_children_iter);
printbuf_exit(&buf);
return ret;
}
@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c)
return ret;
}
static int check_subvol_child(struct btree_trans *trans,
struct btree_iter *child_iter,
struct bkey_s_c child_k)
{
struct bch_fs *c = trans->c;
struct bch_subvolume s;
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
0, subvolume, &s);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (fsck_err_on(ret ||
le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
c, subvol_children_bad,
"incorrect entry in subvolume_children btree %llu:%llu",
child_k.k->p.inode, child_k.k->p.offset)) {
ret = bch2_btree_delete_at(trans, child_iter, 0);
if (ret)
goto err;
}
err:
fsck_err:
return ret;
}
int bch2_check_subvol_children(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_subvol_child(trans, &iter, k)));
bch_err_fn(c, ret);
return 0;
}
/* Subvolumes: */
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
le64_to_cpu(s.v->inode),
le32_to_cpu(s.v->snapshot));
if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
}
}
static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
{
return !bpos_eq(pos, POS_MIN)
? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
: 0;
}
int bch2_subvolume_trigger(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s new,
unsigned flags)
{
if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
struct bpos children_pos_old = subvolume_children_pos(old);
struct bpos children_pos_new = subvolume_children_pos(new.s_c);
if (!bpos_eq(children_pos_old, children_pos_new)) {
int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
subvolume_children_mod(trans, children_pos_new, true);
if (ret)
return ret;
}
}
return 0;
}
int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
{
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
struct bkey_s_c k = bch2_btree_iter_peek(&iter);
bch2_trans_iter_exit(trans, &iter);
return bkey_err(k) ?: k.k && k.k->p.inode == subvol
? -BCH_ERR_ENOTEMPTY_subvol_not_empty
: 0;
}
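(Index layout, following subvolume_children_pos() above: a subvolume S whose fs_path_parent is P is recorded as a KEY_TYPE_set key at POS(P, S) in BTREE_ID_subvolume_children, so the children of P form one contiguous scan. A hypothetical sketch, with transaction restart handling elided:)

	static int list_children_example(struct btree_trans *trans, u32 parent)
	{
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret = 0;

		bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children,
				     POS(parent, 0), 0);
		while ((k = bch2_btree_iter_peek(&iter)).k &&
		       !(ret = bkey_err(k)) &&
		       k.k->p.inode == parent) {
			/* k.k->p.offset is the id of one child of @parent */
			bch2_btree_iter_advance(&iter);
		}
		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}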
static __always_inline int
@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
return 0;
s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
if (ret)
return ret;
s->v.parent = cpu_to_le32(new_parent);
s->v.creation_parent = cpu_to_le32(new_parent);
return 0;
}
@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
subvolid_to_delete, le32_to_cpu(s.creation_parent)));
}
/*
@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
}
int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
u32 parent_subvolid,
u32 src_subvolid,
u32 *new_subvolid,
u32 *new_snapshotid,
@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
if (ret)
goto err;
new_subvol->v.flags = 0;
new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
new_subvol->v.inode = cpu_to_le64(inode);
new_subvol->v.parent = cpu_to_le32(src_subvolid);
new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
new_subvol->v.otime.hi = 0;
new_subvol->v.flags = 0;
new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]);
new_subvol->v.inode = cpu_to_le64(inode);
new_subvol->v.creation_parent = cpu_to_le32(src_subvolid);
new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid);
new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c));
new_subvol->v.otime.hi = 0;
SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);

fs/bcachefs/subvolume.h

@ -8,17 +8,22 @@
enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
int bch2_check_subvol_children(struct bch_fs *);
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \
.key_invalid = bch2_subvolume_invalid, \
.val_to_text = bch2_subvolume_to_text, \
.trigger = bch2_subvolume_trigger, \
.min_val_size = 16, \
})
int bch2_subvol_has_children(struct btree_trans *, u32);
int bch2_subvolume_get(struct btree_trans *, unsigned,
bool, int, struct bch_subvolume *);
int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@ -30,8 +35,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *);
void bch2_delete_dead_snapshots_async(struct bch_fs *);
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32,
u32 *, u32 *, bool);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
int bch2_fs_subvolumes_init(struct bch_fs *);

fs/bcachefs/subvolume_format.h

@ -19,8 +19,8 @@ struct bch_subvolume {
* This is _not_ necessarily the subvolume of the directory containing
* this subvolume:
*/
__le32 parent;
__le32 pad;
__le32 creation_parent;
__le32 fs_path_parent;
bch_le128 otime;
};

fs/bcachefs/super-io.c

@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
return ret;
}
if (rw == WRITE &&
bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
le64_to_cpu(sb->seq));
return -BCH_ERR_invalid_sb_members_missing;
}
return 0;
}
@ -717,6 +725,7 @@ retry:
if (IS_ERR(sb->s_bdev_file)) {
ret = PTR_ERR(sb->s_bdev_file);
prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
goto err;
}
sb->bdev = file_bdev(sb->s_bdev_file);
@ -743,9 +752,9 @@ retry:
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
printk(KERN_INFO "%s", err2.buf);
bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
else
printk(KERN_ERR "%s", err2.buf);
bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
@ -803,21 +812,20 @@ got_super:
goto err;
}
ret = 0;
sb->have_layout = true;
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
goto err_no_print;
}
out:
printbuf_exit(&err);
return ret;
err:
printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
path, err.buf);
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
path, err.buf);
err_no_print:
bch2_free_super(sb);
goto out;

fs/bcachefs/super.c

@ -56,6 +56,7 @@
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"
#include <linux/backing-dev.h>
@ -86,6 +87,23 @@ const char * const bch2_fs_flag_strs[] = {
NULL
};
void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
va_list args;
va_start(args, fmt);
if (likely(!stdio)) {
vprintk(fmt, args);
} else {
if (fmt[0] == KERN_SOH[0])
fmt += 2;
bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
}
va_end(args);
}
void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
@ -95,16 +113,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...)
if (likely(!stdio)) {
vprintk(fmt, args);
} else {
unsigned long flags;
if (fmt[0] == KERN_SOH[0])
fmt += 2;
spin_lock_irqsave(&stdio->output_lock, flags);
prt_vprintf(&stdio->output_buf, fmt, args);
spin_unlock_irqrestore(&stdio->output_lock, flags);
wake_up(&stdio->output_wait);
bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
}
va_end(args);
}
@ -576,7 +588,7 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_update_wq);
bch2_free_super(&c->disk_sb);
kvpfree(c, sizeof(*c));
kvfree(c);
module_put(THIS_MODULE);
}
@ -715,7 +727,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
unsigned i, iter_size;
int ret = 0;
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c) {
c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
goto out;
@ -818,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
pr_uuid(&name, c->sb.user_uuid.b);
strscpy(c->name, name.buf, sizeof(c->name));
printbuf_exit(&name);
ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
if (ret)
goto err;
strscpy(c->name, name.buf, sizeof(c->name));
printbuf_exit(&name);
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@ -862,13 +874,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@ -882,8 +894,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@ -1061,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
}
static int bch2_dev_in_fs(struct bch_sb_handle *fs,
struct bch_sb_handle *sb)
struct bch_sb_handle *sb,
struct bch_opts *opts)
{
if (fs == sb)
return 0;
@ -1102,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
prt_newline(&buf);
prt_printf(&buf, "Not using older sb");
if (!opts->no_splitbrain_check)
prt_printf(&buf, "Not using older sb");
pr_err("%s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_device_splitbrain;
if (!opts->no_splitbrain_check)
return -BCH_ERR_device_splitbrain;
}
struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@ -1124,17 +1140,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_newline(&buf);
prt_bdevname(&buf, fs->bdev);
prt_str(&buf, "believes seq of ");
prt_str(&buf, " believes seq of ");
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " to be %llu, but ", seq_from_fs);
prt_bdevname(&buf, sb->bdev);
prt_printf(&buf, " has %llu\n", seq_from_member);
prt_str(&buf, "Not using ");
prt_bdevname(&buf, sb->bdev);
if (!opts->no_splitbrain_check) {
prt_str(&buf, "Not using ");
prt_bdevname(&buf, sb->bdev);
}
pr_err("%s", buf.buf);
printbuf_exit(&buf);
return -BCH_ERR_device_splitbrain;
if (!opts->no_splitbrain_check)
return -BCH_ERR_device_splitbrain;
}
return 0;
@ -1168,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_dev_buckets_free(ca);
free_page((unsigned long) ca->sb_read_scratch);
bch2_time_stats_exit(&ca->io_latency[WRITE]);
bch2_time_stats_exit(&ca->io_latency[READ]);
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
@ -1260,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_init(&ca->io_latency[READ]);
bch2_time_stats_init(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
@ -1597,27 +1618,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
bch_err_msg(ca, ret, "dropping data");
bch_err_msg(ca, ret, "bch2_dev_data_drop()");
if (ret)
goto err;
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "deleting alloc info");
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
goto err;
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
bch_err_msg(ca, ret, "flushing journal");
bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
if (ret)
goto err;
ret = bch2_journal_flush(&c->journal);
bch_err(ca, "journal error");
bch_err_msg(ca, ret, "bch2_journal_flush()");
if (ret)
goto err;
ret = bch2_replicas_gc2(c);
bch_err_msg(ca, ret, "in replicas_gc2()");
bch_err_msg(ca, ret, "bch2_replicas_gc2()");
if (ret)
goto err;
@ -1835,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
ret = bch2_dev_in_fs(&c->disk_sb, &sb);
ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
@ -2023,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
ret = bch2_dev_in_fs(best, sb);
ret = bch2_dev_in_fs(best, sb, &opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {

fs/bcachefs/sysfs.c

@ -930,10 +930,10 @@ SHOW(bch2_dev)
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
bch2_time_stats_to_text(out, &ca->io_latency[READ]);
bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
if (attr == &sysfs_io_latency_stats_write)
bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)

fs/bcachefs/thread_with_file.c

@ -2,7 +2,6 @@
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "printbuf.h"
#include "thread_with_file.h"
#include <linux/anon_inodes.h>
@ -10,6 +9,7 @@
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/sched/sysctl.h>
void bch2_thread_with_file_exit(struct thread_with_file *thr)
{
@ -65,68 +65,82 @@ err:
return ret;
}
static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
/* stdio_redirect */
static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
{
return thr->stdio.output_buf.pos ||
thr->output2.nr ||
thr->thr.done;
return stdio->input.buf.nr || stdio->done;
}
static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
{
return stdio->output.buf.nr || stdio->done;
}
#define STDIO_REDIRECT_BUFSIZE 4096
static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
{
return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
}
static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
{
return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
}
static void stdio_buf_init(struct stdio_buf *buf)
{
spin_lock_init(&buf->lock);
init_waitqueue_head(&buf->wait);
darray_init(&buf->buf);
}
/* thread_with_stdio */
static void thread_with_stdio_done(struct thread_with_stdio *thr)
{
thr->thr.done = true;
thr->stdio.done = true;
wake_up(&thr->stdio.input.wait);
wake_up(&thr->stdio.output.wait);
}
static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
struct stdio_buf *buf = &thr->stdio.output;
size_t copied = 0, b;
int ret = 0;
if ((file->f_flags & O_NONBLOCK) &&
!thread_with_stdio_has_output(thr))
if (!(file->f_flags & O_NONBLOCK)) {
ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
if (ret)
return ret;
} else if (!stdio_redirect_has_output(&thr->stdio))
return -EAGAIN;
ret = wait_event_interruptible(thr->stdio.output_wait,
thread_with_stdio_has_output(thr));
if (ret)
return ret;
if (thr->thr.done)
return 0;
while (len) {
ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
if (ret)
break;
spin_lock_irq(&thr->stdio.output_lock);
b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
memmove(thr->stdio.output_buf.buf,
thr->stdio.output_buf.buf + b,
thr->stdio.output_buf.pos - b);
thr->output2.nr += b;
thr->stdio.output_buf.pos -= b;
spin_unlock_irq(&thr->stdio.output_lock);
b = min(len, thr->output2.nr);
if (!b)
break;
b -= copy_to_user(buf, thr->output2.data, b);
if (!b) {
while (len && buf->buf.nr) {
if (fault_in_writeable(ubuf, len) == len) {
ret = -EFAULT;
break;
}
copied += b;
buf += b;
len -= b;
spin_lock_irq(&buf->lock);
b = min_t(size_t, len, buf->buf.nr);
memmove(thr->output2.data,
thr->output2.data + b,
thr->output2.nr - b);
thr->output2.nr -= b;
if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
ubuf += b;
len -= b;
copied += b;
buf->buf.nr -= b;
memmove(buf->buf.data,
buf->buf.data + b,
buf->buf.nr);
}
spin_unlock_irq(&buf->lock);
}
return copied ?: ret;
@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
thread_with_stdio_done(thr);
bch2_thread_with_file_exit(&thr->thr);
printbuf_exit(&thr->stdio.input_buf);
printbuf_exit(&thr->stdio.output_buf);
darray_exit(&thr->output2);
thr->exit(thr);
darray_exit(&thr->stdio.input.buf);
darray_exit(&thr->stdio.output.buf);
thr->ops->exit(thr);
return 0;
}
#define WRITE_BUFFER 4096
static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
{
return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
}
static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
size_t len, loff_t *ppos)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
struct printbuf *buf = &thr->stdio.input_buf;
struct stdio_buf *buf = &thr->stdio.input;
size_t copied = 0;
ssize_t ret = 0;
@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
break;
}
spin_lock(&thr->stdio.input_lock);
if (buf->pos < WRITE_BUFFER)
bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
b = min(len, printbuf_remaining_size(buf));
spin_lock(&buf->lock);
if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
darray_make_room_gfp(&buf->buf,
min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
b = min(len, darray_room(buf->buf));
if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
ubuf += b;
len -= b;
copied += b;
buf->pos += b;
if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
buf->buf.nr += b;
ubuf += b;
len -= b;
copied += b;
}
spin_unlock(&thr->stdio.input_lock);
spin_unlock(&buf->lock);
if (b) {
wake_up(&thr->stdio.input_wait);
wake_up(&buf->wait);
} else {
if ((file->f_flags & O_NONBLOCK)) {
ret = -EAGAIN;
break;
}
ret = wait_event_interruptible(thr->stdio.input_wait,
thread_with_stdio_has_input_space(thr));
ret = wait_event_interruptible(buf->wait,
stdio_redirect_has_input_space(&thr->stdio));
if (ret)
break;
}
@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
poll_wait(file, &thr->stdio.output_wait, wait);
poll_wait(file, &thr->stdio.input_wait, wait);
poll_wait(file, &thr->stdio.output.wait, wait);
poll_wait(file, &thr->stdio.input.wait, wait);
__poll_t mask = 0;
if (thread_with_stdio_has_output(thr))
if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
if (thread_with_stdio_has_input_space(thr))
if (stdio_redirect_has_input_space(&thr->stdio))
mask |= EPOLLOUT;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
poll_wait(file, &thr->stdio.output.wait, wait);
__poll_t mask = 0;
if (stdio_redirect_has_output(&thr->stdio))
mask |= EPOLLIN;
if (thr->thr.done)
mask |= EPOLLHUP|EPOLLERR;
return mask;
}
static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
return thr->thr.ret;
}
static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
{
struct thread_with_stdio *thr =
container_of(file->private_data, struct thread_with_stdio, thr);
if (thr->ops->unlocked_ioctl)
return thr->ops->unlocked_ioctl(thr, cmd, p);
return -ENOTTY;
}
static const struct file_operations thread_with_stdio_fops = {
.release = thread_with_stdio_release,
.llseek = no_llseek,
.read = thread_with_stdio_read,
.write = thread_with_stdio_write,
.poll = thread_with_stdio_poll,
.llseek = no_llseek,
.flush = thread_with_stdio_flush,
.release = thread_with_stdio_release,
.unlocked_ioctl = thread_with_stdio_ioctl,
};
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
void (*exit)(struct thread_with_stdio *),
int (*fn)(void *))
static const struct file_operations thread_with_stdout_fops = {
.llseek = no_llseek,
.read = thread_with_stdio_read,
.poll = thread_with_stdout_poll,
.flush = thread_with_stdio_flush,
.release = thread_with_stdio_release,
.unlocked_ioctl = thread_with_stdio_ioctl,
};
static int thread_with_stdio_fn(void *arg)
{
thr->stdio.input_buf = PRINTBUF;
thr->stdio.input_buf.atomic++;
spin_lock_init(&thr->stdio.input_lock);
init_waitqueue_head(&thr->stdio.input_wait);
struct thread_with_stdio *thr = arg;
thr->stdio.output_buf = PRINTBUF;
thr->stdio.output_buf.atomic++;
spin_lock_init(&thr->stdio.output_lock);
init_waitqueue_head(&thr->stdio.output_wait);
thr->thr.ret = thr->ops->fn(thr);
darray_init(&thr->output2);
thr->exit = exit;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
thread_with_stdio_done(thr);
return 0;
}
int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
wait_event(stdio->input_wait,
stdio->input_buf.pos || stdio->done);
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
}
int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
const struct thread_with_stdio_ops *ops)
{
stdio_buf_init(&thr->stdio.input);
stdio_buf_init(&thr->stdio.output);
thr->ops = ops;
return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
}
EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
struct stdio_buf *buf = &stdio->input;
/*
* we're waiting on user input (or for the file descriptor to be
* closed), don't want a hung task warning:
*/
do {
wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
sysctl_hung_task_timeout_secs * HZ / 2);
} while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
spin_lock(&stdio->input_lock);
int ret = min(len, stdio->input_buf.pos);
stdio->input_buf.pos -= ret;
memcpy(buf, stdio->input_buf.buf, ret);
memmove(stdio->input_buf.buf,
stdio->input_buf.buf + ret,
stdio->input_buf.pos);
spin_unlock(&stdio->input_lock);
spin_lock(&buf->lock);
int ret = min(len, buf->buf.nr);
buf->buf.nr -= ret;
memcpy(ubuf, buf->buf.data, ret);
memmove(buf->buf.data,
buf->buf.data + ret,
buf->buf.nr);
spin_unlock(&buf->lock);
wake_up(&stdio->input_wait);
wake_up(&buf->wait);
return ret;
}
int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
{
wait_event(stdio->input_wait,
stdio->input_buf.pos || stdio->done);
struct stdio_buf *buf = &stdio->input;
size_t copied = 0;
ssize_t ret = 0;
again:
do {
wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
sysctl_hung_task_timeout_secs * HZ / 2);
} while (!stdio_redirect_has_input(stdio));
if (stdio->done)
return -1;
if (stdio->done) {
ret = -1;
goto out;
}
spin_lock(&stdio->input_lock);
int ret = min(len, stdio->input_buf.pos);
char *n = memchr(stdio->input_buf.buf, '\n', ret);
spin_lock(&buf->lock);
size_t b = min(len, buf->buf.nr);
char *n = memchr(buf->buf.data, '\n', b);
if (n)
ret = min(ret, n + 1 - stdio->input_buf.buf);
stdio->input_buf.pos -= ret;
memcpy(buf, stdio->input_buf.buf, ret);
memmove(stdio->input_buf.buf,
stdio->input_buf.buf + ret,
stdio->input_buf.pos);
spin_unlock(&stdio->input_lock);
b = min_t(size_t, b, n + 1 - buf->buf.data);
buf->buf.nr -= b;
memcpy(ubuf, buf->buf.data, b);
memmove(buf->buf.data,
buf->buf.data + b,
buf->buf.nr);
ubuf += b;
len -= b;
copied += b;
spin_unlock(&buf->lock);
wake_up(&buf->wait);
if (!n && len)
goto again;
out:
return copied ?: ret;
}
__printf(3, 0)
static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
{
ssize_t ret;
do {
va_list args2;
size_t len;
va_copy(args2, args);
len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
va_end(args2);
if (len + 1 <= darray_room(*out)) {
out->nr += len;
return len;
}
ret = darray_make_room_gfp(out, len + 1, gfp);
} while (ret == 0);
return ret;
}
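	/* (Standard vsnprintf() sizing pattern: format into the room available;
	 * vsnprintf() returns the length the output would need, so if len + 1
	 * bytes, including the terminating NUL, do not fit, grow the darray by
	 * that much and retry. The gfp argument matters because the caller
	 * below formats under buf->lock with interrupts disabled, hence
	 * GFP_NOWAIT.) */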
ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
const char *fmt, va_list args)
{
struct stdio_buf *buf = &stdio->output;
unsigned long flags;
ssize_t ret;
again:
spin_lock_irqsave(&buf->lock, flags);
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
spin_unlock_irqrestore(&buf->lock, flags);
if (ret < 0) {
if (nonblocking)
return -EAGAIN;
ret = wait_event_interruptible(buf->wait,
stdio_redirect_has_output_space(stdio));
if (ret)
return ret;
goto again;
}
wake_up(&buf->wait);
return ret;
}
ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
const char *fmt, ...)
{
va_list args;
ssize_t ret;
va_start(args, fmt);
ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
va_end(args);
wake_up(&stdio->input_wait);
return ret;
}

fs/bcachefs/thread_with_file.h

@ -4,6 +4,38 @@
#include "thread_with_file_types.h"
/*
* Thread with file: Run a kthread and connect it to a file descriptor, so that
* it can be interacted with via fd read/write methods and closing the file
* descriptor stops the kthread.
*
* We have two different APIs:
*
* thread_with_file, the low level version.
* You get to define the full file_operations, including your release function,
* which means that you must call bch2_thread_with_file_exit() from your
* .release method
*
* thread_with_stdio, the higher level version
* This implements full piping of input and output, including .poll.
*
* Notes on behaviour:
* - kthread shutdown behaves like writing or reading from a pipe that has been
* closed
* - Input and output buffers are 4096 bytes, although buffers may in some
* situations slightly exceed that limit so as to avoid chopping off a
* message in the middle in nonblocking mode.
* - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
* should be fine but might change in future revisions.
* - Output buffer may grow past 4096 bytes to deal with messages that are
* bigger than 4096 bytes
* - Writing may be done blocking or nonblocking; in nonblocking mode, we only
* drop entire messages.
*
* To write, use stdio_redirect_printf()
* To read, use stdio_redirect_read() or stdio_redirect_readline()
*/
struct task_struct;
struct thread_with_file {
@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *,
const struct file_operations *,
int (*fn)(void *));
struct thread_with_stdio;
struct thread_with_stdio_ops {
void (*exit)(struct thread_with_stdio *);
int (*fn)(struct thread_with_stdio *);
long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
};
struct thread_with_stdio {
struct thread_with_file thr;
struct stdio_redirect stdio;
DARRAY(char) output2;
void (*exit)(struct thread_with_stdio *);
const struct thread_with_stdio_ops *ops;
};
static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
{
thr->thr.done = true;
thr->stdio.done = true;
wake_up(&thr->stdio.input_wait);
wake_up(&thr->stdio.output_wait);
}
int bch2_run_thread_with_stdio(struct thread_with_stdio *,
void (*exit)(struct thread_with_stdio *),
int (*fn)(void *));
const struct thread_with_stdio_ops *);
int bch2_run_thread_with_stdout(struct thread_with_stdio *,
const struct thread_with_stdio_ops *);
int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
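(A hypothetical consumer of the ops-based interface above; all my_* names are invented for illustration:)

	struct my_thr {
		struct thread_with_stdio	stdio_thr;
		/* consumer state here */
	};

	static void my_exit(struct thread_with_stdio *thr)
	{
		kfree(container_of(thr, struct my_thr, stdio_thr));
	}

	static int my_fn(struct thread_with_stdio *thr)
	{
		char line[128];

		/* readline returns -1 once the file descriptor is closed: */
		while (bch2_stdio_redirect_readline(&thr->stdio, line, sizeof(line)) >= 0)
			bch2_stdio_redirect_printf(&thr->stdio, false, "got: %s", line);
		return 0;
	}

	static const struct thread_with_stdio_ops my_ops = {
		.exit	= my_exit,
		.fn	= my_fn,
		/* .unlocked_ioctl is optional */
	};

	/* bch2_run_thread_with_stdio(&thr->stdio_thr, &my_ops) spawns the
	 * kthread and returns a file descriptor (or -errno); reads and writes
	 * on that fd pipe to and from my_fn. */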

fs/bcachefs/thread_with_file_types.h

@ -2,14 +2,21 @@
#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
#include "darray.h"
struct stdio_buf {
spinlock_t lock;
wait_queue_head_t wait;
darray_char buf;
};
struct stdio_redirect {
spinlock_t output_lock;
wait_queue_head_t output_wait;
struct printbuf output_buf;
struct stdio_buf input;
struct stdio_buf output;
spinlock_t input_lock;
wait_queue_head_t input_wait;
struct printbuf input_buf;
darray_char input_buf;
bool done;
};

fs/bcachefs/time_stats.c (new file, 165 lines)

@@ -0,0 +1,165 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/time.h>
#include <linux/spinlock.h>
#include "eytzinger.h"
#include "time_stats.h"
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "d", (u64) NSEC_PER_SEC * 3600 * 24},
{ "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7},
{ "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
{ "eon", U64_MAX },
};
const struct time_unit *bch2_pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
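The loop only advances to a larger unit once the value is at least two of it, so a value never prints as a single unit of something too coarse. For instance:

	const struct time_unit *u;

	u = bch2_pick_time_units((u64)  90 * NSEC_PER_SEC);	/*  90s <  2*60s -> stays "s" */
	u = bch2_pick_time_units((u64) 120 * NSEC_PER_SEC);	/* 120s >= 2*60s -> moves to "m" */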
static void quantiles_update(struct quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void time_stats_update_one(struct bch2_time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
bool initted = stats->last_event != 0;
if (time_after64(end, start)) {
struct quantiles *quantiles = time_stats_to_quantiles(stats);
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted,
duration, initted, TIME_STATS_MV_WEIGHT);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
if (quantiles)
quantiles_update(quantiles, duration);
}
if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted,
freq, initted, TIME_STATS_MV_WEIGHT);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
}
stats->last_event = end;
}
void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct time_stat_buffer *b)
{
for (struct time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
struct time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
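The intended calling convention, as a sketch: initialize once, capture a start time with local_clock() around each event, and tear down at exit. Once an event source gets hot enough, updates batch through the percpu buffer automatically (do_io() is a hypothetical workload):

	struct bch2_time_stats stats;

	bch2_time_stats_init(&stats);

	u64 start = local_clock();
	do_io();
	bch2_time_stats_update(&stats, start);	/* end time taken from local_clock() */

	bch2_time_stats_exit(&stats);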

fs/bcachefs/time_stats.h (new file, 159 lines)

@@ -0,0 +1,159 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* bch2_time_stats - collect statistics on events that have a duration, with nicely
* formatted textual output on demand
*
* - percpu buffering of event collection: cheap enough to shotgun
* everywhere without worrying about overhead
*
* tracks:
* - number of events
* - maximum event duration ever seen
* - sum of all event durations
* - average event duration, standard and weighted
* - standard deviation of event durations, standard and weighted
* and analogous statistics for the frequency of events
*
* We provide both mean and weighted mean (exponentially weighted), and standard
* deviation and weighted standard deviation, to give an efficient-to-compute
* view of current behaviour versus average behaviour - "did this event source
* just become wonky, or is this typical?".
*
* Particularly useful for tracking down latency issues.
*/
#ifndef _BCACHEFS_TIME_STATS_H
#define _BCACHEFS_TIME_STATS_H
#include <linux/sched/clock.h>
#include <linux/spinlock_types.h>
#include <linux/string.h>
#include "mean_and_variance.h"
struct time_unit {
const char *name;
u64 nsecs;
};
/*
* given a nanosecond value, pick the preferred time units for printing:
*/
const struct time_unit *bch2_pick_time_units(u64 ns);
/*
* quantiles - do not use:
*
* Only enabled when the stats are embedded in a struct bch2_time_stats_quantiles,
* which sets bch2_time_stats->have_quantiles - don't use in new code.
*/
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct quantiles {
struct quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct time_stat_buffer {
unsigned nr;
struct time_stat_buffer_entry {
u64 start;
u64 end;
} entries[31];
};
struct bch2_time_stats {
spinlock_t lock;
bool have_quantiles;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
u64 last_event_start;
struct mean_and_variance duration_stats;
struct mean_and_variance freq_stats;
/* default weight for weighted mean and variance calculations */
#define TIME_STATS_MV_WEIGHT 8
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance_weighted freq_stats_weighted;
struct time_stat_buffer __percpu *buffer;
};
struct bch2_time_stats_quantiles {
struct bch2_time_stats stats;
struct quantiles quantiles;
};
static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
{
return stats->have_quantiles
? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
: NULL;
}
void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
/**
* bch2_time_stats_update - collect a new event being tracked
*
* @stats:	bch2_time_stats to update
* @start:	start time of event, recorded with local_clock()
*
* The end time of the event is taken to be the current time.
*/
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
/**
* track_event_change - track state change events
*
* @stats:	bch2_time_stats to update
* @v:		new state, true or false
*
* Use this when tracking time stats for state changes, e.g. resource X becoming
* blocked/unblocked.
*/
static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
{
if (v != !!stats->last_event_start) {
if (!v) {
bch2_time_stats_update(stats, stats->last_event_start);
stats->last_event_start = 0;
} else {
stats->last_event_start = local_clock() ?: 1;
return true;
}
}
return false;
}
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
{
bch2_time_stats_exit(&statq->stats);
}
static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
{
bch2_time_stats_init(&statq->stats);
statq->stats.have_quantiles = true;
memset(&statq->quantiles, 0, sizeof(statq->quantiles));
}
#endif /* _BCACHEFS_TIME_STATS_H */
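For the legacy quantiles variant, the wrapper struct owns the lifecycle and the embedded stats member is what gets updated; quantiles_update() is reached internally via time_stats_to_quantiles(). A sketch:

	struct bch2_time_stats_quantiles q;

	bch2_time_stats_quantiles_init(&q);
	bch2_time_stats_update(&q.stats, start);	/* quantiles updated internally */
	bch2_time_stats_quantiles_exit(&q);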

fs/bcachefs/trace.h

@@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
TRACE_EVENT(error_downcast,
TP_PROTO(int bch_err, int std_err, unsigned long ip),
TP_ARGS(bch_err, std_err, ip),
TP_STRUCT__entry(
__array(char, bch_err, 32 )
__array(char, std_err, 32 )
__array(char, ip, 32 )
),
TP_fast_assign(
strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
),
TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
);
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */

fs/bcachefs/util.c

@@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
}
#endif
static const struct time_unit {
const char *name;
u64 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "s", NSEC_PER_SEC },
{ "m", (u64) NSEC_PER_SEC * 60},
{ "h", (u64) NSEC_PER_SEC * 3600},
{ "eon", U64_MAX },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
void bch2_pr_time_units(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
{
unsigned i = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct bch2_quantile_entry *e = q->entries + i;
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
u64 start, u64 end)
{
u64 duration, freq;
if (time_after64(end, start)) {
duration = end - start;
mean_and_variance_update(&stats->duration_stats, duration);
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
}
stats->last_event = end;
}
static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
for (struct bch2_time_stat_buffer_entry *i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
bch2_time_stats_update_one(stats, i->start, i->end);
b->nr = 0;
}
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
__bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
WARN_ONCE(!stats->duration_stats_weighted.weight ||
!stats->freq_stats_weighted.weight,
"uninitialized time_stats");
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct bch2_time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct bch2_time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
.start = start,
.end = end
};
if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
bch2_time_stats_clear_buffer(stats, b);
preempt_enable();
}
}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
const struct time_unit *u = bch2_pick_time_units(ns);
prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
prt_tab_rjust(out);
@@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
struct quantiles *quantiles = time_stats_to_quantiles(stats);
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
u64 f_stddev = 0, d_stddev = 0;
if (stats->buffer) {
int cpu;
@@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, d_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, d_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
@@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
prt_tab(out);
bch2_pr_time_units_aligned(out, f_mean);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
prt_newline(out);
prt_printf(out, "stddev:");
prt_tab(out);
bch2_pr_time_units_aligned(out, f_stddev);
prt_tab(out);
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
printbuf_indent_sub(out, 2);
prt_newline(out);
printbuf_tabstops_reset(out);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
if (quantiles) {
int i = eytzinger0_first(NR_QUANTILES);
const struct time_unit *u =
bch2_pick_time_units(quantiles->entries[i].m);
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
prt_printf(out, "quantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
prt_printf(out, "%llu ",
div_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
u64 q = max(quantiles->entries[i].m, last_q);
prt_printf(out, "%llu ", div_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
last_q = q;
}
}
}
#else
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
stats->duration_stats_weighted.weight = 8;
stats->freq_stats_weighted.weight = 8;
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
/* ratelimit: */
@@ -1007,28 +850,6 @@ void sort_cmp_size(void *base, size_t num, size_t size,
}
}
static void mempool_free_vp(void *element, void *pool_data)
{
size_t size = (size_t) pool_data;
vpfree(element, size);
}
static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t) pool_data;
return vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return size < PAGE_SIZE
? mempool_init_kmalloc_pool(pool, min_nr, size)
: mempool_init(pool, min_nr, mempool_alloc_vp,
mempool_free_vp, (void *) size);
}
#if 0
void eytzinger1_test(void)
{

fs/bcachefs/util.h

@@ -21,6 +21,7 @@
#include "mean_and_variance.h"
#include "darray.h"
#include "time_stats.h"
struct closure;
@@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
static inline void vpfree(void *p, size_t size)
{
if (is_vmalloc_addr(p))
vfree(p);
else
free_pages((unsigned long) p, get_order(size));
}
static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
{
return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
get_order(size)) ?:
__vmalloc(size, gfp_mask);
}
static inline void kvpfree(void *p, size_t size)
{
if (size < PAGE_SIZE)
kfree(p);
else
vpfree(p, size);
}
static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
{
return size < PAGE_SIZE
? kmalloc(size, gfp_mask)
: vpmalloc(size, gfp_mask);
}
int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
#define HEAP(type) \
struct { \
size_t size, used; \
@@ -97,13 +66,13 @@ struct { \
({ \
(heap)->used = 0; \
(heap)->size = (_size); \
(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
(heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
(gfp)); \
})
#define free_heap(heap) \
do { \
kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \
kvfree((heap)->data); \
(heap)->data = NULL; \
} while (0)
@@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
#endif
}
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct bch2_quantiles {
struct bch2_quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct bch2_time_stat_buffer {
unsigned nr;
struct bch2_time_stat_buffer_entry {
u64 start;
u64 end;
} entries[32];
};
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
u64 min_duration;
u64 max_duration;
u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
struct bch2_quantiles quantiles;
struct mean_and_variance duration_stats;
struct mean_and_variance_weighted duration_stats_weighted;
struct mean_and_variance freq_stats;
struct mean_and_variance_weighted freq_stats_weighted;
struct bch2_time_stat_buffer __percpu *buffer;
};
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
if (v != !!*start) {
if (!v) {
bch2_time_stats_update(stats, *start);
*start = 0;
} else {
*start = local_clock() ?: 1;
return true;
}
}
return false;
}
#else
static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
static inline bool track_event_change(struct bch2_time_stats *stats,
u64 *start, bool v)
{
bool ret = v && !*start;
*start = v;
return ret;
}
#endif
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
#define ewma_add(ewma, val, weight) \
({ \
typeof(ewma) _ewma = (ewma); \
@@ -788,8 +681,12 @@ static inline void __move_gap(void *array, size_t element_size,
}
/* Move the gap in a gap buffer: */
#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \
__move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
#define move_gap(_d, _new_gap) \
do { \
__move_gap((_d)->data, sizeof((_d)->data[0]), \
(_d)->nr, (_d)->size, (_d)->gap, _new_gap); \
(_d)->gap = _new_gap; \
} while (0)
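The reworked move_gap() takes the gap buffer itself - anything with data/nr/size/gap members - rather than five loose arguments, and updates ->gap as a side effect. A sketch against a hypothetical gap buffer of u64 keys:

	struct {
		u64	*data;
		size_t	nr, size, gap;
	} keys;

	/* move the gap to idx, then insert there: */
	move_gap(&keys, idx);
	keys.data[keys.gap++] = new_key;
	keys.nr++;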
#define bubble_sort(_base, _nr, _cmp) \
do { \
@@ -876,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
void bch2_darray_str_exit(darray_str *);
int bch2_split_devs(const char *, darray_str *);
#ifdef __KERNEL__
__must_check
static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
{
return copy_to_user(to, from, n) ? -EFAULT : 0;
}
__must_check
static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
{
return copy_from_user(to, from, n) ? -EFAULT : 0;
}
#endif
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
}
#endif /* _BCACHEFS_UTIL_H */
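The copy_(to|from)_user_errcode() helpers fold the "nonzero means bytes not copied" convention into a ready-made -EFAULT, collapsing the usual two-step dance in ioctl handlers to a single return. A sketch with hypothetical names:

static long foo_ioctl_getattr(struct foo_attrs __user *uarg)
{
	struct foo_attrs attrs = { .flags = FOO_DEFAULT };

	return copy_to_user_errcode(uarg, &attrs, sizeof(attrs));
}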

fs/bcachefs/xattr.c

@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
kfree(buf);
if (ret < 0)
return ret;
goto err_class_exit;
ret = bch2_opt_check_may_set(c, opt_id, v);
if (ret < 0)
return ret;
goto err_class_exit;
s.v = v + 1;
s.defined = true;
@@ -595,6 +595,7 @@ err:
(opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
err_class_exit:
return bch2_err_class(ret);
}

fs/inode.c

@@ -2033,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
static int __file_remove_privs(struct file *file, unsigned int flags)
int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2058,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2070,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
*/
int file_remove_privs(struct file *file)
{
return __file_remove_privs(file, 0);
return file_remove_privs_flags(file, 0);
}
EXPORT_SYMBOL(file_remove_privs);
@@ -2163,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags)
* Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries.
*/
ret = __file_remove_privs(file, flags);
ret = file_remove_privs_flags(file, flags);
if (ret)
return ret;

include/linux/fs.h

@@ -3074,6 +3074,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb);
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
const struct inode *inode);

include/linux/generic-radix-tree.h

@@ -5,7 +5,7 @@
* DOC: Generic radix trees/sparse arrays
*
* Very simple and minimalistic, supporting arbitrary size entries up to
* PAGE_SIZE.
* GENRADIX_NODE_SIZE.
*
* A genradix is defined with the type it will store, like so:
*
@@ -45,12 +45,15 @@
struct genradix_root;
#define GENRADIX_NODE_SHIFT 9
#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT)
struct __genradix {
struct genradix_root *root;
};
/*
* NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
* NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE:
*/
#define __GENRADIX_INITIALIZER \
@@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *);
static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
{
if (__builtin_constant_p(obj_size))
BUILD_BUG_ON(obj_size > PAGE_SIZE);
BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE);
else
BUG_ON(obj_size > PAGE_SIZE);
BUG_ON(obj_size > GENRADIX_NODE_SIZE);
if (!is_power_of_2(obj_size)) {
size_t objs_per_page = PAGE_SIZE / obj_size;
size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size;
return (idx / objs_per_page) * PAGE_SIZE +
return (idx / objs_per_page) * GENRADIX_NODE_SIZE +
(idx % objs_per_page) * obj_size;
} else {
return idx * obj_size;
@@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
#define __genradix_cast(_radix) (typeof((_radix)->type[0]) *)
#define __genradix_obj_size(_radix) sizeof((_radix)->type[0])
#define __genradix_objs_per_page(_radix) \
(PAGE_SIZE / sizeof((_radix)->type[0]))
(GENRADIX_NODE_SIZE / sizeof((_radix)->type[0]))
#define __genradix_page_remainder(_radix) \
(PAGE_SIZE % sizeof((_radix)->type[0]))
(GENRADIX_NODE_SIZE % sizeof((_radix)->type[0]))
#define __genradix_idx_to_offset(_radix, _idx) \
__idx_to_offset(_idx, __genradix_obj_size(_radix))
@@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter,
iter->offset += obj_size;
if (!is_power_of_2(obj_size) &&
(iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE)
iter->offset = round_up(iter->offset, PAGE_SIZE);
(iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE)
iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE);
iter->pos++;
}
@@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
return;
}
if ((iter->offset & (PAGE_SIZE - 1)) == 0)
iter->offset -= PAGE_SIZE % obj_size;
if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0)
iter->offset -= GENRADIX_NODE_SIZE % obj_size;
iter->offset -= obj_size;
iter->pos--;
@@ -263,7 +266,7 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter,
genradix_for_each_from(_radix, _iter, _p, 0)
#define genradix_last_pos(_radix) \
(SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1)
(SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1)
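None of this changes the API: a genradix is still declared with the type it stores and indexed like a sparse array, only the node granularity moves from PAGE_SIZE to the fixed 512-byte GENRADIX_NODE_SIZE. A usage sketch assuming the standard helpers from this header (struct foo hypothetical):

static GENRADIX(struct foo) foos;

	genradix_init(&foos);

	struct foo *f = genradix_ptr_alloc(&foos, idx, GFP_KERNEL);
	if (!f)
		return -ENOMEM;

	f = genradix_ptr(&foos, idx);	/* NULL if idx was never allocated */

	genradix_free(&foos);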
/**
* genradix_for_each_reverse - iterate over entry in a genradix, reverse order

include/linux/mempool.h

@@ -95,6 +95,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
(void *) size);
}
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kvfree(void *element, void *pool_data);
static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
{
return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
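This gives the same reserved-minimum guarantee as the kmalloc pools, but with kvmalloc's automatic fallback to vmalloc for sizes too big for the buddy allocator - which is what lets bcachefs delete its private kvpmalloc pool in the fs/bcachefs/util.c hunk above. A sketch:

	mempool_t pool;

	if (mempool_init_kvmalloc_pool(&pool, 2, 1 << 20))	/* reserve two 1MB elements */
		return -ENOMEM;

	void *buf = mempool_alloc(&pool, GFP_NOFS);	/* waits rather than failing */
	/* use buf, then: */
	mempool_free(buf, &pool);
	mempool_exit(&pool);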
/*
* A mempool_alloc_t and mempool_free_t for a simple page allocator that
* allocates pages of the order specified by pool_data

include/linux/sched.h

@@ -1639,8 +1639,8 @@
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
#define PF__HOLE__00800000 0x00800000
#define PF__HOLE__01000000 0x01000000
#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */

include/linux/sched/mm.h

@@ -236,16 +236,25 @@ static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);
if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
if (unlikely(pflags & (PF_MEMALLOC_NOIO |
PF_MEMALLOC_NOFS |
PF_MEMALLOC_NORECLAIM |
PF_MEMALLOC_NOWARN |
PF_MEMALLOC_PIN))) {
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precedence
* Stronger flags before weaker flags:
* NORECLAIM implies NOIO, which in turn implies NOFS
*/
if (pflags & PF_MEMALLOC_NOIO)
if (pflags & PF_MEMALLOC_NORECLAIM)
flags &= ~__GFP_DIRECT_RECLAIM;
else if (pflags & PF_MEMALLOC_NOIO)
flags &= ~(__GFP_IO | __GFP_FS);
else if (pflags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
if (pflags & PF_MEMALLOC_NOWARN)
flags |= __GFP_NOWARN;
if (pflags & PF_MEMALLOC_PIN)
flags &= ~__GFP_MOVABLE;
}
@@ -306,6 +315,24 @@ static inline void might_alloc(gfp_t gfp_mask)
might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}
/**
* memalloc_flags_save - Add a PF_* flag to current->flags, save old value
*
* This allows PF_* flags to be conveniently added, irrespective of current
* value, and then the old version restored with memalloc_flags_restore().
*/
static inline unsigned memalloc_flags_save(unsigned flags)
{
unsigned oldflags = ~current->flags & flags;
current->flags |= flags;
return oldflags;
}
static inline void memalloc_flags_restore(unsigned flags)
{
current->flags &= ~flags;
}
/**
* memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
*
@@ -320,9 +347,7 @@
*/
static inline unsigned int memalloc_noio_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
current->flags |= PF_MEMALLOC_NOIO;
return flags;
return memalloc_flags_save(PF_MEMALLOC_NOIO);
}
/**
@@ -335,7 +360,7 @@ static inline unsigned int memalloc_noio_save(void)
*/
static inline void memalloc_noio_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
memalloc_flags_restore(flags);
}
/**
@@ -352,9 +377,7 @@ static inline void memalloc_noio_restore(unsigned int flags)
*/
static inline unsigned int memalloc_nofs_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
current->flags |= PF_MEMALLOC_NOFS;
return flags;
return memalloc_flags_save(PF_MEMALLOC_NOFS);
}
/**
@@ -367,7 +390,7 @@ static inline unsigned int memalloc_nofs_save(void)
*/
static inline void memalloc_nofs_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
memalloc_flags_restore(flags);
}
/**
@@ -395,9 +418,7 @@ static inline void memalloc_nofs_restore(unsigned int flags)
*/
static inline unsigned int memalloc_noreclaim_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC;
current->flags |= PF_MEMALLOC;
return flags;
return memalloc_flags_save(PF_MEMALLOC);
}
/**
@@ -410,7 +431,7 @@ static inline unsigned int memalloc_noreclaim_save(void)
*/
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC) | flags;
memalloc_flags_restore(flags);
}
/**
@@ -425,10 +446,7 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
*/
static inline unsigned int memalloc_pin_save(void)
{
unsigned int flags = current->flags & PF_MEMALLOC_PIN;
current->flags |= PF_MEMALLOC_PIN;
return flags;
return memalloc_flags_save(PF_MEMALLOC_PIN);
}
/**
@@ -441,7 +459,7 @@ static inline unsigned int memalloc_pin_save(void)
*/
static inline void memalloc_pin_restore(unsigned int flags)
{
current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags;
memalloc_flags_restore(flags);
}
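Combined with the current_gfp_context() change above, the new save/restore pair lets a caller impose allocation behaviour on a whole call tree without plumbing gfp flags through it. A sketch:

	unsigned flags = memalloc_flags_save(PF_MEMALLOC_NORECLAIM | PF_MEMALLOC_NOWARN);

	/*
	 * Allocations below here, including in callees, behave as if
	 * __GFP_DIRECT_RECLAIM were cleared and __GFP_NOWARN were set:
	 */
	void *p = kmalloc(size, GFP_KERNEL);

	memalloc_flags_restore(flags);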
#ifdef CONFIG_MEMCG

kernel/hung_task.c

@@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* Zero means infinite timeout - no checking done:
*/
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:

lib/generic-radix-tree.c

@@ -5,7 +5,7 @@
#include <linux/gfp.h>
#include <linux/kmemleak.h>
#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *))
#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY)
struct genradix_node {
@@ -14,13 +14,13 @@ struct genradix_node {
struct genradix_node *children[GENRADIX_ARY];
/* Leaf: */
u8 data[PAGE_SIZE];
u8 data[GENRADIX_NODE_SIZE];
};
};
static inline int genradix_depth_shift(unsigned depth)
{
return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth;
return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth;
}
/*
@@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth)
/* depth that's needed for a genradix that can address up to ULONG_MAX: */
#define GENRADIX_MAX_DEPTH \
DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT)
DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT)
#define GENRADIX_DEPTH_MASK \
((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1))
@@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr);
static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask)
{
struct genradix_node *node;
node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO);
/*
* We're using pages (not slab allocations) directly for kernel data
* structures, so we need to explicitly inform kmemleak of them in order
* to avoid false positive memory leak reports.
*/
kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask);
return node;
return kzalloc(GENRADIX_NODE_SIZE, gfp_mask);
}
static inline void genradix_free_node(struct genradix_node *node)
{
kmemleak_free(node);
free_page((unsigned long)node);
kfree(node);
}
/*
@@ -200,7 +189,7 @@ restart:
i++;
iter->offset = round_down(iter->offset + objs_per_ptr,
objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) *
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) *
objs_per_page;
if (i == GENRADIX_ARY)
goto restart;
@@ -209,7 +198,7 @@ restart:
n = n->children[i];
}
return &n->data[iter->offset & (PAGE_SIZE - 1)];
return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek);
@@ -235,7 +224,7 @@ restart:
if (ilog2(iter->offset) >= genradix_depth_shift(level)) {
iter->offset = genradix_depth_size(level);
iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
iter->offset -= obj_size_plus_page_remainder;
iter->pos--;
@@ -251,7 +240,7 @@ restart:
size_t objs_per_ptr = genradix_depth_size(level);
iter->offset = round_down(iter->offset, objs_per_ptr);
iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page;
iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page;
if (!iter->offset)
return NULL;
@@ -267,7 +256,7 @@ restart:
n = n->children[i];
}
return &n->data[iter->offset & (PAGE_SIZE - 1)];
return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)];
}
EXPORT_SYMBOL(__genradix_iter_peek_prev);
@@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size,
{
size_t offset;
for (offset = 0; offset < size; offset += PAGE_SIZE)
for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE)
if (!__genradix_ptr_alloc(radix, offset, gfp_mask))
return -ENOMEM;

mm/mempool.c

@@ -590,6 +590,19 @@ void mempool_kfree(void *element, void *pool_data)
}
EXPORT_SYMBOL(mempool_kfree);
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kvmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kvmalloc);
void mempool_kvfree(void *element, void *pool_data)
{
kvfree(element);
}
EXPORT_SYMBOL(mempool_kvfree);
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.