bcachefs: Rework btree iterator lifetimes

The btree_trans struct needs to memoize/cache btree iterators, so that
on transaction restart we don't have to completely redo btree lookups,
and so that we can do them all at once in the correct order when the
transaction had to restart to avoid a deadlock.

This switches the btree iterator lookups to work based on iterator
position, instead of trying to match them up based on the stack trace.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet
Date:   2019-09-26 22:21:39 -04:00 (committed by Kent Overstreet)
Parent: a7199432c3
Commit: 64bc001153
10 changed files with 179 additions and 152 deletions

View File

@ -1730,15 +1730,6 @@ static inline void bch2_btree_iter_init(struct btree_trans *trans,
/* new transactional stuff: */
/*
 * Release a transaction iterator back to the transaction.
 *
 * Returns any error state carried on @iter so callers can't silently drop
 * it when releasing the iterator.
 *
 * NOTE(review): this rendered diff also contains a later, extended
 * definition of this function (the post-change version below); only one of
 * the two can remain in a compilable source file.
 */
int bch2_trans_iter_put(struct btree_trans *trans,
			struct btree_iter *iter)
{
	int ret = btree_iter_err(iter);

	/* Clear the live bit; the slot stays linked for possible reuse: */
	trans->iters_live &= ~(1ULL << iter->idx);

	return ret;
}
static inline void __bch2_trans_iter_free(struct btree_trans *trans,
unsigned idx)
{
@ -1746,26 +1737,27 @@ static inline void __bch2_trans_iter_free(struct btree_trans *trans,
trans->iters_linked &= ~(1ULL << idx);
trans->iters_live &= ~(1ULL << idx);
trans->iters_touched &= ~(1ULL << idx);
trans->iters_unlink_on_restart &= ~(1ULL << idx);
trans->iters_unlink_on_commit &= ~(1ULL << idx);
}
/*
 * Release a transaction iterator back to the transaction.
 *
 * Returns any error state carried on @iter so callers can't silently drop
 * it when releasing the iterator.
 */
int bch2_trans_iter_put(struct btree_trans *trans,
			struct btree_iter *iter)
{
	int ret = btree_iter_err(iter);

	/*
	 * If the iterator wasn't touched since the last reset and isn't
	 * pinned until commit, free its slot immediately rather than keeping
	 * it cached for later lookups by position:
	 */
	if (!(trans->iters_touched & (1ULL << iter->idx)) &&
	    !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
		__bch2_trans_iter_free(trans, iter->idx);

	trans->iters_live &= ~(1ULL << iter->idx);
	return ret;
}
/*
 * Unconditionally free a transaction iterator's slot.
 *
 * Unlike bch2_trans_iter_put(), this always releases the slot: the touched
 * bit is cleared first so the iterator is not preserved for reuse.
 * Returns any error state carried on @iter.
 */
int bch2_trans_iter_free(struct btree_trans *trans,
			 struct btree_iter *iter)
{
	int ret = btree_iter_err(iter);

	trans->iters_touched &= ~(1ULL << iter->idx);

	__bch2_trans_iter_free(trans, iter->idx);

	return ret;
}
/*
 * Release @iter, deferring actual teardown until commit.
 *
 * The old implementation set a bit in trans->iters_unlink_on_commit, but
 * that field is removed from struct btree_trans in this change (see the
 * struct hunk later in this file); iterators that must survive until
 * commit are now tagged with BTREE_ITER_KEEP_UNTIL_COMMIT by
 * bch2_trans_update(), so this reduces to a plain put.  The merged diff
 * text had both bodies, leaving an unreachable second return statement.
 */
int bch2_trans_iter_free_on_commit(struct btree_trans *trans,
				   struct btree_iter *iter)
{
	return bch2_trans_iter_put(trans, iter);
}
static int bch2_trans_realloc_iters(struct btree_trans *trans,
@ -1839,7 +1831,25 @@ static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
goto got_slot;
if (trans->nr_iters == trans->size) {
int ret = bch2_trans_realloc_iters(trans, trans->size * 2);
int ret;
if (trans->nr_iters >= BTREE_ITER_MAX) {
struct btree_iter *iter;
trans_for_each_iter(trans, iter) {
pr_err("iter: btree %s pos %llu:%llu%s%s%s",
bch2_btree_ids[iter->btree_id],
iter->pos.inode,
iter->pos.offset,
(trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
(trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "");
}
panic("trans iter oveflow\n");
}
ret = bch2_trans_realloc_iters(trans, trans->size * 2);
if (ret)
return ERR_PTR(ret);
}
@ -1854,60 +1864,94 @@ got_slot:
return &trans->iters[idx];
}
/*
 * Copy @src into @dst, preserving @dst's slot index in the transaction's
 * iterator array.
 *
 * The copy references the same btree nodes as @src, so take an extra
 * reference on every node lock @src holds; both iterators can then drop
 * their locks independently.
 */
static inline void btree_iter_copy(struct btree_iter *dst,
				   struct btree_iter *src)
{
	unsigned i, idx = dst->idx;

	*dst = *src;		/* struct copy clobbers dst->idx... */
	dst->idx = idx;		/* ...so restore it */

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		if (btree_node_locked(dst, i))
			six_lock_increment(&dst->l[i].b->c.lock,
					   __btree_lock_want(dst, i));
}
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
{
if (bkey_cmp(l, r) > 0)
swap(l, r);
return POS(r.inode - l.inode, r.offset - l.offset);
}
static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
unsigned btree_id, struct bpos pos,
unsigned flags, u64 iter_id)
unsigned flags)
{
struct btree_iter *iter;
struct btree_iter *iter, *best = NULL;
BUG_ON(trans->nr_iters > BTREE_ITER_MAX);
trans_for_each_iter(trans, iter)
if (iter_id
? iter->id == iter_id
: (iter->btree_id == btree_id &&
!bkey_cmp(iter->pos, pos)))
goto found;
trans_for_each_iter(trans, iter) {
if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
continue;
iter = NULL;
found:
if (!iter) {
if (iter->btree_id != btree_id)
continue;
if (best &&
bkey_cmp(bpos_diff(best->pos, pos),
bpos_diff(iter->pos, pos)) < 0)
continue;
best = iter;
}
if (!best) {
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
iter->id = iter_id;
bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
} else if ((trans->iters_live & (1ULL << best->idx)) ||
(best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
btree_iter_copy(iter, best);
} else {
iter = best;
}
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iter->flags &= ~(BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_SLOTS|BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
if ((iter->flags & BTREE_ITER_INTENT) &&
!bch2_btree_iter_upgrade(iter, 1)) {
trace_trans_restart_upgrade(trans->ip);
return ERR_PTR(-EINTR);
}
}
BUG_ON(iter->btree_id != btree_id);
BUG_ON(trans->iters_live & (1ULL << iter->idx));
trans->iters_live |= 1ULL << iter->idx;
trans->iters_touched |= 1ULL << iter->idx;
if (iter->flags & BTREE_ITER_INTENT)
bch2_btree_iter_upgrade(iter, 1);
else
bch2_btree_iter_downgrade(iter);
BUG_ON(iter->btree_id != btree_id);
BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
BUG_ON(trans->iters_live & (1ULL << iter->idx));
trans->iters_live |= 1ULL << iter->idx;
trans->iters_touched |= 1ULL << iter->idx;
return iter;
}
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
struct btree_iter *bch2_trans_get_iter(struct btree_trans *trans,
enum btree_id btree_id,
struct bpos pos, unsigned flags,
u64 iter_id)
struct bpos pos, unsigned flags)
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos, flags, iter_id);
__btree_trans_get_iter(trans, btree_id, pos, flags);
if (!IS_ERR(iter))
bch2_btree_iter_set_pos(iter, pos);
@ -1923,7 +1967,7 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
{
struct btree_iter *iter =
__btree_trans_get_iter(trans, btree_id, pos,
flags|BTREE_ITER_NODES, 0);
flags|BTREE_ITER_NODES);
unsigned i;
BUG_ON(IS_ERR(iter));
@ -1943,24 +1987,20 @@ struct btree_iter *bch2_trans_copy_iter(struct btree_trans *trans,
struct btree_iter *src)
{
struct btree_iter *iter;
int idx, i;
iter = btree_trans_iter_alloc(trans);
if (IS_ERR(iter))
return iter;
idx = iter->idx;
*iter = *src;
iter->idx = idx;
btree_iter_copy(iter, src);
trans->iters_live |= 1ULL << idx;
trans->iters_touched |= 1ULL << idx;
trans->iters_unlink_on_restart |= 1ULL << idx;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_locked(iter, i))
six_lock_increment(&iter->l[i].b->c.lock,
__btree_lock_want(iter, i));
trans->iters_live |= 1ULL << iter->idx;
/*
* Don't mark it as touched, we don't need to preserve this iter since
* it's cheap to copy it again:
*/
trans->iters_touched &= ~(1ULL << iter->idx);
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
return iter;
}
@ -2001,10 +2041,11 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
inline void bch2_trans_unlink_iters(struct btree_trans *trans)
{
iters &= trans->iters_linked;
iters &= ~trans->iters_live;
u64 iters = trans->iters_linked &
~trans->iters_touched &
~trans->iters_live;
while (iters) {
unsigned idx = __ffs64(iters);
@ -2014,32 +2055,23 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans, u64 iters)
}
}
void bch2_trans_begin(struct btree_trans *trans)
void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
{
u64 iters_to_unlink;
struct btree_iter *iter;
/*
* On transaction restart, the transaction isn't required to allocate
* all the same iterators it on the last iteration:
*
* Unlink any iterators it didn't use this iteration, assuming it got
* further (allocated an iter with a higher idx) than where the iter
* was originally allocated:
*/
iters_to_unlink = ~trans->iters_live &
((1ULL << fls64(trans->iters_live)) - 1);
trans_for_each_iter(trans, iter)
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
iters_to_unlink |= trans->iters_unlink_on_restart;
iters_to_unlink |= trans->iters_unlink_on_commit;
bch2_trans_unlink_iters(trans);
if (flags & TRANS_RESET_ITERS)
trans->iters_live = 0;
bch2_trans_unlink_iters(trans, iters_to_unlink);
trans->iters_touched &= trans->iters_live;
trans->iters_touched = 0;
trans->iters_unlink_on_restart = 0;
trans->iters_unlink_on_commit = 0;
trans->nr_updates = 0;
if (flags & TRANS_RESET_MEM)
trans->mem_top = 0;
bch2_btree_iter_traverse_all(trans);

View File

@ -271,43 +271,30 @@ static inline int bkey_err(struct bkey_s_c k)
int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
int bch2_trans_iter_free_on_commit(struct btree_trans *, struct btree_iter *);
void bch2_trans_unlink_iters(struct btree_trans *, u64);
void bch2_trans_unlink_iters(struct btree_trans *);
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
struct bpos, unsigned, u64);
struct btree_iter *bch2_trans_get_iter(struct btree_trans *, enum btree_id,
struct bpos, unsigned);
struct btree_iter *bch2_trans_copy_iter(struct btree_trans *,
struct btree_iter *);
/*
 * Generate a pseudo-unique id for an iterator from its allocation site:
 * high 32 bits are the caller's return address, low 32 bits the current
 * instruction pointer.
 *
 * The old body started with "u64 ret = 0; ret <<= 32;" — a dead no-op
 * shift on zero that obscured the packing; this expresses it directly.
 * Casting to u64 before the shift keeps it well-defined on 32-bit
 * unsigned long.
 */
static __always_inline u64 __btree_iter_id(void)
{
	return ((u64)(_RET_IP_ & U32_MAX) << 32) | (_THIS_IP_ & U32_MAX);
}
/*
 * Wrapper that tags the lookup with an id derived from the call site (see
 * __btree_iter_id()), so repeated calls from the same place find the same
 * cached iterator — the pre-change, stack-trace-based matching scheme.
 *
 * NOTE(review): a non-inline bch2_trans_get_iter() taking only four
 * arguments is also declared above in this file; only one of the two
 * variants can survive in a compilable header.
 */
static __always_inline struct btree_iter *
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
		    struct bpos pos, unsigned flags)
{
	return __bch2_trans_get_iter(trans, btree_id, pos, flags,
				     __btree_iter_id());
}
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
void bch2_trans_begin(struct btree_trans *);
#define TRANS_RESET_ITERS (1 << 0)
#define TRANS_RESET_MEM (1 << 1)
void bch2_trans_reset(struct btree_trans *, unsigned);
/*
 * Start a new transaction iteration: unlink unused iterators and reset the
 * transaction's memory pool.
 *
 * The old body used "return bch2_trans_reset(...)" — returning an
 * expression from a void function, which ISO C forbids (C11 6.8.6.4).
 */
static inline void bch2_trans_begin(struct btree_trans *trans)
{
	bch2_trans_reset(trans, TRANS_RESET_ITERS|TRANS_RESET_MEM);
}
/*
 * Reset transaction state between updates within a single iteration: only
 * the memory pool is reset, cached iterators are preserved.
 *
 * bch2_trans_reset() already zeroes trans->nr_updates unconditionally, so
 * the explicit clear from the old body is redundant; the old "return" of a
 * void expression was also an ISO C constraint violation (C11 6.8.6.4).
 */
static inline void bch2_trans_begin_updates(struct btree_trans *trans)
{
	bch2_trans_reset(trans, TRANS_RESET_MEM);
}
void *bch2_trans_kmalloc(struct btree_trans *, size_t);

View File

@ -191,12 +191,13 @@ enum btree_iter_type {
#define BTREE_ITER_SLOTS (1 << 2)
#define BTREE_ITER_INTENT (1 << 3)
#define BTREE_ITER_PREFETCH (1 << 4)
#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 5)
#define BTREE_ITER_ERROR (1 << 6)
#define BTREE_ITER_IS_EXTENTS (1 << 6)
#define BTREE_ITER_ERROR (1 << 7)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
@ -237,8 +238,6 @@ struct btree_iter {
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
u64 id;
};
static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
@ -261,8 +260,6 @@ struct btree_trans {
u64 iters_linked;
u64 iters_live;
u64 iters_touched;
u64 iters_unlink_on_restart;
u64 iters_unlink_on_commit;
u8 nr_iters;
u8 nr_updates;

View File

@ -107,6 +107,8 @@ static inline void bch2_trans_update(struct btree_trans *trans,
{
EBUG_ON(trans->nr_updates >= trans->nr_iters + 4);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans->updates[trans->nr_updates++] = (struct btree_insert_entry) {
.iter = iter, .k = k
};

View File

@ -752,6 +752,7 @@ int bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
unsigned orig_nr_updates = trans->nr_updates;
unsigned orig_mem_top = trans->mem_top;
int ret = 0;
@ -814,9 +815,11 @@ out_noupdates:
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
trans_for_each_iter(trans, iter)
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
if (!ret) {
bch2_trans_unlink_iters(trans, ~trans->iters_touched|
trans->iters_unlink_on_commit);
bch2_trans_unlink_iters(trans);
trans->iters_touched = 0;
}
trans->nr_updates = 0;

View File

@ -1369,13 +1369,11 @@ static int trans_get_key(struct btree_trans *trans,
return 1;
}
*iter = __bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT, 0);
*iter = bch2_trans_get_iter(trans, btree_id, pos,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(*iter))
return PTR_ERR(*iter);
bch2_trans_iter_free_on_commit(trans, *iter);
*k = bch2_btree_iter_peek_slot(*iter);
ret = bkey_err(*k);
if (ret)

View File

@ -282,7 +282,7 @@ static int sum_sector_overwrites(struct btree_trans *trans,
old = bch2_btree_iter_next_slot(iter);
}
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, iter);
return 0;
}
@ -2786,7 +2786,7 @@ reassemble:
bch2_disk_reservation_put(c, &disk_res);
bkey_err:
if (del)
bch2_trans_iter_free(&trans, del);
bch2_trans_iter_put(&trans, del);
del = NULL;
if (!ret)

View File

@ -355,6 +355,7 @@ static void __bch2_write_index(struct bch_write_op *op)
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
BUG_ON(ret == -EINTR);
BUG_ON(keylist_sectors(keys) && !ret);
op->written += sectors_start - keylist_sectors(keys);
@ -1337,6 +1338,8 @@ retry:
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
}
if (ret == -EINTR)
goto retry;
/*
* If we get here, it better have been because there was an error
* reading a btree node
@ -1610,9 +1613,9 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
*offset_into_extent;
iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
POS(0, reflink_offset),
BTREE_ITER_SLOTS, 1);
BTREE_ITER_SLOTS);
ret = PTR_ERR_OR_ZERO(iter);
if (ret)
return ret;
@ -1888,8 +1891,6 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
BCH_READ_USER_MAPPED;
int ret;
bch2_trans_init(&trans, c, 0, 0);
BUG_ON(rbio->_state);
BUG_ON(flags & BCH_READ_NODECODE);
BUG_ON(flags & BCH_READ_IN_RETRY);
@ -1897,10 +1898,13 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
rbio->c = c;
rbio->start_time = local_clock();
bch2_trans_init(&trans, c, 0, 0);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
POS(inode, rbio->bio.bi_iter.bi_sector),
BTREE_ITER_SLOTS);
while (1) {
BKEY_PADDED(k) tmp;
unsigned bytes, sectors, offset_into_extent;
@ -1955,6 +1959,9 @@ out:
bch2_trans_exit(&trans);
return;
err:
if (ret == -EINTR)
goto retry;
bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_rbio_done(rbio);
goto out;

View File

@ -190,10 +190,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
BTREE_ITER_INTENT, 1);
dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
BTREE_ITER_INTENT, 2);
src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
BTREE_ITER_INTENT);
dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
BTREE_ITER_INTENT);
while (1) {
bch2_trans_begin_updates(&trans);

View File

@ -202,12 +202,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
if (k.k->type == desc.key_type &&
desc.hash_bkey(info, k) <= start->pos.offset) {
bch2_trans_iter_free_on_commit(trans, iter);
return 1;
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
ret = 1;
break;
}
}
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, iter);
return ret;
}
@ -247,11 +248,14 @@ int bch2_hash_set(struct btree_trans *trans,
goto not_found;
}
if (!ret)
ret = -ENOSPC;
out:
if (slot)
bch2_trans_iter_free(trans, slot);
bch2_trans_iter_free(trans, iter);
bch2_trans_iter_put(trans, slot);
bch2_trans_iter_put(trans, iter);
return ret ?: -ENOSPC;
return ret;
found:
found = true;
not_found:
@ -261,17 +265,14 @@ not_found:
} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot) {
bch2_trans_iter_free(trans, iter);
iter = slot;
}
if (!found && slot)
swap(iter, slot);
insert->k.p = iter->pos;
bch2_trans_update(trans, iter, insert);
bch2_trans_iter_free_on_commit(trans, iter);
}
return ret;
goto out;
}
static __always_inline