// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_on_stack.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "extents.h"
#include "extent_update.h"

/*
 * This counts the number of iterators to the alloc & ec btrees we'll need when
 * inserting/removing this extent:
 */
static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	unsigned ret = 0;

	bkey_extent_entry_for_each(ptrs, entry) {
		switch (__extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
		case BCH_EXTENT_ENTRY_stripe_ptr:
			ret++;
		}
	}

	return ret;
}

static int count_iters_for_insert(struct btree_trans *trans,
				  struct bkey_s_c k,
				  unsigned offset,
				  struct bpos *end,
				  unsigned *nr_iters,
				  unsigned max_iters,
				  bool overwrite)
{
	int ret = 0;

	switch (k.k->type) {
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		*nr_iters += bch2_bkey_nr_alloc_ptrs(k);

		if (*nr_iters >= max_iters) {
			*end = bpos_min(*end, k.k->p);
			ret = 1;
		}

		break;
	case KEY_TYPE_reflink_p: {
		struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
		u64 idx = le64_to_cpu(p.v->idx);
		unsigned sectors = bpos_min(*end, p.k->p).offset -
			bkey_start_offset(p.k);
		struct btree_iter *iter;
		struct bkey_s_c r_k;

		for_each_btree_key(trans, iter,
				   BTREE_ID_REFLINK, POS(0, idx + offset),
				   BTREE_ITER_SLOTS, r_k, ret) {
			if (bkey_cmp(bkey_start_pos(r_k.k),
				     POS(0, idx + sectors)) >= 0)
				break;

			*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);

			if (*nr_iters >= max_iters) {
				struct bpos pos = bkey_start_pos(k.k);

				pos.offset += r_k.k->p.offset - idx;
				*end = bpos_min(*end, pos);
				ret = 1;
				break;
			}
		}

		bch2_trans_iter_put(trans, iter);
		break;
	}
	}

	return ret;
}

#define EXTENT_ITERS_MAX	(BTREE_ITER_MAX / 3)

int bch2_extent_atomic_end(struct btree_iter *iter,
			   struct bkey_i *insert,
			   struct bpos *end)
{
	struct btree_trans *trans = iter->trans;
	struct btree *b;
	struct btree_node_iter node_iter;
	struct bkey_packed *_k;
	unsigned nr_iters = 0;
	int ret;

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	b = iter->l[0].b;
	node_iter = iter->l[0].iter;

	BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);

	*end = bpos_min(insert->k.p, b->key.k.p);

	ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
				     &nr_iters, EXTENT_ITERS_MAX / 2, false);
	if (ret < 0)
		return ret;

	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
						      KEY_TYPE_discard))) {
		struct bkey unpacked;
		struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
		unsigned offset = 0;

		if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
			break;

		if (bkey_cmp(bkey_start_pos(&insert->k),
			     bkey_start_pos(k.k)) > 0)
			offset = bkey_start_offset(&insert->k) -
				bkey_start_offset(k.k);

		ret = count_iters_for_insert(trans, k, offset, end,
					     &nr_iters, EXTENT_ITERS_MAX, true);
		if (ret)
			break;

		bch2_btree_node_iter_advance(&node_iter, b);
	}

	return ret < 0 ? ret : 0;
}
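/*
 * A minimal usage sketch (not part of this file's interface): a write path
 * caller would typically trim its insert to an atomic range before
 * committing, roughly:
 *
 *	ret = bch2_extent_trim_atomic(k, iter);
 *	if (ret)
 *		return ret;
 *
 *	bch2_trans_update(trans, iter, k);
 *	ret = bch2_trans_commit(trans, disk_res, journal_seq, flags);
 *
 * bch2_trans_update()/bch2_trans_commit() are shown for illustration only;
 * their exact signatures vary between bcachefs versions.
 */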
int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
{
	struct bpos end;
	int ret;

	ret = bch2_extent_atomic_end(iter, k, &end);
	if (ret)
		return ret;

	bch2_cut_back(end, k);
	return 0;
}

int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
{
	struct bpos end;
	int ret;

	ret = bch2_extent_atomic_end(iter, k, &end);
	if (ret)
		return ret;

	return !bkey_cmp(end, k->k.p);
}

enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *trans,
		       struct btree_iter *iter,
		       struct bkey_i *insert,
		       unsigned *u64s)
{
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *_k;
	struct bkey unpacked;
	int sectors;

	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
						      KEY_TYPE_discard))) {
		struct bkey_s_c k = bkey_disassemble(l->b, _k, &unpacked);
		enum bch_extent_overlap overlap =
			bch2_extent_overlap(&insert->k, k.k);

		if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
			break;

		/*
		 * If we're overwriting an existing extent, we may need to emit
		 * a whiteout - unless we're inserting a new extent at the same
		 * position:
		 */
		if (k.k->needs_whiteout &&
		    (!bkey_whiteout(&insert->k) ||
		     bkey_cmp(k.k->p, insert->k.p)))
			*u64s += BKEY_U64s;

		/*
		 * If we're partially overwriting an existing extent which has
		 * been written out to disk, we'll need to emit a new version
		 * of that extent:
		 */
		if (bkey_written(l->b, _k) &&
		    overlap != BCH_EXTENT_OVERLAP_ALL)
			*u64s += _k->u64s;

		/* And we may be splitting an existing extent: */
		if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
			*u64s += _k->u64s;

		if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
		    (sectors = bch2_bkey_sectors_compressed(k))) {
			int flags = trans->flags & BTREE_INSERT_NOFAIL
				? BCH_DISK_RESERVATION_NOFAIL : 0;

			switch (bch2_disk_reservation_add(trans->c,
							  trans->disk_res,
							  sectors, flags)) {
			case 0:
				break;
			case -ENOSPC:
				return BTREE_INSERT_ENOSPC;
			default:
				BUG();
			}
		}

		if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
		    overlap == BCH_EXTENT_OVERLAP_MIDDLE)
			break;

		bch2_btree_node_iter_advance(&node_iter, l->b);
	}

	return BTREE_INSERT_OK;
}
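/*
 * A worked example (a sketch; the numbers are invented) for the disk
 * reservation taken above: splitting a compressed extent leaves two keys
 * that each still charge their full compressed size, since compressed data
 * can't be trimmed on disk. Overwriting the middle of a 128 sector extent
 * that compressed down to 32 sectors yields two halves that each charge
 * those 32 sectors, so accounted usage grows by
 * bch2_bkey_sectors_compressed() = 32 sectors - which is what
 * bch2_disk_reservation_add() reserves for here.
 */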
static void verify_extent_nonoverlapping(struct bch_fs *c,
					 struct btree *b,
					 struct btree_node_iter *_iter,
					 struct bkey_i *insert)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct btree_node_iter iter;
	struct bkey_packed *k;
	struct bkey uk;

	if (!expensive_debug_checks(c))
		return;

	iter = *_iter;
	k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
	BUG_ON(k &&
	       (uk = bkey_unpack_key(b, k),
		bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));

	iter = *_iter;
	k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
#if 0
	BUG_ON(k &&
	       (uk = bkey_unpack_key(b, k),
		bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
#else
	if (k &&
	    (uk = bkey_unpack_key(b, k),
	     bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
		char buf1[100];
		char buf2[100];

		bch2_bkey_to_text(&PBUF(buf1), &insert->k);
		bch2_bkey_to_text(&PBUF(buf2), &uk);

		bch2_dump_btree_node(b);
		panic("insert > next :\n"
		      "insert %s\n"
		      "next %s\n",
		      buf1, buf2);
	}
#endif
#endif
}

static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
			       struct bkey_i *insert)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_packed *k =
		bch2_btree_node_iter_bset_pos(&l->iter, l->b,
					      bset_tree_last(l->b));

	BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));

	EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
	verify_extent_nonoverlapping(c, l->b, &l->iter, insert);

	if (debug_check_bkeys(c))
		bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));

	bch2_bset_insert(l->b, &l->iter, k, insert, 0);
	bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
}

static void pack_push_whiteout(struct bch_fs *c, struct btree *b,
			       struct bpos pos)
{
	struct bkey_packed k;

	if (!bkey_pack_pos(&k, pos, b)) {
		struct bkey_i tmp;

		bkey_init(&tmp.k);
		tmp.k.p = pos;
		bkey_copy(&k, &tmp);
	}

	k.needs_whiteout = true;
	push_whiteout(c, b, &k);
}

static void
extent_drop(struct bch_fs *c, struct btree_iter *iter,
	    struct bkey_packed *_k, struct bkey_s k)
{
	struct btree_iter_level *l = &iter->l[0];

	if (!bkey_whiteout(k.k))
		btree_account_key_drop(l->b, _k);

	k.k->size = 0;
	k.k->type = KEY_TYPE_deleted;

	if (!btree_node_old_extent_overwrite(l->b) &&
	    k.k->needs_whiteout) {
		pack_push_whiteout(c, l->b, k.k->p);
		k.k->needs_whiteout = false;
	}

	if (_k >= btree_bset_last(l->b)->start) {
		unsigned u64s = _k->u64s;

		bch2_bset_delete(l->b, _k, _k->u64s);
		bch2_btree_node_iter_fix(iter, l->b, &l->iter, _k, u64s, 0);
	} else {
		extent_save(l->b, _k, k.k);
		bch2_btree_iter_fix_key_modified(iter, l->b, _k);
	}
}
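/*
 * Illustrative sketch (not from the original source) of the four overlap
 * cases extent_squash() below handles, with @insert the incoming extent and
 * @k the existing one:
 *
 *	FRONT:	insert: |=====|
 *		k:          |--------|	-> cut front of k
 *
 *	BACK:	insert:        |=====|
 *		k:      |--------|	-> cut back of k
 *
 *	ALL:	insert: |============|
 *		k:         |------|	-> drop k entirely
 *
 *	MIDDLE:	insert:     |==|
 *		k:      |--------|	-> split k in two
 */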
static void
extent_squash(struct bch_fs *c, struct btree_iter *iter,
	      struct bkey_i *insert,
	      struct bkey_packed *_k, struct bkey_s k,
	      enum bch_extent_overlap overlap)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_on_stack tmp, split;

	bkey_on_stack_init(&tmp);
	bkey_on_stack_init(&split);

	if (!btree_node_old_extent_overwrite(l->b)) {
		if (!bkey_whiteout(&insert->k) &&
		    !bkey_cmp(k.k->p, insert->k.p)) {
			insert->k.needs_whiteout = k.k->needs_whiteout;
			k.k->needs_whiteout = false;
		}
	} else {
		insert->k.needs_whiteout |= k.k->needs_whiteout;
	}

	switch (overlap) {
	case BCH_EXTENT_OVERLAP_FRONT:
		if (bkey_written(l->b, _k)) {
			bkey_on_stack_reassemble(&tmp, c, k.s_c);
			bch2_cut_front(insert->k.p, tmp.k);

			/*
			 * needs_whiteout was propagated to the new version of
			 * @k, @tmp:
			 */
			if (!btree_node_old_extent_overwrite(l->b))
				k.k->needs_whiteout = false;

			extent_drop(c, iter, _k, k);
			extent_bset_insert(c, iter, tmp.k);
		} else {
			btree_keys_account_val_delta(l->b, _k,
				bch2_cut_front_s(insert->k.p, k));
			extent_save(l->b, _k, k.k);

			/*
			 * No need to call bset_fix_invalidated_key - the start
			 * of the extent changed, but extents are indexed by
			 * where they end:
			 */
			bch2_btree_iter_fix_key_modified(iter, l->b, _k);
		}
		break;
	case BCH_EXTENT_OVERLAP_BACK:
		if (bkey_written(l->b, _k)) {
			bkey_on_stack_reassemble(&tmp, c, k.s_c);
			bch2_cut_back(bkey_start_pos(&insert->k), tmp.k);

			/*
			 * @tmp has a different position than @k,
			 * needs_whiteout should not be propagated:
			 */
			if (!btree_node_old_extent_overwrite(l->b))
				tmp.k->k.needs_whiteout = false;

			extent_drop(c, iter, _k, k);
			extent_bset_insert(c, iter, tmp.k);
		} else {
			/*
			 * The position of @k is changing, emit a whiteout if
			 * needs_whiteout is set:
			 */
			if (!btree_node_old_extent_overwrite(l->b) &&
			    k.k->needs_whiteout) {
				pack_push_whiteout(c, l->b, k.k->p);
				k.k->needs_whiteout = false;
			}

			btree_keys_account_val_delta(l->b, _k,
				bch2_cut_back_s(bkey_start_pos(&insert->k), k));
			extent_save(l->b, _k, k.k);

			bch2_bset_fix_invalidated_key(l->b, _k);
			bch2_btree_node_iter_fix(iter, l->b, &l->iter,
						 _k, _k->u64s, _k->u64s);
		}
		break;
	case BCH_EXTENT_OVERLAP_ALL:
		extent_drop(c, iter, _k, k);
		break;
	case BCH_EXTENT_OVERLAP_MIDDLE:
		bkey_on_stack_reassemble(&split, c, k.s_c);
		bch2_cut_back(bkey_start_pos(&insert->k), split.k);

		if (!btree_node_old_extent_overwrite(l->b))
			split.k->k.needs_whiteout = false;

		/* this is identical to BCH_EXTENT_OVERLAP_FRONT: */
		if (bkey_written(l->b, _k)) {
			bkey_on_stack_reassemble(&tmp, c, k.s_c);
			bch2_cut_front(insert->k.p, tmp.k);

			if (!btree_node_old_extent_overwrite(l->b))
				k.k->needs_whiteout = false;

			extent_drop(c, iter, _k, k);
			extent_bset_insert(c, iter, tmp.k);
		} else {
			btree_keys_account_val_delta(l->b, _k,
				bch2_cut_front_s(insert->k.p, k));

			extent_save(l->b, _k, k.k);
			bch2_btree_iter_fix_key_modified(iter, l->b, _k);
		}

		extent_bset_insert(c, iter, split.k);
		break;
	}

	bkey_on_stack_exit(&split, c);
	bkey_on_stack_exit(&tmp, c);
}

/**
 * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
 *
 * This may result in not actually doing the insert, or inserting some subset
 * of the insert key. For cmpxchg operations this is where that logic lives.
 *
 * All subsets of @insert that need to be inserted are inserted and
 * journalled; iter->pos is set to the prefix of @insert that actually got
 * inserted.
 *
 * BSET INVARIANTS: this function is responsible for maintaining all the
 * invariants for bsets of extents in memory. Things get really hairy with 0
 * size extents.
 *
 * Within one bset:
 *
 * bkey_start_pos(bkey_next(k)) >= k
 * or bkey_start_offset(bkey_next(k)) >= k->offset
 *
 * i.e. strict ordering, no overlapping extents.
 *
 * Multiple bsets (i.e. full btree node):
 *
 * ∀ k, j
 *   k.size != 0 ∧ j.size != 0 →
 *     ¬ (k > bkey_start_pos(j) ∧ k < j)
 *
 * i.e. no two overlapping keys _of nonzero size_.
 *
 * We can't realistically maintain this invariant for zero size keys because of
 * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
 * there may be another 0 size key between them in another bset, and it will
 * thus overlap with the merged key.
 *
 * In addition, the end of iter->pos indicates how much has been processed.
 * If the end of iter->pos is not the same as the end of insert, then
 * key insertion needs to continue/be retried.
 */
void bch2_insert_fixup_extent(struct btree_trans *trans,
			      struct btree_iter *iter,
			      struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter = l->iter;
	bool do_update = !bkey_whiteout(&insert->k);
	struct bkey_packed *_k;
	struct bkey unpacked;

	EBUG_ON(iter->level);
	EBUG_ON(!insert->k.size);
	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));

	while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
						      KEY_TYPE_discard))) {
		struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
		enum bch_extent_overlap overlap =
			bch2_extent_overlap(&insert->k, k.k);

		if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
			break;

		if (!bkey_whiteout(k.k))
			do_update = true;

		if (!do_update) {
			struct bpos cur_end = bpos_min(insert->k.p, k.k->p);

			bch2_cut_front(cur_end, insert);
			bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
		} else {
			extent_squash(c, iter, insert, _k, k, overlap);
		}

		node_iter = l->iter;

		if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
		    overlap == BCH_EXTENT_OVERLAP_MIDDLE)
			break;
	}

	l->iter = node_iter;
	bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);

	if (do_update) {
		if (insert->k.type == KEY_TYPE_deleted)
			insert->k.type = KEY_TYPE_discard;

		if (!bkey_whiteout(&insert->k) ||
		    btree_node_old_extent_overwrite(l->b))
			extent_bset_insert(c, iter, insert);

		bch2_btree_journal_key(trans, iter, insert);
	}

	bch2_cut_front(insert->k.p, insert);
}
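/*
 * Illustrative (hypothetical) caller loop for the iter->pos contract
 * documented above bch2_insert_fixup_extent(): if iter->pos stops short of
 * insert->k.p, the remainder of the insert still needs processing, e.g.
 * after re-traversing to the next leaf:
 *
 *	while (bkey_cmp(iter->pos, insert->k.p) < 0) {
 *		ret = bch2_btree_iter_traverse(iter);
 *		if (ret)
 *			break;
 *
 *		bch2_insert_fixup_extent(trans, iter, insert);
 *	}
 *
 * This is a sketch only; the real retry logic lives in the transaction
 * commit path.
 */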