From f469c8bd90b7d595414e4b1876983dc94d0df47e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Apr 2023 11:33:10 +0100 Subject: [PATCH 001/194] btrfs: unexport btrfs_prev_leaf() btrfs_prev_leaf() is not used outside ctree.c, so there's no need to export it at ctree.h - just make it static at ctree.c and move its definition above btrfs_search_slot_for_read(), since that function calls it. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 161 ++++++++++++++++++++++++----------------------- fs/btrfs/ctree.h | 1 - 2 files changed, 81 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2ff2961b1183..28b0402a008f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2378,6 +2378,87 @@ done: return ret; } +/* + * Search the tree again to find a leaf with smaller keys. + * Returns 0 if it found something. + * Returns 1 if there are no smaller keys. + * Returns < 0 on error. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key orig_key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + orig_key = key; + + if (key.offset > 0) { + key.offset--; + } else if (key.type > 0) { + key.type--; + key.offset = (u64)-1; + } else if (key.objectid > 0) { + key.objectid--; + key.type = (u8)-1; + key.offset = (u64)-1; + } else { + return 1; + } + + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret <= 0) + return ret; + + /* + * Previous key not found. Even if we were at slot 0 of the leaf we had + * before releasing the path and calling btrfs_search_slot(), we now may + * be in a slot pointing to the same original key - this can happen if + * after we released the path, one of more items were moved from a + * sibling leaf into the front of the leaf we had due to an insertion + * (see push_leaf_right()). + * If we hit this case and our slot is > 0 and just decrement the slot + * so that the caller does not process the same key again, which may or + * may not break the caller, depending on its logic. + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); + ret = comp_keys(&found_key, &orig_key); + if (ret == 0) { + if (path->slots[0] > 0) { + path->slots[0]--; + return 0; + } + /* + * At slot 0, same key as before, it means orig_key is + * the lowest, leftmost, key in the tree. We're done. + */ + return 1; + } + } + + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + /* + * We might have had an item with the previous key in the tree right + * before we released our path. And after we released our path, that + * item might have been pushed to the first slot (0) of the leaf we + * were holding due to a tree balance. Alternatively, an item with the + * previous key can exist as the only element of a leaf (big fat item). + * Therefore account for these 2 cases, so that our callers (like + * btrfs_previous_item) don't miss an existing item with a key matching + * the previous key we computed above. + */ + if (ret <= 0) + return 0; + return 1; +} + /* * helper to use instead of search slot if no exact match is needed but * instead the next or previous item should be returned. 
@@ -4473,86 +4554,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -/* - * search the tree again to find a leaf with lesser keys - * returns 0 if it found something or 1 if there are no lesser leaves. - * returns < 0 on io errors. - * - * This may release the path, and so you may lose any locks held at the - * time you call it. - */ -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - struct btrfs_key key; - struct btrfs_key orig_key; - struct btrfs_disk_key found_key; - int ret; - - btrfs_item_key_to_cpu(path->nodes[0], &key, 0); - orig_key = key; - - if (key.offset > 0) { - key.offset--; - } else if (key.type > 0) { - key.type--; - key.offset = (u64)-1; - } else if (key.objectid > 0) { - key.objectid--; - key.type = (u8)-1; - key.offset = (u64)-1; - } else { - return 1; - } - - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret <= 0) - return ret; - - /* - * Previous key not found. Even if we were at slot 0 of the leaf we had - * before releasing the path and calling btrfs_search_slot(), we now may - * be in a slot pointing to the same original key - this can happen if - * after we released the path, one of more items were moved from a - * sibling leaf into the front of the leaf we had due to an insertion - * (see push_leaf_right()). - * If we hit this case and our slot is > 0 and just decrement the slot - * so that the caller does not process the same key again, which may or - * may not break the caller, depending on its logic. - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); - if (ret == 0) { - if (path->slots[0] > 0) { - path->slots[0]--; - return 0; - } - /* - * At slot 0, same key as before, it means orig_key is - * the lowest, leftmost, key in the tree. We're done. - */ - return 1; - } - } - - btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); - /* - * We might have had an item with the previous key in the tree right - * before we released our path. And after we released our path, that - * item might have been pushed to the first slot (0) of the leaf we - * were holding due to a tree balance. Alternatively, an item with the - * previous key can exist as the only element of a leaf (big fat item). - * Therefore account for these 2 cases, so that our callers (like - * btrfs_previous_item) don't miss an existing item with a key matching - * the previous key we computed above. - */ - if (ret <= 0) - return 0; - return 1; -} - /* * A helper function to walk down the tree starting at min_key, and looking * for nodes or leaves that are have a minimum transaction id. 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4c1986cd5bed..e81f10e12867 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -633,7 +633,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, return btrfs_insert_empty_items(trans, root, path, &batch); } -int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); From 1b53e51a4a8f870da194bcdaec9eb8865ee89386 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Tue, 11 Apr 2023 15:10:53 -0400 Subject: [PATCH 002/194] btrfs: don't commit transaction for every subvol create Recently a Meta-internal workload encountered subvolume creation taking up to 2s each, significantly slower than directory creation. As they were hoping to be able to use subvolumes instead of directories, and were looking to create hundreds, this was a significant issue. After Josef investigated, it turned out to be due to the transaction commit currently performed at the end of subvolume creation. This change improves the workload by not doing transaction commit for every subvolume creation, and merely requiring a transaction commit on fsync. In the worst case, of doing a subvolume create and fsync in a loop, this should require an equal amount of time to the current scheme; and in the best case, the internal workload creating hundreds of subvolumes before fsyncing is greatly improved. While it would be nice to be able to use the log tree and use the normal fsync path, log tree replay can't deal with new subvolume inodes presently. It's possible that there's some reason that the transaction commit is necessary for correctness during subvolume creation; however, git logs indicate that the commit dates back to the beginning of subvolume creation, and there are no notes on why it would be necessary. Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Reviewed-by: Neal Gompa Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2fa36f694daa..9522669000a7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -649,6 +649,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; + /* Tree log can't currently deal with an inode which is a new root. */ + btrfs_set_log_full_commit(trans); ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) @@ -757,10 +759,7 @@ out: trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); - if (ret) - btrfs_end_transaction(trans); - else - ret = btrfs_commit_transaction(trans); + btrfs_end_transaction(trans); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: From f2db4d5cb457b7da439d5509bdb6f7b01054c11b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Apr 2023 18:13:01 +0100 Subject: [PATCH 003/194] btrfs: make btrfs_free_device() static The function btrfs_free_device() is never used outside of volumes.c, so make it static and remove its prototype declaration at volumes.h. 
Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 841e799dece5..1a7620680f50 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -391,7 +391,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, return fs_devs; } -void btrfs_free_device(struct btrfs_device *device) +static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf47a1a70813..5cbbee32748c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -617,7 +617,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid, const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); -void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode); From 88ad95b055764e4c74fb6b8201f794f4e531753d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 26 Apr 2023 11:51:37 +0100 Subject: [PATCH 004/194] btrfs: tag as unlikely the key comparison when checking sibling keys When checking siblings keys, before moving keys from one node/leaf to a sibling node/leaf, it's very unexpected to have the last key of the left sibling greater than or equals to the first key of the right sibling, as that means we have a (serious) corruption that breaks the key ordering properties of a b+tree. Since this is unexpected, surround the comparison with the unlikely macro, which helps the compiler generate better code for the most expected case (no existing b+tree corruption). This is also what we do for other unexpected cases of invalid key ordering (like at btrfs_set_item_key_safe()). Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 28b0402a008f..0321f4286681 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2707,7 +2707,7 @@ static bool check_sibling_keys(struct extent_buffer *left, btrfs_item_key_to_cpu(right, &right_first, 0); } - if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) { btrfs_crit(left->fs_info, "left extent buffer:"); btrfs_print_tree(left, false); btrfs_crit(left->fs_info, "right extent buffer:"); From b5345d6ceeee3ef378e4800f538c8fc06bf9de48 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 26 Apr 2023 00:19:40 +0900 Subject: [PATCH 005/194] btrfs: export bitmap_test_range_all_{set,zero} bitmap_test_range_all_{set,zero} defined in subpage.c are useful for other components. Move them to misc.h and use them in zoned.c. Also, as find_next{,_zero}_bit take/return "unsigned long" instead of "unsigned int", convert the type to "unsigned long". While at it, also rewrite the "if (...) return true; else return false;" pattern and add const to the input bitmap. 
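As a usage illustration of the relocated helpers, here is a small sketch distilled from the btrfs_ensure_empty_zones() hunk further below; the wrapper function name is invented for the example, while the zinfo bitmaps and the range semantics are the ones the patch actually uses:

    /*
     * Illustrative wrapper (hypothetical name): returns true iff every zone
     * in [begin, begin + nbits) is sequential and still empty, i.e. the
     * corresponding bit is set in both per-device bitmaps.
     */
    static bool zones_all_sequential_and_empty(const struct btrfs_zoned_device_info *zinfo,
                                               unsigned long begin, unsigned long nbits)
    {
            return bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
                   bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits);
    }
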
Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/misc.h | 20 ++++++++++++++++++++ fs/btrfs/subpage.c | 22 ---------------------- fs/btrfs/zoned.c | 12 ++++++------ 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 768583a440e1..005751a12911 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -143,4 +143,24 @@ static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr, return NULL; } +static inline bool bitmap_test_range_all_set(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_zero; + + found_zero = find_next_zero_bit(addr, start + nbits, start); + return (found_zero == start + nbits); +} + +static inline bool bitmap_test_range_all_zero(const unsigned long *addr, + unsigned long start, + unsigned long nbits) +{ + unsigned long found_set; + + found_set = find_next_bit(addr, start + nbits, start); + return (found_set == start + nbits); +} + #endif diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index dd46b978ac2c..045117ca0ddc 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -367,28 +367,6 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_zero; - - found_zero = find_next_zero_bit(addr, start + nbits, start); - if (found_zero == start + nbits) - return true; - return false; -} - -static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, - unsigned int nbits) -{ - unsigned int found_set; - - found_set = find_next_bit(addr, start + nbits, start); - if (found_set == start + nbits) - return true; - return false; -} - #define subpage_calc_start_bit(fs_info, page, name, start, len) \ ({ \ unsigned int start_bit; \ diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 39828af4a4e8..dac879fe2871 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1057,7 +1057,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, /* Check if zones in the region are all empty */ if (btrfs_dev_is_sequential(device, pos) && - find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) { pos += zinfo->zone_size; continue; } @@ -1156,23 +1156,23 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) struct btrfs_zoned_device_info *zinfo = device->zone_info; const u8 shift = zinfo->zone_size_shift; unsigned long begin = start >> shift; - unsigned long end = (start + size) >> shift; + unsigned long nbits = size >> shift; u64 pos; int ret; ASSERT(IS_ALIGNED(start, zinfo->zone_size)); ASSERT(IS_ALIGNED(size, zinfo->zone_size)); - if (end > zinfo->nr_zones) + if (begin + nbits > zinfo->nr_zones) return -ERANGE; /* All the zones are conventional */ - if (find_next_bit(zinfo->seq_zones, end, begin) == end) + if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits)) return 0; /* All the zones are sequential and empty */ - if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end && - find_next_zero_bit(zinfo->empty_zones, end, begin) == end) + if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) && + bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits)) return 0; for (pos = start; pos < start + size; pos += zinfo->zone_size) { From 6c75a589cb35b8ea5cf9a22f389981acf687ab85 Mon Sep 17 00:00:00 2001 From: Qu 
Wenruo Date: Thu, 27 Apr 2023 20:16:27 +0800 Subject: [PATCH 006/194] btrfs: print-tree: pass const extent buffer pointer Since print-tree infrastructure only prints the content of a tree block, we can make them to accept const extent buffer pointer. This removes a forced type convert in extent-tree, where we convert a const extent buffer pointer to regular one, just to avoid compiler warning. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/print-tree.c | 16 ++++++++-------- fs/btrfs/print-tree.h | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0321f4286681..f76da470d60b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3077,7 +3077,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * and nr indicate which items in the leaf to check. This totals up the * space used both by the item structs and the item data */ -static int leaf_space_used(struct extent_buffer *l, int start, int nr) +static int leaf_space_used(const struct extent_buffer *l, int start, int nr) { int data_len; int nritems = btrfs_header_nritems(l); @@ -3097,7 +3097,7 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct extent_buffer *leaf) +int btrfs_leaf_free_space(const struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; int nritems = btrfs_header_nritems(leaf); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e81f10e12867..b7ab7fa2b73a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -685,7 +685,7 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct extent_buffer *leaf); +int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5cd289de4e92..e1d5198d1f5e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -402,7 +402,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } - btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 497b9dbd8a13..aa06d9ca911d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -49,7 +49,7 @@ const char *btrfs_root_name(const struct btrfs_key *key, char *buf) return buf; } -static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +static void print_chunk(const struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); int i; @@ -62,7 +62,7 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) btrfs_stripe_offset_nr(eb, chunk, i)); } } -static void print_dev_item(struct extent_buffer *eb, +static void print_dev_item(const struct extent_buffer *eb, struct btrfs_dev_item *dev_item) { pr_info("\t\tdev item devid %llu total_bytes %llu bytes used %llu\n", @@ -70,7 +70,7 @@ static void print_dev_item(struct extent_buffer *eb, btrfs_device_total_bytes(eb, dev_item), btrfs_device_bytes_used(eb, dev_item)); } -static void 
print_extent_data_ref(struct extent_buffer *eb, +static void print_extent_data_ref(const struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", @@ -80,7 +80,7 @@ static void print_extent_data_ref(struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } -static void print_extent_item(struct extent_buffer *eb, int slot, int type) +static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; @@ -169,7 +169,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) WARN_ON(ptr > end); } -static void print_uuid_item(struct extent_buffer *l, unsigned long offset, +static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, u32 item_size) { if (!IS_ALIGNED(item_size, sizeof(u64))) { @@ -191,7 +191,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. */ -static void print_eb_refs_lock(struct extent_buffer *eb) +static void print_eb_refs_lock(const struct extent_buffer *eb) { #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u", @@ -199,7 +199,7 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #endif } -void btrfs_print_leaf(struct extent_buffer *l) +void btrfs_print_leaf(const struct extent_buffer *l) { struct btrfs_fs_info *fs_info; int i; @@ -355,7 +355,7 @@ void btrfs_print_leaf(struct extent_buffer *l) } } -void btrfs_print_tree(struct extent_buffer *c, bool follow) +void btrfs_print_tree(const struct extent_buffer *c, bool follow) { struct btrfs_fs_info *fs_info; int i; u32 nr; diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 8c3e9319ec4e..c42bc666d5ee 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -9,8 +9,8 @@ /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 -void btrfs_print_leaf(struct extent_buffer *l); -void btrfs_print_tree(struct extent_buffer *c, bool follow); +void btrfs_print_leaf(const struct extent_buffer *l); +void btrfs_print_tree(const struct extent_buffer *c, bool follow); const char *btrfs_root_name(const struct btrfs_key *key, char *buf); #endif From eee3b811784e52f06ccb3e1c4518a9907627d4fe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 27 Apr 2023 20:16:28 +0800 Subject: [PATCH 007/194] btrfs: improve leaf dump and error handling Improve the leaf dump behavior by: - Always dump the leaf first, then the error message - Output the slot number if possible Especially in __btrfs_free_extent() the leaf dump of extent tree can be pretty large. With an extra slot number it's much easier to locate the problem. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 +- fs/btrfs/extent-tree.c | 123 +++++++++++++++++++---------------------- 2 files changed, 59 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f76da470d60b..4b33bc087fc7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2633,6 +2633,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2640,13 +2641,13 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", slot, btrfs_disk_key_objectid(&disk_key), @@ -2654,7 +2655,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_disk_key_offset(&disk_key), new_key->objectid, new_key->type, new_key->offset); - btrfs_print_leaf(eb); BUG(); } } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e1d5198d1f5e..1c326d530764 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1164,15 +1164,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, * should not happen at all. */ if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_print_leaf(path->nodes[0]); btrfs_crit(trans->fs_info, -"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", - bytenr, num_bytes, root_objectid); - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - WARN_ON(1); - btrfs_crit(trans->fs_info, - "path->slots[0]=%d path->nodes[0]:", path->slots[0]); - btrfs_print_leaf(path->nodes[0]); - } +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu slot %u", + bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } update_inline_extent_backref(path, iref, refs_to_add, extent_op); @@ -2838,6 +2833,13 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, return ret; } +#define abort_and_dump(trans, path, fmt, args...) \ +({ \ + btrfs_abort_transaction(trans, -EUCLEAN); \ + btrfs_print_leaf(path->nodes[0]); \ + btrfs_crit(trans->fs_info, fmt, ##args); \ +}) + /* * Drop one or more refs of @node. 
* @@ -2978,10 +2980,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!found_extent) { if (iref) { - btrfs_crit(info, -"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", + path->slots[0]); + ret = -EUCLEAN; + goto out; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, @@ -3029,11 +3032,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (ret) { - btrfs_err(info, - "umm, got %d back from search, was looking for %llu", - ret, bytenr); if (ret > 0) btrfs_print_leaf(path->nodes[0]); + btrfs_err(info, + "umm, got %d back from search, was looking for %llu, slot %d", + ret, bytenr, path->slots[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3042,12 +3045,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(path->nodes[0]); - btrfs_err(info, - "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", - bytenr, parent, root_objectid, owner_objectid, - owner_offset); - btrfs_abort_transaction(trans, ret); + abort_and_dump(trans, path, +"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", + bytenr, parent, root_objectid, owner_objectid, + owner_offset, path->slots[0]); goto out; } else { btrfs_abort_transaction(trans, ret); @@ -3067,14 +3068,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; + if (item_size < sizeof(*ei) + sizeof(*bi)) { - btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", - key.objectid, key.type, key.offset, - owner_objectid, item_size, - sizeof(*ei) + sizeof(*bi)); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid extent item size for key (%llu, %u, %llu) slot %u owner %llu, has %u expect >= %zu", + key.objectid, key.type, key.offset, + path->slots[0], owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + ret = -EUCLEAN; + goto out; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3082,11 +3084,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_crit(info, - "trying to drop %d refs but we only have %llu for bytenr %llu", - refs_to_drop, refs, bytenr); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", + refs_to_drop, refs, bytenr, path->slots[0]); + ret = -EUCLEAN; + goto out; } refs -= refs_to_drop; @@ -3099,10 +3101,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (iref) { if (!found_extent) { - btrfs_crit(info, -"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", + path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3121,21 
+3124,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (found_extent) { if (is_data && refs_to_drop != extent_data_ref_count(path, iref)) { - btrfs_crit(info, - "invalid refs_to_drop, current refs %u refs_to_drop %u", - extent_data_ref_count(path, iref), - refs_to_drop); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", + extent_data_ref_count(path, iref), + refs_to_drop, path->slots[0]); + ret = -EUCLEAN; + goto out; } if (iref) { if (path->slots[0] != extent_slot) { - btrfs_crit(info, -"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", - key.objectid, key.type, - key.offset); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, +"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", + key.objectid, key.type, + key.offset, path->slots[0]); + ret = -EUCLEAN; + goto out; } } else { /* @@ -3145,10 +3148,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] */ if (path->slots[0] != extent_slot + 1) { - btrfs_crit(info, - "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); - btrfs_abort_transaction(trans, -EUCLEAN); - goto err_dump; + abort_and_dump(trans, path, + "invalid SHARED_* item slot %u, previous item is not EXTENT/METADATA_ITEM", + path->slots[0]); + ret = -EUCLEAN; + goto out; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3170,19 +3174,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; -err_dump: - /* - * Leaf dump can take up a lot of log buffer, so we only do full leaf - * dump for debug build. - */ - if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { - btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", - path->slots[0], extent_slot); - btrfs_print_leaf(path->nodes[0]); - } - - btrfs_free_path(path); - return -EUCLEAN; } /* From 29e70be261d92829af26fda16769784710f45503 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 15 Apr 2023 17:51:23 +0800 Subject: [PATCH 008/194] btrfs: use SECTOR_SHIFT to convert physical offset to LBA Use SECTOR_SHIFT while converting a physical address to an LBA, makes it more readable. 
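SECTOR_SHIFT is the block layer's name for log2 of the 512-byte sector size, i.e. 9, so the conversions in the hunks below are bit-for-bit identical to the old ">> 9"; only the intent becomes explicit. A minimal sketch (the helper name and the example value are made up for illustration):

    /* Sketch: byte offset on the device -> 512-byte LBA, as done in this patch. */
    static inline sector_t bytenr_to_lba(u64 dev_bytenr)
    {
            return dev_bytenr >> SECTOR_SHIFT;      /* identical to ">> 9" */
    }

    /* e.g. a 1 MiB byte offset: bytenr_to_lba(1048576) == 2048 */
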
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/compression.c | 2 +- fs/btrfs/extent-tree.c | 6 ++++-- fs/btrfs/inode.c | 2 +- fs/btrfs/raid56.c | 2 +- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 82e49d985019..b4408037b823 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1565,7 +1565,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2d0493f0a184..14f5f25049a0 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -421,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ if (!em || cur < em->start || (cur + fs_info->sectorsize > extent_map_end(em)) || - (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { + (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { free_extent_map(em); unlock_extent(tree, cur, page_end, NULL); unlock_page(page); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c326d530764..3a4c6ab80403 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1245,7 +1245,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (size) { - ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + size >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += size; @@ -1262,7 +1263,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, } if (bytes_left) { - ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, + ret = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + bytes_left >> SECTOR_SHIFT, GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7fcafcc5292c..e6a21bbf40f2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8639,7 +8639,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + - ALIGN(delalloc_bytes, blocksize)) >> 9; + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; return 0; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2fab37f062de..d271c6e04dcc 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1099,7 +1099,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); - bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); From adbe7e388e4239d9c1754d475aea791136927137 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 15 Apr 2023 19:32:38 +0800 Subject: [PATCH 009/194] btrfs: use SECTOR_SHIFT to convert LBA to physical offset Using SECTOR_SHIFT to convert LBA to physical address makes it more readable. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/raid56.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b3ad0f51e616..2a182edcfb61 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -635,7 +635,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) struct btrfs_fs_info *fs_info = bbio->fs_info; struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; - u64 logical = bio->bi_iter.bi_sector << 9; + u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; u64 map_length = length; bool use_append = btrfs_use_zone_append(bbio); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3a4c6ab80403..581ddb24e113 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1203,11 +1203,11 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, { int j, ret = 0; u64 bytes_left, end; - u64 aligned_start = ALIGN(start, 1 << 9); + u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT); if (WARN_ON(start != aligned_start)) { len -= aligned_start - start; - len = round_down(len, 1 << 9); + len = round_down(len, 1 << SECTOR_SHIFT); start = aligned_start; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d1cd0a692f8e..e74b9804bcde 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -749,7 +749,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - sums->bytenr = bio->bi_iter.bi_sector << 9; + sums->bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; index = 0; shash->tfm = fs_info->csum_shash; @@ -799,7 +799,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) ordered = btrfs_lookup_ordered_extent(inode, offset); ASSERT(ordered); /* Logic error */ - sums->bytenr = (bio->bi_iter.bi_sector << 9) + sums->bytenr = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + total_bytes; index = 0; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d271c6e04dcc..3c08e132d83d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1079,7 +1079,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* see if we can add this page onto our existing bio */ if (last) { - u64 last_end = last->bi_iter.bi_sector << 9; + u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* From da023618076a13c35bcde1a49a87b7da64761f1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 09:06:13 +0200 Subject: [PATCH 010/194] btrfs: submit IO synchronously for fast checksum implementations Most modern hardware supports very fast accelerated crc32c calculation. If that is supported the CPU overhead of the checksum calculation is very limited, and offloading the calculation to special worker threads has a lot of overhead for no gain. E.g. on an Intel Optane device is actually very much slows down even 1M buffered writes with fio: Unpatched: write: IOPS=3316, BW=3316MiB/s (3477MB/s)(200GiB/61757msec); 0 zone resets With synchronous CRCs: write: IOPS=4882, BW=4882MiB/s (5119MB/s)(200GiB/41948msec); 0 zone resets With a lot of variation during the unpatched run going down as low as 1100MB/s, while the synchronous CRC version has about the same peak write speed but much lower dips, and fewer kworkers churning around. Both tests had fio saturated at 100% CPU. 
(thanks to Jens Axboe via Chris Mason for the benchmarking) Reviewed-by: Chris Mason Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 2a182edcfb61..67fb8f6a0eb9 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -574,6 +574,10 @@ static void run_one_async_free(struct btrfs_work *work) static bool should_async_write(struct btrfs_bio *bbio) { + /* Submit synchronously if the checksum implementation is fast. */ + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + return false; + /* * If the I/O is not issued by fsync and friends, (->sync_writers != 0), * then try to defer the submission to a workqueue to parallelize the @@ -582,19 +586,9 @@ static bool should_async_write(struct btrfs_bio *bbio) if (atomic_read(&bbio->inode->sync_writers)) return false; - /* - * Submit metadata writes synchronously if the checksum implementation - * is fast, or we are on a zoned device that wants I/O to be submitted - * in order. - */ - if (bbio->bio.bi_opf & REQ_META) { - struct btrfs_fs_info *fs_info = bbio->fs_info; - - if (btrfs_is_zoned(fs_info)) - return false; - if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return false; - } + /* Zoned devices require I/O to be submitted in order. */ + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + return false; return true; } From e917ff56c8e7b117b590632fa40a08e36577d31f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 09:06:14 +0200 Subject: [PATCH 011/194] btrfs: determine synchronous writers from bio or writeback control The writeback_control structure already passes down the information about a writeback being synchronous from the core VM code, and thus information is propagated into the bio REQ_SYNC flag through the wbc_to_write_flags helper. Use that information to decide if checksums calculation is offloaded to a workqueue instead of btrfs_inode::sync_writers field that not only bloats the inode but also has too wide scope, being inode wide instead of limited to the actual writeback request. The sync writes were set in: - btrfs_do_write_iter - regular IO, sync status is set - start_ordered_ops - ordered write start, writeback with WB_SYNC_ALL mode - btrfs_write_marked_extents - write marked extents, writeback with WB_SYNC_ALL mode Reviewed-by: Chris Mason Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/bio.c | 7 +++---- fs/btrfs/btrfs_inode.h | 3 --- fs/btrfs/file.c | 9 --------- fs/btrfs/inode.c | 1 - fs/btrfs/transaction.c | 2 -- 5 files changed, 3 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 67fb8f6a0eb9..990a517c069b 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -579,11 +579,10 @@ static bool should_async_write(struct btrfs_bio *bbio) return false; /* - * If the I/O is not issued by fsync and friends, (->sync_writers != 0), - * then try to defer the submission to a workqueue to parallelize the - * checksum calculation. + * Try to defer the submission to a workqueue to parallelize the + * checksum calculation unless the I/O is issued synchronously. */ - if (atomic_read(&bbio->inode->sync_writers)) + if (op_is_sync(bbio->bio.bi_opf)) return false; /* Zoned devices require I/O to be submitted in order. 
*/ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ec2ae4406c16..0849b85b94fe 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -116,9 +116,6 @@ struct btrfs_inode { unsigned long runtime_flags; - /* Keep track of who's O_SYNC/fsyncing currently */ - atomic_t sync_writers; - /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f649647392e0..f53b7b75092d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1651,7 +1651,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -1664,9 +1663,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; - if (sync) - atomic_inc(&inode->sync_writers); - if (encoded) { num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; @@ -1686,9 +1682,6 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = num_sync; } - if (sync) - atomic_dec(&inode->sync_writers); - current->backing_dev_info = NULL; return num_written; } @@ -1733,9 +1726,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) * several segments of stripe length (currently 64K). */ blk_start_plug(&plug); - atomic_inc(&BTRFS_I(inode)->sync_writers); ret = btrfs_fdatawrite_range(inode, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); blk_finish_plug(&plug); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6a21bbf40f2..675b20c4d3c9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8468,7 +8468,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); - atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8b6a99b8d7f6..27c616fdfae2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1056,7 +1056,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -1092,7 +1091,6 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } - atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } From 8bfec2e426e40597d594db07de4e53def38c8879 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 09:06:15 +0200 Subject: [PATCH 012/194] btrfs: remove hipri_workers workqueue Now that btrfs_wq_submit_bio is never called for synchronous I/O, the hipri_workers workqueue is not used anymore and can be removed. 
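Taken together with the previous two patches, the async decision is now just three checks. A condensed sketch of should_async_write() as it reads after this series (condensed from the hunks above, not a verbatim copy of the file):

    /* Offload checksum calculation to a workqueue only when it is likely to pay off. */
    static bool should_async_write(struct btrfs_bio *bbio)
    {
            /* Fast (accelerated) checksum implementation: compute inline. */
            if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
                    return false;

            /* Synchronous I/O (fsync and friends) should not take the detour. */
            if (op_is_sync(bbio->bio.bi_opf))
                    return false;

            /* Zoned metadata writes must be submitted in order. */
            if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
                    return false;

            return true;
    }
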
Reviewed-by: Chris Mason Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 5 +---- fs/btrfs/disk-io.c | 6 +----- fs/btrfs/fs.h | 1 - fs/btrfs/super.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 990a517c069b..2e2e1abd5838 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -615,10 +615,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); - if (op_is_sync(bbio->bio.bi_opf)) - btrfs_queue_work(fs_info->hipri_workers, &async->work); - else - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); return true; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dabc79c1af1b..f4adda2b4317 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1991,7 +1991,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); - btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); @@ -2186,9 +2185,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); - fs_info->hipri_workers = - btrfs_alloc_workqueue(fs_info, "worker-high", - flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = btrfs_alloc_workqueue(fs_info, "delalloc", @@ -2225,7 +2221,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->hipri_workers && + if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 0d98fc5f6f44..840e4def18b5 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -543,7 +543,6 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other two. 
*/ struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index efeb1a9d040a..8b1c1225245e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1631,7 +1631,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); workqueue_set_max_active(fs_info->endio_workers, new_pool_size); From b9a9a85059cde9bcc66effe0da9f1b6fb342a700 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 3 May 2023 12:40:01 +0800 Subject: [PATCH 013/194] btrfs: output affected files when relocation fails [PROBLEM] When relocation fails (mostly due to checksum mismatch), we only got very cryptic error messages like: BTRFS info (device dm-4): relocating block group 13631488 flags data BTRFS warning (device dm-4): csum failed root -9 ino 257 off 0 csum 0x373e1ae3 expected csum 0x98757625 mirror 1 BTRFS error (device dm-4): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 1, gen 0 BTRFS info (device dm-4): balance: ended with status: -5 The end user has to decipher the above messages and use various tools to locate the affected files and find a way to fix the problem (mostly deleting the file). This is not an easy work even for experienced developer, not to mention the end users. [SCRUB IS DOING BETTER] By contrast, scrub is providing much better error messages: BTRFS error (device dm-4): unable to fixup (regular) error at logical 13631488 on dev /dev/mapper/test-scratch1 physical 13631488 BTRFS warning (device dm-4): checksum error at logical 13631488 on dev /dev/mapper/test-scratch1, physical 13631488, root 5, inode 257, offset 0, length 4096, links 1 (path: file) BTRFS info (device dm-4): scrub: finished on devid 1 with status: 0 Which provides the affected files directly to the end user. [IMPROVEMENT] Instead of the generic data checksum error messages, which is not doing a good job for data reloc inodes, this patch introduce a scrub like backref walking based solution. When a sector fails its checksum for data reloc inode, we go the following workflow: - Get the real logical bytenr For data reloc inode, the file offset is the offset inside the block group. Thus the real logical bytenr is @file_off + @block_group->start. - Do an extent type check If it's tree blocks it's much easier to handle, just go through all the tree block backref. - Do a backref walk and inode path resolution for data extents This is mostly the same as scrub. But unfortunately we can not reuse the same function as the output format is different. 
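The first step of that workflow boils down to a few lines; a condensed sketch of what print_data_reloc_error() does in the diff below, with the error reporting trimmed:

    /*
     * For a data reloc inode the file offset is relative to the block group
     * being relocated, so the real logical bytenr is the block group start
     * plus the file offset.  U64_MAX means no relocation is running.
     */
    mutex_lock(&fs_info->reloc_mutex);
    logical = btrfs_get_reloc_bg_bytenr(fs_info);
    mutex_unlock(&fs_info->reloc_mutex);
    if (logical != U64_MAX)
            logical += file_off;
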
Now the new output would be more user friendly: BTRFS info (device dm-4): relocating block group 13631488 flags data BTRFS warning (device dm-4): csum failed root -9 ino 257 off 0 logical 13631488 csum 0x373e1ae3 expected csum 0x98757625 mirror 1 BTRFS warning (device dm-4): checksum error at logical 13631488 mirror 1 root 5 inode 257 offset 0 length 4096 links 1 (path: file) BTRFS error (device dm-4): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 2, gen 0 BTRFS info (device dm-4): balance: ended with status: -5 Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 191 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/relocation.c | 16 ++++ fs/btrfs/relocation.h | 1 + 3 files changed, 208 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 675b20c4d3c9..da7e5bf2f6db 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,6 +70,7 @@ #include "verity.h" #include "super.h" #include "orphan.h" +#include "backref.h" struct btrfs_iget_args { u64 ino; @@ -100,6 +101,18 @@ struct btrfs_rename_ctx { u64 index; }; +/* + * Used by data_reloc_print_warning_inode() to pass needed info for filename + * resolution and output of error message. + */ +struct data_reloc_warn { + struct btrfs_path path; + struct btrfs_fs_info *fs_info; + u64 extent_item_size; + u64 logical; + int mirror_num; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -122,12 +135,190 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) +{ + struct data_reloc_warn *warn = warn_ctx; + struct btrfs_fs_info *fs_info = warn->fs_info; + struct extent_buffer *eb; + struct btrfs_inode_item *inode_item; + struct inode_fs_paths *ipath = NULL; + struct btrfs_root *local_root; + struct btrfs_key key; + unsigned int nofs_flag; + u32 nlink; + int ret; + + local_root = btrfs_get_fs_root(fs_info, root, true); + if (IS_ERR(local_root)) { + ret = PTR_ERR(local_root); + goto err; + } + + /* This makes the path point to (inum INODE_ITEM ioff). */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); + if (ret) { + btrfs_put_root(local_root); + btrfs_release_path(&warn->path); + goto err; + } + + eb = warn->path.nodes[0]; + inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); + nlink = btrfs_inode_nlink(eb, inode_item); + btrfs_release_path(&warn->path); + + nofs_flag = memalloc_nofs_save(); + ipath = init_ipath(4096, local_root, &warn->path); + memalloc_nofs_restore(nofs_flag); + if (IS_ERR(ipath)) { + btrfs_put_root(local_root); + ret = PTR_ERR(ipath); + ipath = NULL; + /* + * -ENOMEM, not a critical error, just output an generic error + * without filename. 
+ */ + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", + warn->logical, warn->mirror_num, root, inum, offset); + return ret; + } + ret = paths_from_inode(inum, ipath); + if (ret < 0) + goto err; + + /* + * We deliberately ignore the bit ipath might have been too small to + * hold all of the paths here + */ + for (int i = 0; i < ipath->fspath->elem_cnt; i++) { + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", + warn->logical, warn->mirror_num, root, inum, offset, + fs_info->sectorsize, nlink, + (char *)(unsigned long)ipath->fspath->val[i]); + } + + btrfs_put_root(local_root); + free_ipath(ipath); + return 0; + +err: + btrfs_warn(fs_info, +"checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", + warn->logical, warn->mirror_num, root, inum, offset, ret); + + free_ipath(ipath); + return ret; +} + +/* + * Do extra user-friendly error output (e.g. lookup all the affected files). + * + * Return true if we succeeded doing the backref lookup. + * Return false if such lookup failed, and has to fallback to the old error message. + */ +static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, + const u8 *csum, const u8 *csum_expected, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_path path = { 0 }; + struct btrfs_key found_key = { 0 }; + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + const u32 csum_size = fs_info->csum_size; + u64 logical; + u64 flags; + u32 item_size; + int ret; + + mutex_lock(&fs_info->reloc_mutex); + logical = btrfs_get_reloc_bg_bytenr(fs_info); + mutex_unlock(&fs_info->reloc_mutex); + + if (logical == U64_MAX) { + btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, btrfs_ino(inode), file_off, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + return; + } + + logical += file_off; + btrfs_warn_rl(fs_info, +"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + inode->root->root_key.objectid, + btrfs_ino(inode), file_off, logical, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + + ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); + if (ret < 0) { + btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", + logical, ret); + return; + } + eb = path.nodes[0]; + ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); + item_size = btrfs_item_size(eb, path.slots[0]); + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + unsigned long ptr = 0; + u64 ref_root; + u8 ref_level; + + do { + ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, + item_size, &ref_root, + &ref_level); + btrfs_warn_rl(fs_info, +"csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", + logical, mirror_num, + (ref_level ? "node" : "leaf"), + (ret < 0 ? -1 : ref_level), + (ret < 0 ? 
-1 : ref_root)); + } while (ret != 1); + btrfs_release_path(&path); + } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + struct data_reloc_warn reloc_warn = { 0 }; + + btrfs_release_path(&path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = logical - found_key.objectid; + ctx.fs_info = fs_info; + + reloc_warn.logical = logical; + reloc_warn.extent_item_size = found_key.offset; + reloc_warn.mirror_num = mirror_num; + reloc_warn.fs_info = fs_info; + + iterate_extent_inodes(&ctx, true, + data_reloc_print_warning_inode, &reloc_warn); + } +} + static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; + /* For data reloc tree, it's better to do a backref lookup instead. */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + return print_data_reloc_error(inode, logical_start, csum, + csum_expected, mirror_num); + /* Output without objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59a06499c647..38cfbd38a819 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4523,3 +4523,19 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, ret = clone_backref_node(trans, rc, root, reloc_root); return ret; } + +/* + * Get the current bytenr for the block group which is being relocated. + * + * Return U64_MAX if no running relocation. + */ +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +{ + u64 logical = U64_MAX; + + lockdep_assert_held(&fs_info->reloc_mutex); + + if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) + logical = fs_info->reloc_ctl->block_group->start; + return logical; +} diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 2041a86186de..57cbac5c8ddd 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -19,5 +19,6 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_should_ignore_reloc_root(struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); #endif From 12df6a622ed8f0cd14470c6d76fe1b85f621f230 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 2 May 2023 10:51:29 -0400 Subject: [PATCH 014/194] btrfs: simplify transid initialization in btrfs_ioctl_wait_sync A small code simplification, move the default value of transid to its initialization and remove the else-statement. Signed-off-by: Tom Rix Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9522669000a7..df7603c30686 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3133,14 +3133,13 @@ out: static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { - u64 transid; + /* By default wait for the current transaction. 
*/ + u64 transid = 0; - if (argp) { + if (argp) if (copy_from_user(&transid, argp, sizeof(transid))) return -EFAULT; - } else { - transid = 0; /* current trans */ - } + return btrfs_wait_for_commit(fs_info, transid); } From fbb2e654d898f141a2cde37074535d9cec10b03a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:19 +0100 Subject: [PATCH 015/194] btrfs: avoid extra memory allocation when copying free space cache At copy_free_space_cache(), we add a new entry to the block group's ctl before we free the entry from the temporary ctl. Adding a new entry requires the allocation of a new struct btrfs_free_space, so we can avoid a temporary extra allocation by freeing the entry from the temporary ctl before we add a new entry to the main ctl, which possibly also reduces the chances for a memory allocation failure in case of very high memory pressure. So just do that. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cf98a3c05480..ec53119d4cfb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -923,10 +923,12 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, while (!ret && (n = rb_first(&ctl->free_space_offset)) != NULL) { info = rb_entry(n, struct btrfs_free_space, offset_index); if (!info->bitmap) { + const u64 offset = info->offset; + const u64 bytes = info->bytes; + unlink_free_space(ctl, info, true); - ret = btrfs_add_free_space(block_group, info->offset, - info->bytes); kmem_cache_free(btrfs_free_space_cachep, info); + ret = btrfs_add_free_space(block_group, offset, bytes); } else { u64 offset = info->offset; u64 bytes = ctl->unit; From 9085f42571e53b56b0087237d2e6258f47424938 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:20 +0100 Subject: [PATCH 016/194] btrfs: avoid searching twice for previous node when merging free space entries At try_merge_free_space(), avoid calling twice rb_prev() to find the previous node, as that requires looping through the red black tree, so store the result of the rb_prev() call and then use it. 
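As a generic illustration of the caching pattern (this is a sketch, not the
patch code; "struct my_entry" and prev_entry() are made up for the example,
while rb_prev(), rb_entry() and struct rb_node are the real kernel rbtree API):

  #include <linux/rbtree.h>

  struct my_entry {                       /* hypothetical entry type */
          struct rb_node node;
          u64 offset;
  };

  /* Walk to the previous node once and reuse the result for the lookup. */
  static struct my_entry *prev_entry(struct rb_node *node)
  {
          struct rb_node *prev = node ? rb_prev(node) : NULL;

          return prev ? rb_entry(prev, struct my_entry, node) : NULL;
  }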
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ec53119d4cfb..7f69fcc51550 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2449,6 +2449,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, u64 offset = info->offset; u64 bytes = info->bytes; const bool is_trimmed = btrfs_free_space_trimmed(info); + struct rb_node *right_prev = NULL; /* * first we want to see if there is free space adjacent to the range we @@ -2456,9 +2457,11 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, * cover the entire range */ right_info = tree_search_offset(ctl, offset + bytes, 0, 0); - if (right_info && rb_prev(&right_info->offset_index)) - left_info = rb_entry(rb_prev(&right_info->offset_index), - struct btrfs_free_space, offset_index); + if (right_info) + right_prev = rb_prev(&right_info->offset_index); + + if (right_prev) + left_info = rb_entry(right_prev, struct btrfs_free_space, offset_index); else if (!right_info) left_info = tree_search_offset(ctl, offset - 1, 0, 0); From b77433b144621ebd5d0a6a3b6a3d454fa45b9309 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:21 +0100 Subject: [PATCH 017/194] btrfs: use precomputed end offsets at do_trimming() The are two computations of end offsets at do_trimming() that are not necessary, as they were previously computed and stored in local const variables. So just use the variables instead, to make the source code shorter and easier to read. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7f69fcc51550..985bc94b3763 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3676,7 +3676,7 @@ static int do_trimming(struct btrfs_block_group *block_group, __btrfs_add_free_space(block_group, reserved_start, start - reserved_start, reserved_trim_state); - if (start + bytes < reserved_start + reserved_bytes) + if (end < reserved_end) __btrfs_add_free_space(block_group, end, reserved_end - end, reserved_trim_state); __btrfs_add_free_space(block_group, start, bytes, trim_state); From 0d6bac4d30b8bdefd1cb97296620141953f409d6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:22 +0100 Subject: [PATCH 018/194] btrfs: simplify arguments to tree_insert_offset() For the in-memory component of space caching (free space cache and free space tree), three of the arguments passed to tree_insert_offset() can always be taken from the new free space entry that we are about to add. So simplify tree_insert_offset() to take the new entry instead of the 'offset', 'node' and 'bitmap' arguments. This will also allow to make further changes simpler. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 985bc94b3763..ced874a4ae3f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1598,20 +1598,21 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, return bitmap_start; } -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node, int bitmap) +static int tree_insert_offset(struct rb_root *root, + struct btrfs_free_space *new_entry) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - struct btrfs_free_space *info; while (*p) { + struct btrfs_free_space *info; + parent = *p; info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (offset < info->offset) { + if (new_entry->offset < info->offset) { p = &(*p)->rb_left; - } else if (offset > info->offset) { + } else if (new_entry->offset > info->offset) { p = &(*p)->rb_right; } else { /* @@ -1627,7 +1628,7 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * found a bitmap, we want to go left, or before * logically. */ - if (bitmap) { + if (new_entry->bitmap) { if (info->bitmap) { WARN_ON_ONCE(1); return -EEXIST; @@ -1643,8 +1644,8 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, } } - rb_link_node(node, parent, p); - rb_insert_color(node, root); + rb_link_node(&new_entry->offset_index, parent, p); + rb_insert_color(&new_entry->offset_index, root); return 0; } @@ -1835,8 +1836,7 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, int ret = 0; ASSERT(info->bytes || info->bitmap); - ret = tree_insert_offset(&ctl->free_space_offset, info->offset, - &info->offset_index, (info->bitmap != NULL)); + ret = tree_insert_offset(&ctl->free_space_offset, info); if (ret) return ret; @@ -2973,8 +2973,6 @@ static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct btrfs_free_space *entry; struct rb_node *node; spin_lock(&cluster->lock); @@ -2989,15 +2987,15 @@ static void __btrfs_return_cluster_to_free_space( node = rb_first(&cluster->root); while (node) { - bool bitmap; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); RB_CLEAR_NODE(&entry->offset_index); - bitmap = (entry->bitmap != NULL); - if (!bitmap) { + if (!entry->bitmap) { /* Merging treats extents as if they were new */ if (!btrfs_free_space_trimmed(entry)) { ctl->discardable_extents[BTRFS_STAT_CURR]--; @@ -3015,8 +3013,7 @@ static void __btrfs_return_cluster_to_free_space( entry->bytes; } } - tree_insert_offset(&ctl->free_space_offset, - entry->offset, &entry->offset_index, bitmap); + tree_insert_offset(&ctl->free_space_offset, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -3390,8 +3387,7 @@ again: */ RB_CLEAR_NODE(&entry->bytes_index); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 1); + ret = tree_insert_offset(&cluster->root, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, @@ -3481,8 +3477,7 @@ 
setup_cluster_no_bitmap(struct btrfs_block_group *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); - ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index, 0); + ret = tree_insert_offset(&cluster->root, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); From 13c2018fcc27b0e2cebf0d3732c36b3ecfddc34c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:23 +0100 Subject: [PATCH 019/194] btrfs: assert proper locks are held at tree_insert_offset() There are multiple code paths leading to tree_insert_offset(), and each path takes the necessary locks before tree_insert_offset() is called, since they do other things that require those locks to be held. This makes it easy to miss the locking somewhere, so make tree_insert_offset() assert that the required locks are being held by the calling task. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ced874a4ae3f..be45be6ec888 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1598,12 +1598,25 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, return bitmap_start; } -static int tree_insert_offset(struct rb_root *root, +static int tree_insert_offset(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_cluster *cluster, struct btrfs_free_space *new_entry) { - struct rb_node **p = &root->rb_node; + struct rb_root *root; + struct rb_node **p; struct rb_node *parent = NULL; + lockdep_assert_held(&ctl->tree_lock); + + if (cluster) { + lockdep_assert_held(&cluster->lock); + root = &cluster->root; + } else { + root = &ctl->free_space_offset; + } + + p = &root->rb_node; + while (*p) { struct btrfs_free_space *info; @@ -1836,7 +1849,7 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, int ret = 0; ASSERT(info->bytes || info->bitmap); - ret = tree_insert_offset(&ctl->free_space_offset, info); + ret = tree_insert_offset(ctl, NULL, info); if (ret) return ret; @@ -3013,7 +3026,7 @@ static void __btrfs_return_cluster_to_free_space( entry->bytes; } } - tree_insert_offset(&ctl->free_space_offset, entry); + tree_insert_offset(ctl, NULL, entry); rb_add_cached(&entry->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -3387,7 +3400,7 @@ again: */ RB_CLEAR_NODE(&entry->bytes_index); - ret = tree_insert_offset(&cluster->root, entry); + ret = tree_insert_offset(ctl, cluster, entry); ASSERT(!ret); /* -EEXIST; Logic error */ trace_btrfs_setup_cluster(block_group, cluster, @@ -3477,7 +3490,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, rb_erase(&entry->offset_index, &ctl->free_space_offset); rb_erase_cached(&entry->bytes_index, &ctl->free_space_bytes); - ret = tree_insert_offset(&cluster->root, entry); + ret = tree_insert_offset(ctl, cluster, entry); total_size += entry->bytes; ASSERT(!ret); /* -EEXIST; Logic error */ } while (node && entry != last); From 91de9e978d1c32c477e14f44a7e945f17863c66e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:24 +0100 Subject: [PATCH 020/194] btrfs: assert tree lock is held when searching for free space entries When searching for a free space entry by offset, at tree_search_offset(), we are supposed to have the btrfs_free_space_ctl's 
'tree_lock' held, so assert that. We have multiple callers of tree_search_offset(), and all currently hold the necessary lock before calling it. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index be45be6ec888..f3361ebfef4a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1721,6 +1721,8 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl, struct rb_node *n = ctl->free_space_offset.rb_node; struct btrfs_free_space *entry = NULL, *prev = NULL; + lockdep_assert_held(&ctl->tree_lock); + /* find entry that is closest to the 'offset' */ while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); From 9649bd9a29a7da97eca456befd275ee16e3a2291 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:25 +0100 Subject: [PATCH 021/194] btrfs: assert tree lock is held when linking free space When linking a free space entry, at link_free_space(), the caller should be holding the spinlock 'tree_lock' of the given btrfs_free_space_ctl argument, which is necessary for manipulating the red black tree of free space entries (done by tree_insert_offset(), which already asserts the lock is held) and for manipulating the 'free_space', 'free_extents', 'discardable_extents' and 'discardable_bytes' counters of the given struct btrfs_free_space_ctl. So assert that the spinlock 'tree_lock' of the given btrfs_free_space_ctl is held by the current task. We have multiple code paths that end up calling link_free_space(), and all currently take the lock before calling it. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f3361ebfef4a..d349ba39820a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1850,6 +1850,8 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, { int ret = 0; + lockdep_assert_held(&ctl->tree_lock); + ASSERT(info->bytes || info->bitmap); ret = tree_insert_offset(ctl, NULL, info); if (ret) From 7e5ba559941f011389936d49641304ed45e8b6a7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 May 2023 12:04:26 +0100 Subject: [PATCH 022/194] btrfs: assert tree lock is held when removing free space entries Removing a free space entry from an in memory space cache requires having the corresponding btrfs_free_space_ctl's 'tree_lock' held. We have several code paths that remove an entry, so add assertions where appropriate to verify we are holding the lock, as the lock is acquired by some other function up in the call chain, which makes it easy to miss in the future. Note: for this to work we need to lock the local btrfs_free_space_ctl at load_free_space_cache(), which was not being done because it's local, declared on the stack, so no other task has access to it. 
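As a minimal sketch of the assertion pattern being added (the helper name is
hypothetical; lockdep_assert_held() and the btrfs_free_space_ctl fields are
the real ones used in the hunks below):

  /*
   * Hypothetical helper: any path that erases an entry documents and
   * enforces its locking contract with a lockdep assertion.
   */
  static void remove_free_space_entry(struct btrfs_free_space_ctl *ctl,
                                      struct btrfs_free_space *info)
  {
          lockdep_assert_held(&ctl->tree_lock);  /* caller must hold tree_lock */
          rb_erase(&info->offset_index, &ctl->free_space_offset);
  }

This is also why load_free_space_cache() now takes the lock on its stack-local
ctl even though no other task can see it: the assertions fire regardless of who
owns the structure.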
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d349ba39820a..fb606698bf3c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -927,25 +927,27 @@ static int copy_free_space_cache(struct btrfs_block_group *block_group, const u64 bytes = info->bytes; unlink_free_space(ctl, info, true); + spin_unlock(&ctl->tree_lock); kmem_cache_free(btrfs_free_space_cachep, info); ret = btrfs_add_free_space(block_group, offset, bytes); + spin_lock(&ctl->tree_lock); } else { u64 offset = info->offset; u64 bytes = ctl->unit; - while (search_bitmap(ctl, info, &offset, &bytes, - false) == 0) { + ret = search_bitmap(ctl, info, &offset, &bytes, false); + if (ret == 0) { + bitmap_clear_bits(ctl, info, offset, bytes, true); + spin_unlock(&ctl->tree_lock); ret = btrfs_add_free_space(block_group, offset, bytes); - if (ret) - break; - bitmap_clear_bits(ctl, info, offset, bytes, true); - offset = info->offset; - bytes = ctl->unit; + spin_lock(&ctl->tree_lock); + } else { + free_bitmap(ctl, info); + ret = 0; } - free_bitmap(ctl, info); } - cond_resched(); + cond_resched_lock(&ctl->tree_lock); } return ret; } @@ -1039,7 +1041,9 @@ int load_free_space_cache(struct btrfs_block_group *block_group) block_group->bytes_super)); if (matched) { + spin_lock(&tmp_ctl.tree_lock); ret = copy_free_space_cache(block_group, &tmp_ctl); + spin_unlock(&tmp_ctl.tree_lock); /* * ret == 1 means we successfully loaded the free space cache, * so we need to re-set it here. @@ -1832,6 +1836,8 @@ static inline void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { + lockdep_assert_held(&ctl->tree_lock); + rb_erase(&info->offset_index, &ctl->free_space_offset); rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); ctl->free_extents--; @@ -1881,6 +1887,8 @@ static void relink_bitmap_entry(struct btrfs_free_space_ctl *ctl, if (RB_EMPTY_NODE(&info->bytes_index)) return; + lockdep_assert_held(&ctl->tree_lock); + rb_erase_cached(&info->bytes_index, &ctl->free_space_bytes); rb_add_cached(&info->bytes_index, &ctl->free_space_bytes, entry_less); } @@ -2990,8 +2998,11 @@ static void __btrfs_return_cluster_to_free_space( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct rb_node *node; + lockdep_assert_held(&ctl->tree_lock); + spin_lock(&cluster->lock); if (cluster->block_group != block_group) { spin_unlock(&cluster->lock); @@ -3004,7 +3015,6 @@ static void __btrfs_return_cluster_to_free_space( node = rb_first(&cluster->root); while (node) { - struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; entry = rb_entry(node, struct btrfs_free_space, offset_index); @@ -3343,6 +3353,8 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group *block_group, unsigned long total_found = 0; int ret; + lockdep_assert_held(&ctl->tree_lock); + i = offset_to_bit(entry->offset, ctl->unit, max_t(u64, offset, entry->offset)); want_bits = bytes_to_bits(bytes, ctl->unit); @@ -3432,6 +3444,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group *block_group, u64 max_extent; u64 total_size = 0; + lockdep_assert_held(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 0, 1); if (!entry) return -ENOSPC; From 94ead93e63758f2c7cbe0c68ca232fff812ca33e 
Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Apr 2023 13:57:18 +0800 Subject: [PATCH 023/194] btrfs: scrub: use recovered data stripes as cache to avoid unnecessary read For P/Q stripe scrub, we have quite some duplicated read IO: - Data stripes read for verification This is triggered by the scrub_submit_initial_read() inside scrub_raid56_parity_stripe(). - Data stripes read (again) for P/Q stripe verification This is triggered by scrub_assemble_read_bios() from scrub_rbio(). Although we can have hit rbio cache and avoid unnecessary read, the chance is very low, as scrub would easily flush the whole rbio cache. This means, even we're just scrubbing a single P/Q stripe, we would read the data stripes twice for the best case scenario. If we need to recover some data stripes, it would cause more reads on the same data stripes, again and again. However before we call raid56_parity_submit_scrub_rbio() we already have all data stripes repaired and their contents ready to use. But RAID56 cache is unaware about the scrub cache, thus RAID56 layer itself still needs to re-read the data stripes. To avoid such cache miss, this patch would: - Introduce a new helper, raid56_parity_cache_data_pages() This function would grab the pages from an array, and copy the content to the rbio, marking all the involved sectors uptodate. The page copy is unavoidable because of the cache pages of rbio are all self managed, thus can not utilize outside pages without screwing up the lifespan. - Use the repaired data stripes as cache inside scrub_raid56_parity_stripe() By this, we ensure all the data sectors of the scrub rbio are already uptodate, and no need to read them again from disk. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/raid56.h | 3 +++ fs/btrfs/scrub.c | 7 +++++++ 3 files changed, 55 insertions(+) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3c08e132d83d..f37b925d587f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2747,3 +2747,48 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } + +/* + * This is for scrub call sites where we already have correct data contents. + * This allows us to avoid reading data stripes again. + * + * Unfortunately here we have to do page copy, other than reusing the pages. + * This is due to the fact rbio has its own page management for its cache. + */ +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical) +{ + const u64 offset_in_full_stripe = data_logical - + rbio->bioc->full_stripe_logical; + const int page_index = offset_in_full_stripe >> PAGE_SHIFT; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int ret; + + /* + * If we hit ENOMEM temporarily, but later at + * raid56_parity_submit_scrub_rbio() time it succeeded, we just do + * the extra read, not a big deal. + * + * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, + * the bio would got proper error number set. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return; + + /* data_logical must be at stripe boundary and inside the full stripe. 
*/ + ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); + ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); + + for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { + struct page *dst = rbio->stripe_pages[page_nr + page_index]; + struct page *src = data_pages[page_nr]; + + memcpy_page(dst, 0, src, 0, PAGE_SIZE); + for (int sector_nr = sectors_per_page * page_index; + sector_nr < sectors_per_page * (page_index + 1); + sector_nr++) + rbio->stripe_sectors[sector_nr].uptodate = true; + } +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0f7f31c8cb98..0e84c9c9293f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -193,6 +193,9 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); +void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, + struct page **data_pages, u64 data_logical); + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bceaa8c2007e..f231883f504a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1972,6 +1972,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_dec(fs_info); goto out; } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_pages(rbio, stripe->pages, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } raid56_parity_submit_scrub_rbio(rbio); wait_for_completion_io(&io_done); ret = blk_status_to_errno(bio->bi_status); From 54d687c13aef3194ea1490cf6d4b5016da1cf0bb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:10 -0400 Subject: [PATCH 024/194] btrfs: move btrfs_check_trunc_cache_free_space into block-rsv.c This is completely related to block rsv's, move it out of the free space cache code and into block-rsv.c. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 19 +++++++++++++++++++ fs/btrfs/block-rsv.h | 2 ++ fs/btrfs/free-space-cache.c | 19 ------------------- fs/btrfs/free-space-cache.h | 2 -- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ac18c43fadad..6279d200cf83 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -541,3 +541,22 @@ try_reserve: return ERR_PTR(ret); } + +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + u64 needed_bytes; + int ret; + + /* 1 for slack space, 1 for updating the inode */ + needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + + btrfs_calc_metadata_size(fs_info, 1); + + spin_lock(&rsv->lock); + if (rsv->reserved < needed_bytes) + ret = -ENOSPC; + else + ret = 0; + spin_unlock(&rsv->lock); + return ret; +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 6dc781709aca..b0bd12b8652f 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -82,6 +82,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize); +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u32 blocksize) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fb606698bf3c..880800418075 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -292,25 +292,6 @@ out: return ret; } -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - u64 needed_bytes; - int ret; - - /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_insert_metadata_size(fs_info, 1) + - btrfs_calc_metadata_size(fs_info, 1); - - spin_lock(&rsv->lock); - if (rsv->reserved < needed_bytes) - ret = -ENOSPC; - else - ret = 0; - spin_unlock(&rsv->lock); - return ret; -} - int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *vfs_inode) diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a855e0483e03..33b4da3271b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -101,8 +101,6 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group); -int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, struct inode *inode); From 4aec05fa5a190d641664c2bca8ad270cf8190b8c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:11 -0400 Subject: [PATCH 025/194] btrfs: remove level argument from btrfs_set_block_flags We just pass in btrfs_header_level(eb) for the level, and we're passing in the eb already, so simply get the level from the eb inside of btrfs_set_block_flags. 
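Caller-side view of the change (both lines are taken from the walk_down_proc()
hunk below, shown here only to illustrate the simplified signature):

  /* Before: the caller passed the level it had just looked up. */
  ret = btrfs_set_disk_extent_flags(trans, eb, flag, btrfs_header_level(eb));

  /* After: the helper derives the level from the eb it is already given. */
  ret = btrfs_set_disk_extent_flags(trans, eb, flag);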
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +---- fs/btrfs/extent-tree.c | 7 +++---- fs/btrfs/extent-tree.h | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b33bc087fc7..0c83cd78a483 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -464,10 +464,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; } if (new_flags != 0) { - int level = btrfs_header_level(buf); - - ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level); + ret = btrfs_set_disk_extent_flags(trans, buf, new_flags); if (ret) return ret; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 581ddb24e113..99b9ab5ccf93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2152,10 +2152,10 @@ out: } int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level) + struct extent_buffer *eb, u64 flags) { struct btrfs_delayed_extent_op *extent_op; + int level = btrfs_header_level(eb); int ret; extent_op = btrfs_alloc_delayed_extent_op(); @@ -5095,8 +5095,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb)); + ret = btrfs_set_disk_extent_flags(trans, eb, flag); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0c958fc1b3b8..429d5c570061 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -141,7 +141,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); + struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, From 85d8a826c7cde17f9cca9c4debecb4538bdb6573 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:12 -0400 Subject: [PATCH 026/194] btrfs: simplify btrfs_check_leaf_* helpers into a single helper We have two helpers for checking leaves, because we have an extra check for debugging in btrfs_mark_buffer_dirty(), and at that stage we may have item data that isn't consistent yet. However we can handle this case internally in the helper, if BTRFS_HEADER_FLAG_WRITTEN is set we know the buffer should be internally consistent, otherwise we need to skip checking the item data. Simplify this helper down a single helper and handle the item data checking logic internally to the helper. 
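Caller-side view (from the btrfs_mark_buffer_dirty() hunk below): the debug
check no longer needs to pick a relaxed variant; the single helper skips the
item data checks itself whenever BTRFS_HEADER_FLAG_WRITTEN is not set:

  /* Before: a separate, relaxed checker for buffers not yet written out. */
  if (btrfs_header_level(buf) == 0 && btrfs_check_leaf_relaxed(buf)) {
          btrfs_print_leaf(buf);
          ASSERT(0);
  }

  /* After: one checker for both cases. */
  if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) {
          btrfs_print_leaf(buf);
          ASSERT(0);
  }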
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++------- fs/btrfs/tree-checker.c | 20 +++++++------------- fs/btrfs/tree-checker.h | 13 +------------ 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f4adda2b4317..922755926ee9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -326,7 +326,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) if (btrfs_header_level(eb)) ret = btrfs_check_node(eb); else - ret = btrfs_check_leaf_full(eb); + ret = btrfs_check_leaf(eb); if (ret < 0) goto error; @@ -583,7 +583,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf_full(eb)) { + if (found_level == 0 && btrfs_check_leaf(eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -4701,12 +4701,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* - * Since btrfs_mark_buffer_dirty() can be called with item pointer set - * but item data not updated. - * So here we should only check item pointers, not item data. + * btrfs_check_leaf() won't check item data if we don't have WRITTEN + * set, so this will only validate the basic structure of the items. */ - if (btrfs_header_level(buf) == 0 && - btrfs_check_leaf_relaxed(buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e2b54793bf0c..2eff4e2f2c47 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1674,7 +1674,7 @@ static int check_leaf_item(struct extent_buffer *leaf, return ret; } -static int check_leaf(struct extent_buffer *leaf, bool check_item_data) +int btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -1807,7 +1807,11 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return -EUCLEAN; } - if (check_item_data) { + /* + * We only want to do this if WRITTEN is set, otherwise the leaf + * may be in some intermediate state and won't appear valid. + */ + if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { /* * Check if the item size and content meet other * criteria @@ -1824,17 +1828,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) return 0; } - -int btrfs_check_leaf_full(struct extent_buffer *leaf) -{ - return check_leaf(leaf, true); -} -ALLOW_ERROR_INJECTION(btrfs_check_leaf_full, ERRNO); - -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf) -{ - return check_leaf(leaf, false); -} +ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); int btrfs_check_node(struct extent_buffer *node) { diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index bfb5efa4e01f..48321e8d91bb 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -40,18 +40,7 @@ struct btrfs_tree_parent_check { u8 level; }; -/* - * Comprehensive leaf checker. - * Will check not only the item pointers, but also every possible member - * in item data. - */ -int btrfs_check_leaf_full(struct extent_buffer *leaf); - -/* - * Less strict leaf checker. - * Will only check item pointers, not reading item data. 
- */ -int btrfs_check_leaf_relaxed(struct extent_buffer *leaf); +int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, From a7b4e6c7aa66632d776cf2b991ff2def076e34b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:13 -0400 Subject: [PATCH 027/194] btrfs: add btrfs_tree_block_status definitions to tree-checker.h We use this in btrfs-progs to determine if we can fix different types of corruptions. We don't care about this in the kernel, however it would be good to share this code between the kernel and btrfs-progs, so add the status definitions so we can start converting the tree-checker code over to using these status flags instead of blanket returning -EUCLEAN. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-checker.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 48321e8d91bb..78ee2896423d 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -40,6 +40,19 @@ struct btrfs_tree_parent_check { u8 level; }; +enum btrfs_tree_block_status { + BTRFS_TREE_BLOCK_CLEAN, + BTRFS_TREE_BLOCK_INVALID_NRITEMS, + BTRFS_TREE_BLOCK_INVALID_PARENT_KEY, + BTRFS_TREE_BLOCK_BAD_KEY_ORDER, + BTRFS_TREE_BLOCK_INVALID_LEVEL, + BTRFS_TREE_BLOCK_INVALID_FREE_SPACE, + BTRFS_TREE_BLOCK_INVALID_OFFSETS, + BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, + BTRFS_TREE_BLOCK_INVALID_ITEM, + BTRFS_TREE_BLOCK_INVALID_OWNER, +}; + int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); From c8d5421563547a4b4ba6fcb9dae6b323dd02b75f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:14 -0400 Subject: [PATCH 028/194] btrfs: use btrfs_tree_block_status for leaf item errors We have a variety of item specific errors that can occur. For now simply put these under the umbrella of BTRFS_TREE_BLOCK_INVALID_ITEM, this can be fleshed out as we need in the future. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 2eff4e2f2c47..63a1086582a2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1620,9 +1620,10 @@ static int check_inode_ref(struct extent_buffer *leaf, /* * Common point to switch the item-specific validation. */ -static int check_leaf_item(struct extent_buffer *leaf, - struct btrfs_key *key, int slot, - struct btrfs_key *prev_key) +static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, + struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) { int ret = 0; struct btrfs_chunk *chunk; @@ -1671,7 +1672,10 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_ref(leaf, key, slot); break; } - return ret; + + if (ret) + return BTRFS_TREE_BLOCK_INVALID_ITEM; + return BTRFS_TREE_BLOCK_CLEAN; } int btrfs_check_leaf(struct extent_buffer *leaf) @@ -1751,7 +1755,6 @@ int btrfs_check_leaf(struct extent_buffer *leaf) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; - int ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -1812,13 +1815,15 @@ int btrfs_check_leaf(struct extent_buffer *leaf) * may be in some intermediate state and won't appear valid. 
*/ if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { + enum btrfs_tree_block_status ret; + /* * Check if the item size and content meet other * criteria */ ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret < 0)) - return ret; + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; } prev_key.objectid = key.objectid; From 924452c80e81ba96bfc64847e983862016345381 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:15 -0400 Subject: [PATCH 029/194] btrfs: extend btrfs_leaf_check to return btrfs_tree_block_status Instead of blanket returning -EUCLEAN for all the failures in btrfs_check_leaf, use btrfs_tree_block_status and return the appropriate status for each failure. Rename the helper to __btrfs_check_leaf and then make a wrapper of btrfs_check_leaf that will return -EUCLEAN to non-clean error codes. This will allow us to have the __btrfs_check_leaf variant in btrfs-progs while keeping the behavior in the kernel consistent. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 36 +++++++++++++++++++++++------------- fs/btrfs/tree-checker.h | 6 ++++++ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 63a1086582a2..870b716b393f 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1678,7 +1678,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, return BTRFS_TREE_BLOCK_CLEAN; } -int btrfs_check_leaf(struct extent_buffer *leaf) +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) { struct btrfs_fs_info *fs_info = leaf->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -1691,7 +1691,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) generic_err(leaf, 0, "invalid level for leaf, have %d expect 0", btrfs_header_level(leaf)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } /* @@ -1714,32 +1714,32 @@ int btrfs_check_leaf(struct extent_buffer *leaf) generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } /* Unknown tree */ if (unlikely(owner == 0)) { generic_err(leaf, 0, "invalid owner, root 0 is not defined"); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OWNER; } /* EXTENT_TREE_V2 can have empty extent trees. 
*/ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) { generic_err(leaf, 0, "invalid root, root %llu must never be empty", owner); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } - return 0; + return BTRFS_TREE_BLOCK_CLEAN; } if (unlikely(nritems == 0)) - return 0; + return BTRFS_TREE_BLOCK_CLEAN; /* * Check the following things to make sure this is a good leaf, and @@ -1765,7 +1765,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) prev_key.objectid, prev_key.type, prev_key.offset, key.objectid, key.type, key.offset); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } item_data_end = (u64)btrfs_item_offset(leaf, slot) + @@ -1784,7 +1784,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) generic_err(leaf, slot, "unexpected item end, have %llu expect %u", item_data_end, item_end_expected); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* @@ -1796,7 +1796,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) generic_err(leaf, slot, "slot end outside of leaf, have %llu expect range [0, %u]", item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* Also check if the item pointer overlaps with btrfs item. */ @@ -1807,7 +1807,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } /* @@ -1823,7 +1823,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) */ ret = check_leaf_item(leaf, &key, slot, &prev_key); if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) - return -EUCLEAN; + return ret; } prev_key.objectid = key.objectid; @@ -1831,6 +1831,16 @@ int btrfs_check_leaf(struct extent_buffer *leaf) prev_key.offset = key.offset; } + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_leaf(struct extent_buffer *leaf) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_leaf(leaf); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; return 0; } ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 78ee2896423d..3b8de6d36141 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -53,6 +53,12 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_INVALID_OWNER, }; +/* + * Exported simply for btrfs-progs which wants to have the + * btrfs_tree_block_status return codes. + */ +enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf); + int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); From c26fa931eb186a748608b4155fe2f4821738b140 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:16 -0400 Subject: [PATCH 030/194] btrfs: add __btrfs_check_node helper This helper returns a btrfs_tree_block_status for the various errors, and then btrfs_check_node() will return -EUCLEAN if it gets anything other than BTRFS_TREE_BLOCK_CLEAN which will be used by the kernel. In the future btrfs-progs will use this helper instead. 
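A rough sketch of how a btrfs-progs style consumer could use the status codes
(repair_node_block() is made up for this example; only __btrfs_check_node()
and the btrfs_tree_block_status values come from the patch):

  enum btrfs_tree_block_status status = __btrfs_check_node(node);

  if (status != BTRFS_TREE_BLOCK_CLEAN)
          /* Hypothetical: act on the specific failure instead of bailing out. */
          ret = repair_node_block(node, status);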
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 29 +++++++++++++++++------------ fs/btrfs/tree-checker.h | 1 + 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 870b716b393f..bd85205a6249 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1845,7 +1845,7 @@ int btrfs_check_leaf(struct extent_buffer *leaf) } ALLOW_ERROR_INJECTION(btrfs_check_leaf, ERRNO); -int btrfs_check_node(struct extent_buffer *node) +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) { struct btrfs_fs_info *fs_info = node->fs_info; unsigned long nr = btrfs_header_nritems(node); @@ -1853,13 +1853,12 @@ int btrfs_check_node(struct extent_buffer *node) int slot; int level = btrfs_header_level(node); u64 bytenr; - int ret = 0; if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", level, BTRFS_MAX_LEVEL - 1); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_LEVEL; } if (unlikely(nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info))) { btrfs_crit(fs_info, @@ -1867,7 +1866,7 @@ int btrfs_check_node(struct extent_buffer *node) btrfs_header_owner(node), node->start, nr == 0 ? "small" : "large", nr, BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - return -EUCLEAN; + return BTRFS_TREE_BLOCK_INVALID_NRITEMS; } for (slot = 0; slot < nr - 1; slot++) { @@ -1878,15 +1877,13 @@ int btrfs_check_node(struct extent_buffer *node) if (unlikely(!bytenr)) { generic_err(node, slot, "invalid NULL node pointer"); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(!IS_ALIGNED(bytenr, fs_info->sectorsize))) { generic_err(node, slot, "unaligned pointer, have %llu should be aligned to %u", bytenr, fs_info->sectorsize); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_INVALID_BLOCKPTR; } if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { @@ -1895,12 +1892,20 @@ int btrfs_check_node(struct extent_buffer *node) key.objectid, key.type, key.offset, next_key.objectid, next_key.type, next_key.offset); - ret = -EUCLEAN; - goto out; + return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } -out: - return ret; + return BTRFS_TREE_BLOCK_CLEAN; +} + +int btrfs_check_node(struct extent_buffer *node) +{ + enum btrfs_tree_block_status ret; + + ret = __btrfs_check_node(node); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return -EUCLEAN; + return 0; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3b8de6d36141..c0861ce1429b 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -58,6 +58,7 @@ enum btrfs_tree_block_status { * btrfs_tree_block_status return codes. */ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf); +enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node); int btrfs_check_leaf(struct extent_buffer *leaf); int btrfs_check_node(struct extent_buffer *node); From 2cac5af16537f8eafaba5e525fbb5a93160ebaff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:17 -0400 Subject: [PATCH 031/194] btrfs: move btrfs_verify_level_key into tree-checker.c This is more a buffer validation helper, move it into the tree-checker files where it makes more sense. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 58 ----------------------------------------- fs/btrfs/disk-io.h | 2 -- fs/btrfs/tree-checker.c | 58 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.h | 2 ++ 4 files changed, 60 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 922755926ee9..83518ed71bfd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -180,64 +180,6 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - int found_level; - struct btrfs_key found_key; - int ret; - - found_level = btrfs_header_level(eb); - if (found_level != level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); - btrfs_err(fs_info, -"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", - eb->start, level, found_level); - return -EIO; - } - - if (!first_key) - return 0; - - /* - * For live tree block (new tree blocks in current transaction), - * we need proper lock context to avoid race, which is impossible here. - * So we only checks tree blocks which is read from disk, whose - * generation <= fs_info->last_trans_committed. - */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) - return 0; - - /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { - btrfs_err(fs_info, - "invalid tree nritems, bytenr=%llu nritems=0 expect >0", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return -EUCLEAN; - } - - if (found_level) - btrfs_node_key_to_cpu(eb, &found_key, 0); - else - btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(first_key, &found_key); - - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); - btrfs_err(fs_info, -"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", - eb->start, parent_transid, first_key->objectid, - first_key->type, first_key->offset, - found_key.objectid, found_key.type, - found_key.offset); - } - return ret; -} - static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4d5772330110..a26ab3cbb449 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -31,8 +31,6 @@ struct btrfs_tree_parent_check; void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info); void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_verify_level_key(struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index bd85205a6249..9e92548a6aa2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1963,3 +1963,61 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) } return 0; } + +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + int found_level; + struct btrfs_key found_key; + int ret; + + found_level = 
btrfs_header_level(eb); + if (found_level != level) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree level check failed\n"); + btrfs_err(fs_info, +"tree level mismatch detected, bytenr=%llu level expected=%u has=%u", + eb->start, level, found_level); + return -EIO; + } + + if (!first_key) + return 0; + + /* + * For live tree block (new tree blocks in current transaction), + * we need proper lock context to avoid race, which is impossible here. + * So we only checks tree blocks which is read from disk, whose + * generation <= fs_info->last_trans_committed. + */ + if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + return 0; + + /* We have @first_key, so this @eb must have at least one item */ + if (btrfs_header_nritems(eb) == 0) { + btrfs_err(fs_info, + "invalid tree nritems, bytenr=%llu nritems=0 expect >0", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return -EUCLEAN; + } + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + ret = btrfs_comp_cpu_keys(first_key, &found_key); + + if (ret) { + WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), + KERN_ERR "BTRFS: tree first key check failed\n"); + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, parent_transid, first_key->objectid, + first_key->type, first_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + } + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index c0861ce1429b..3c2a02a72f64 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -66,5 +66,7 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); +int btrfs_verify_level_key(struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); #endif From f541833c8eea17e3779e0fce81d3c92a3604d80a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:18 -0400 Subject: [PATCH 032/194] btrfs: move split_flags/combine_flags helpers to inode-item.h These are more related to the inode item flags on disk than the in-memory btrfs_inode, move the helpers to inode-item.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 16 ---------------- fs/btrfs/inode-item.h | 16 ++++++++++++++++ fs/btrfs/tree-checker.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0849b85b94fe..08c996023394 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -404,22 +404,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) return true; } -/* - * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two - * separate u32s. These two functions convert between the two representations. 
- */ -static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) -{ - return (flags | ((u64)ro_flags << 32)); -} - -static inline void btrfs_inode_split_flags(u64 inode_item_flags, - u32 *flags, u32 *ro_flags) -{ - *flags = (u32)inode_item_flags; - *ro_flags = (u32)(inode_item_flags >> 32); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index b80aeb715701..ede43b6c6559 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -60,6 +60,22 @@ struct btrfs_truncate_control { bool clear_extent_range; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. + */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9e92548a6aa2..351ba9e90675 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,10 +25,10 @@ #include "compression.h" #include "volumes.h" #include "misc.h" -#include "btrfs_inode.h" #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "inode-item.h" /* * Error message should follow the following format: From a95b7f93602e41f680096106935a76667a8763f1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:19 -0400 Subject: [PATCH 033/194] btrfs: add __KERNEL__ check for btrfs_no_printk We want to override this in btrfs-progs, so wrap this in the __KERNEL__ check so we can easily sync this to btrfs-progs and have our local version of btrfs_no_printk do the work. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/messages.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ac2d1982ba3d..99143bbf78a5 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -7,11 +7,18 @@ struct btrfs_fs_info; +/* + * We want to be able to override this in btrfs-progs. + */ +#ifdef __KERNEL__ + static inline __printf(2, 3) __cold void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } +#endif + #ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ From b3cbfb0dd4a80c359701280f6acfa37131d8ee8b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:20 -0400 Subject: [PATCH 034/194] btrfs: add a btrfs_csum_type_size helper This is needed in btrfs-progs for the tools that convert the checksum types for file systems and a few other things. We don't have it in the kernel as we just want to get the size for the super blocks type. However I don't want to have to manually add this every time we sync ctree.c into btrfs-progs, so add the helper in the kernel with a note so it doesn't get removed by a later cleanup. 
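A rough usage sketch of the new helper (the surrounding variables are made up
for the example; btrfs_csum_type_size(), BTRFS_CSUM_TYPE_XXHASH and
BTRFS_CSUM_SIZE are existing definitions):

  /* Hypothetical: zero only the bytes the target checksum type actually uses. */
  u16 new_type = BTRFS_CSUM_TYPE_XXHASH;
  u8 csum[BTRFS_CSUM_SIZE];

  memset(csum, 0, btrfs_csum_type_size(new_type));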
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 +++++++- fs/btrfs/ctree.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0c83cd78a483..ddd22f16cdff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -150,13 +150,19 @@ static inline void copy_leaf_items(const struct extent_buffer *dst, nr_items * sizeof(struct btrfs_item)); } +/* This exists for btrfs-progs usages. */ +u16 btrfs_csum_type_size(u16 type) +{ + return btrfs_csums[type].size; +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* * csum type is validated at mount time */ - return btrfs_csums[t].size; + return btrfs_csum_type_size(t); } const char *btrfs_super_csum_name(u16 csum_type) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b7ab7fa2b73a..717fca1797d6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -701,6 +701,7 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); From 016f9d0b744202eb170ed91e995fc757dd4adeeb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Sat, 29 Apr 2023 16:07:21 -0400 Subject: [PATCH 035/194] btrfs: rename del_ptr to btrfs_del_ptr and export it This exists internal to ctree.c, however btrfs check needs to use it for some of its operations. I'd rather not duplicate that code inside of btrfs check as this is low level and I want to keep this code in one place, so rename the function to btrfs_del_ptr and export it so that it can be used inside of btrfs-progs safely. Add a comment to make sure this doesn't get removed by a future cleanup. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/ctree.h | 2 ++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ddd22f16cdff..2f2071d64c52 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,8 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot); static const struct btrfs_csums { u16 size; @@ -1122,7 +1120,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - del_ptr(root, path, level + 1, pslot + 1); + btrfs_del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1168,7 +1166,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - del_ptr(root, path, level + 1, pslot); + btrfs_del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -4357,9 +4355,11 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * the tree should have been previously balanced so the deletion does not * empty a node. 
+ * + * This is exported for use inside btrfs-progs, don't un-export it. */ -static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, - int level, int slot) +void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, + int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4414,7 +4414,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *leaf) { WARN_ON(btrfs_header_generation(leaf) != trans->transid); - del_ptr(root, path, 1, path->slots[1]); + btrfs_del_ptr(root, path, 1, path->slots[1]); /* * btrfs_free_extent is expensive, we want to make sure we @@ -4500,7 +4500,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* push_leaf_left fixes the path. * make sure the path still points to our leaf - * for possible call to del_ptr below + * for possible call to btrfs_del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 717fca1797d6..5af61480dde6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,6 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); +void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, + int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, From bb5167e6197b73a38cf35bee788e5cbb6d3f8bfa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 9 May 2023 10:12:01 -0700 Subject: [PATCH 036/194] btrfs: unexport btrfs_run_discard_work and make it static Mark btrfs_run_discard_work static and move it above its callers. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 34 +++++++++++++++++----------------- fs/btrfs/discard.h | 1 - 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index a6d77fe41e1a..944a7340f6a4 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -73,6 +73,23 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, return &discard_ctl->discard_list[block_group->discard_index]; } +/* + * Determine if async discard should be running. + * + * @discard_ctl: discard control + * + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { @@ -544,23 +561,6 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/* - * Determine if async discard should be running. - * - * @discard_ctl: discard control - * - * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
- */ -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) -{ - struct btrfs_fs_info *fs_info = container_of(discard_ctl, - struct btrfs_fs_info, - discard_ctl); - - return (!(fs_info->sb->s_flags & SB_RDONLY) && - test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); -} - /* * Recalculate the base delay. * diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index 57b9202f427f..dddb0f9101ba 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -24,7 +24,6 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group); void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, bool override); -bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); /* Update operations */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); From f18cc97845aa4ae0e795c088c979fe1642b3b8e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 May 2023 07:58:38 -0700 Subject: [PATCH 037/194] btrfs: fix dirty_metadata_bytes for redirtied buffers dirty_metadata_bytes is decremented in both places that clear the dirty bit in a buffer, but only incremented in btrfs_mark_buffer_dirty, which means that a buffer that is redirtied using btrfs_redirty_list_add won't be added to dirty_metadata_bytes, but it will be subtracted when written out, leading an inconsistency in the counter. Move the dirty_metadata_bytes from btrfs_mark_buffer_dirty into set_extent_buffer_dirty to also account for the redirty case, and remove the now unused set_extent_buffer_dirty return value. Fixes: d3575156f662 ("btrfs: zoned: redirty released extent buffers") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +------ fs/btrfs/extent_io.c | 7 ++++--- fs/btrfs/extent_io.h | 2 +- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 83518ed71bfd..112c99dde57f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4621,7 +4621,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); - int was_dirty; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* @@ -4636,11 +4635,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); - was_dirty = set_extent_buffer_dirty(buf); - if (!was_dirty) - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, - buf->len, - fs_info->dirty_metadata_batch); + set_extent_buffer_dirty(buf); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* * btrfs_check_leaf() won't check item data if we don't have WRITTEN diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1adadd5d25d..a829390632a5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4148,7 +4148,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, WARN_ON(atomic_read(&eb->refs) == 0); } -bool set_extent_buffer_dirty(struct extent_buffer *eb) +void set_extent_buffer_dirty(struct extent_buffer *eb) { int i; int num_pages; @@ -4183,13 +4183,14 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) eb->start, eb->len); if (subpage) unlock_page(eb->pages[0]); + percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, + eb->len, + eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < 
num_pages; i++) ASSERT(PageDirty(eb->pages[i])); #endif - - return was_dirty; } void clear_extent_buffer_uptodate(struct extent_buffer *eb) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4341ad978fb8..f937654230d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -262,7 +262,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len); -bool set_extent_buffer_dirty(struct extent_buffer *eb); +void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(const struct extent_buffer *eb); From f880fe6e0b4b127d6e2a007fe68bdf5651677ae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 May 2023 07:58:39 -0700 Subject: [PATCH 038/194] btrfs: don't hold an extra reference for redirtied buffers When btrfs_redirty_list_add redirties a buffer, it also acquires an extra reference that is released on transaction commit. But this is not required as buffers that are dirty or under writeback are never freed (look for calls to extent_buffer_under_io())). Remove the extra reference and the infrastructure used to drop it again. History behind redirty logic: In the first place, it used releasing_list to hold all the to-be-released extent buffers, and decided which buffers to re-dirty at the commit time. Then, in a later version, the behaviour got changed to re-dirty a necessary buffer and add re-dirtied one to the list in btrfs_free_tree_block(). In short, the list was there mostly for the patch series' historical reason. Reviewed-by: Naohiro Aota Signed-off-by: Christoph Hellwig [ add Naohiro's comment regarding history ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/extent_io.c | 1 - fs/btrfs/extent_io.h | 1 - fs/btrfs/transaction.c | 9 --------- fs/btrfs/transaction.h | 3 --- fs/btrfs/zoned.c | 28 ++++------------------------ fs/btrfs/zoned.h | 2 -- 7 files changed, 4 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 112c99dde57f..16224fd82987 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5073,8 +5073,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); - btrfs_free_redirty_list(cur_trans); - cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a829390632a5..d8becf1cdbc0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3557,7 +3557,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, init_rwsem(&eb->lock); btrfs_leak_debug_add_eb(eb); - INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f937654230d3..6f3cfadd232c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -89,7 +89,6 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; - struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 27c616fdfae2..fe0f00e717a8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -374,8 +374,6 @@ loop: 
spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); - INIT_LIST_HEAD(&cur_trans->releasing_ebs); - spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); @@ -2482,13 +2480,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* - * At this point, we should have written all the tree blocks allocated - * in this transaction. So it's now safe to free the redirtyied extent - * buffers. - */ - btrfs_free_redirty_list(cur_trans); - ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index fa728ab80826..8e9fa23bd7fe 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -94,9 +94,6 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; - - spinlock_t releasing_ebs_lock; - struct list_head releasing_ebs; }; enum { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index dac879fe2871..bda301a55cbe 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1602,37 +1602,17 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - - if (!btrfs_is_zoned(fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || - !list_empty(&eb->release_list)) + if (!btrfs_is_zoned(eb->fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) return; + ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bits_nowait(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, EXTENT_DIRTY); - - spin_lock(&trans->releasing_ebs_lock); - list_add_tail(&eb->release_list, &trans->releasing_ebs); - spin_unlock(&trans->releasing_ebs_lock); - atomic_inc(&eb->refs); -} - -void btrfs_free_redirty_list(struct btrfs_transaction *trans) -{ - spin_lock(&trans->releasing_ebs_lock); - while (!list_empty(&trans->releasing_ebs)) { - struct extent_buffer *eb; - - eb = list_first_entry(&trans->releasing_ebs, - struct extent_buffer, release_list); - list_del_init(&eb->release_list); - free_extent_buffer(eb); - } - spin_unlock(&trans->releasing_ebs_lock); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c0570d35fea2..3058ef559c98 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -54,7 +54,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); -void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -179,7 +178,6 @@ static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } -static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } static inline bool btrfs_use_zone_append(struct btrfs_bio 
*bbio) { From b7f9945a1479a97a7aa2cc2b9d36e72f33fcb174 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 11 May 2023 16:01:44 +0800 Subject: [PATCH 039/194] btrfs: handle tree backref walk error properly [BUG] Smatch reports the following errors related to commit ("btrfs: output affected files when relocation fails"): fs/btrfs/inode.c:283 print_data_reloc_error() error: uninitialized symbol 'ref_level'. [CAUSE] That part of code is mostly copied from scrub, but unfortunately scrub code from the beginning is not doing the error handling properly. The offending code looks like this: do { ret = tree_backref_for_extent(); btrfs_warn_rl(); } while (ret != 1); There are several problems involved: - No error handling If that tree_backref_for_extent() failed, we would output the same error again and again, never really exit as it requires ret == 1 to exit. - Always do one extra output As tree_backref_for_extent() only return > 0 if there is no more backref item. This means after the last item we hit, we would output an invalid error message for ret > 0 case. [FIX] Fix the old code by: - Move @ref_root and @ref_level into the if branch And do not initialize them, so we can catch such uninitialized values just like what we do in the inode.c - Explicitly check the return value of tree_backref_for_extent() And handle ret < 0 and ret > 0 cases properly. - No more do {} while () loop Instead go while (true) {} loop since we will handle @ret manually. Reported-by: Dan Carpenter Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++++++---- fs/btrfs/scrub.c | 28 +++++++++++++++++----------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da7e5bf2f6db..333736712bd9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -276,17 +276,25 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off u64 ref_root; u8 ref_level; - do { + while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); + if (ret < 0) { + btrfs_warn_rl(fs_info, + "failed to resolve tree backref for logical %llu: %d", + logical, ret); + break; + } + if (ret > 0) + break; + btrfs_warn_rl(fs_info, "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", logical, mirror_num, (ref_level ? "node" : "leaf"), - (ret < 0 ? -1 : ref_level), - (ret < 0 ? -1 : ref_root)); - } while (ret != 1); + ref_level, ref_root); + } btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f231883f504a..265db0c15a4c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -479,11 +479,8 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * struct extent_buffer *eb; struct btrfs_extent_item *ei; struct scrub_warning swarn; - unsigned long ptr = 0; u64 flags = 0; - u64 ref_root; u32 item_size; - u8 ref_level = 0; int ret; /* Super block error, no need to search extent tree. 
*/ @@ -513,19 +510,28 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * item_size = btrfs_item_size(eb, path->slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - do { + unsigned long ptr = 0; + u8 ref_level; + u64 ref_root; + + while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); + if (ret < 0) { + btrfs_warn(fs_info, + "failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); + break; + } + if (ret > 0) + break; btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", - errstr, swarn.logical, - btrfs_dev_name(dev), - swarn.physical, - ref_level ? "node" : "leaf", - ret < 0 ? -1 : ref_level, - ret < 0 ? -1 : ref_root); - } while (ret != 1); + errstr, swarn.logical, btrfs_dev_name(dev), + swarn.physical, (ref_level ? "node" : "leaf"), + ref_level, ref_root); + } btrfs_release_path(path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; From b9cb105e737f32b9cc18c4f47edabdb0ae66d569 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 12 May 2023 13:44:57 +0800 Subject: [PATCH 040/194] btrfs: scrub: remove more unused functions These functions are defined in the scrub.c file, but last callers were removed in e9255d6c4054 ("btrfs: scrub: remove the old scrub recheck code"). fs/btrfs/scrub.c:553:20: warning: unused function 'scrub_stripe_index_and_offset'. fs/btrfs/scrub.c:543:19: warning: unused function 'scrub_nr_raid_mirrors'. Reported-by: Abaci Robot Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=4937 Signed-off-by: Jiapeng Chong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 265db0c15a4c..d7a14458c4a7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -552,48 +552,6 @@ out: btrfs_free_path(path); } -static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) -{ - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - return 2; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - return 3; - else - return (int)bioc->num_stripes; -} - -static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, - u64 full_stripe_logical, - int nstripes, int mirror, - int *stripe_index, - u64 *stripe_offset) -{ - int i; - - if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? - nstripes - 1 : nstripes - 2; - - /* RAID5/6 */ - for (i = 0; i < nr_data_stripes; i++) { - const u64 data_stripe_start = full_stripe_logical + - (i * BTRFS_STRIPE_LEN); - - if (logical >= data_stripe_start && - logical < data_stripe_start + BTRFS_STRIPE_LEN) - break; - } - - *stripe_index = i; - *stripe_offset = (logical - full_stripe_logical) & - BTRFS_STRIPE_LEN_MASK; - } else { - /* The other RAID type */ - *stripe_index = mirror; - *stripe_offset = 0; - } -} - static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) { int ret = 0; From bf1f4fd3fadb8dd4337ad46d63eb6ebcce3366f5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:02:12 +0100 Subject: [PATCH 041/194] btrfs: use inode_logged() at need_log_inode() At need_log_inode() we directly check the ->logged_trans field of the given inode to check if it was previously logged in the transaction, with the goal of skipping logging the inode again when it's not necessary. 
The ->logged_trans field in not persisted in the inode item or elsewhere, it's only stored in memory (struct btrfs_inode), so it's transient and lost once the inode is evicted and then loaded again. Once an inode is loaded, we are conservative and set ->logged_trans to 0, which may mean that either the inode was never logged in the current transaction or it was logged but evicted before being loaded again. Instead of checking the inode's ->logged_trans field directly, we can use instead the helper inode_logged(), which will really check if the inode was logged before in the current transaction in case we have a ->logged_trans field with a value of 0. This will prevent unnecessarily logging an inode when it's not needed, and in some cases preventing a transaction commit, in case the logging requires a fallback to a transaction commit. The following test script shows a scenario where due to eviction we fallback a transaction commit when trying to fsync a file that was renamed: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 num_init_files=10000 num_new_files=10000 mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $num_init_files; i++)); do echo -n > $MNT/testdir/file_$i done echo -n > $MNT/testdir/foo sync # Add some files so that there's more work in the transaction other # than just renaming file foo. for ((i = 1; i <= $num_new_files; i++)); do echo -n > $MNT/testdir/new_file_$i done # Fsync the directory first. xfs_io -c "fsync" $MNT/testdir # Rename file foo. mv $MNT/testdir/foo $MNT/testdir/bar # Now trigger eviction of the test directory's inode. # Once loaded again, it will have logged_trans set to 0 and # last_unlink_trans set to the current transaction. echo 2 > /proc/sys/vm/drop_caches # Fsync file bar (ex-foo). # Before the patch the fsync would result in a transaction commit # because the inode for file bar has last_unlink_trans set to the # current transaction, so it will attempt to log the parent directory # as well, which will fallback to a full transaction commit because # it also has its last_unlink_trans set to the current transaction, # due to the inode eviction. start=$(date +%s%N) xfs_io -c "fsync" $MNT/testdir/bar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "file fsync took: $dur milliseconds" umount $MNT Before this patch: fsync took 22 milliseconds After this patch: fsync took 8 milliseconds Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d2755d5e338b..dc00baa9dd73 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3252,7 +3252,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, * Returns 1 if the inode was logged before in the transaction, 0 if it was not, * and < 0 on error. */ -static int inode_logged(struct btrfs_trans_handle *trans, +static int inode_logged(const struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path_in) { @@ -5303,7 +5303,7 @@ out: * multiple times when multiple tasks have joined the same log transaction. 
*/ static bool need_log_inode(const struct btrfs_trans_handle *trans, - const struct btrfs_inode *inode) + struct btrfs_inode *inode) { /* * If a directory was not modified, no dentries added or removed, we can @@ -5321,7 +5321,7 @@ static bool need_log_inode(const struct btrfs_trans_handle *trans, * logged_trans will be 0, in which case we have to fully log it since * logged_trans is a transient field, not persisted. */ - if (inode->logged_trans == trans->transid && + if (inode_logged(trans, inode, NULL) == 1 && !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) return false; From d67ba263f4add857ac2a02088c08e1dc2fe1171b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:02:13 +0100 Subject: [PATCH 042/194] btrfs: use inode_logged() at btrfs_record_unlink_dir() At btrfs_record_unlink_dir() we directly check the logged_trans field of the given inodes to check if they were previously logged in the current transaction, and if any of them were, then we can avoid setting the field last_unlink_trans of the directory to the id of the current transaction if we are in a rename path. Avoiding that can later prevent falling back to a transaction commit if anyone attempts to log the directory. However the logged_trans field, store in struct btrfs_inode, is transient, not persisted in the inode item on its subvolume b+tree, so that means that if an inode is evicted and then loaded again, its original value is lost and it's reset to 0. So directly checking the logged_trans field can lead to some false negative, and that only results in a performance impact as mentioned before. Instead of directly checking the logged_trans field of the inodes, use the inode_logged() helper, which will check in the log tree if an inode was logged before in case its logged_trans field has a value of 0. This way we can avoid setting the directory inode's last_unlink_trans and cause future logging attempts of it to fallback to transaction commits. The following test script shows one example where this happens without this patch: $ cat test.sh #!/bin/bash DEV=/dev/nullb0 MNT=/mnt/nullb0 num_init_files=10000 num_new_files=10000 mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT mkdir $MNT/testdir for ((i = 1; i <= $num_init_files; i++)); do echo -n > $MNT/testdir/file_$i done echo -n > $MNT/testdir/foo sync # Add some files so that there's more work in the transaction other # than just renaming file foo. for ((i = 1; i <= $num_new_files; i++)); do echo -n > $MNT/testdir/new_file_$i done # Change the file, fsync it. setfattr -n user.x1 -v 123 $MNT/testdir/foo xfs_io -c "fsync" $MNT/testdir/foo # Now triggger eviction of file foo but no eviction for our test # directory, since it is being used by the process below. This will # set logged_trans of the file's inode to 0 once it is loaded again. ( cd $MNT/testdir while true; do : done ) & pid=$! echo 2 > /proc/sys/vm/drop_caches kill $pid wait $pid # Move foo out of our testdir. This will set last_unlink_trans # of the directory inode to the current transaction, because # logged_trans of both the directory and the file are set to 0. mv $MNT/testdir/foo $MNT/foo # Change file foo again and fsync it. 
# This fsync will result in a transaction commit because the rename # above has set last_unlink_trans of the parent directory to the id # of the current transaction and because our inode for file foo has # last_unlink_trans set to the current transaction, since it was # evicted and reloaded and it was previously modified in the current # transaction (the xattr addition). xfs_io -c "pwrite 0 64K" $MNT/foo start=$(date +%s%N) xfs_io -c "fsync" $MNT/foo end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "file fsync took: $dur milliseconds" umount $MNT Before this patch: fsync took 19 milliseconds After this patch: fsync took 5 milliseconds Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dc00baa9dd73..82da38109b16 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7329,14 +7329,14 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * if this directory was already logged any new * names for this file/dir will get recorded */ - if (dir->logged_trans == trans->transid) + if (inode_logged(trans, dir, NULL) == 1) return; /* * if the inode we're about to unlink was logged, * the log will be properly updated for any new names */ - if (inode->logged_trans == trans->transid) + if (inode_logged(trans, inode, NULL) == 1) return; /* From 1e75ef039d1a13afd0abd794dee9df462d5b7990 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:02:14 +0100 Subject: [PATCH 043/194] btrfs: update comments at btrfs_record_unlink_dir() to be more clear Update the comments at btrfs_record_unlink_dir() so that they mention where new names are logged and where old names are removed. Also, while at it make the width of the comments closer to 80 columns and capitalize the sentences and finish them with punctuation. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 82da38109b16..95d01a122b3f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7326,15 +7326,19 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, mutex_unlock(&inode->log_mutex); /* - * if this directory was already logged any new - * names for this file/dir will get recorded + * If this directory was already logged, any new names will be logged + * with btrfs_log_new_name() and old names will be deleted from the log + * tree with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ if (inode_logged(trans, dir, NULL) == 1) return; /* - * if the inode we're about to unlink was logged, - * the log will be properly updated for any new names + * If the inode we're about to unlink was logged before, the log will be + * properly updated with the new name with btrfs_log_new_name() and the + * old name removed with btrfs_del_dir_entries_in_log() or with + * btrfs_del_inode_ref_in_log(). */ if (inode_logged(trans, inode, NULL) == 1) return; From acfb5a4f1109a4f477f816d572b4f8f65b645bee Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:02:15 +0100 Subject: [PATCH 044/194] btrfs: remove pointless label and goto at btrfs_record_unlink_dir() There's no point of having a label and goto at btrfs_record_unlink_dir() because the function is trivial and can just return early if we are not in a rename context. 
So remove the label and goto and instead return early if we are not in a rename. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 95d01a122b3f..c988eae6a4f2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7325,6 +7325,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, inode->last_unlink_trans = trans->transid; mutex_unlock(&inode->log_mutex); + if (!for_rename) + return; + /* * If this directory was already logged, any new names will be logged * with btrfs_log_new_name() and old names will be deleted from the log @@ -7350,13 +7353,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * properly. So, we have to be conservative and force commits * so the new name gets discovered. */ - if (for_rename) - goto record; - - /* we can safely do the unlink without any special recording */ - return; - -record: mutex_lock(&dir->log_mutex); dir->last_unlink_trans = trans->transid; mutex_unlock(&dir->log_mutex); From 59fcf388172dc4e00a8a98b8b720c063af398bda Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:02:16 +0100 Subject: [PATCH 045/194] btrfs: change for_rename argument of btrfs_record_unlink_dir() to bool The for_rename argument of btrfs_record_unlink_dir() is defined as an integer, but the argument is in fact used as a boolean. So change it to a boolean to make its use more clear. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/tree-log.c | 2 +- fs/btrfs/tree-log.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 333736712bd9..fa781fc863fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4425,7 +4425,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - 0); + false); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); @@ -8993,9 +8993,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), - BTRFS_I(new_inode), 1); + BTRFS_I(new_inode), true); } /* src is a subvolume */ @@ -9261,7 +9261,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), - BTRFS_I(old_inode), 1); + BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c988eae6a4f2..ecb73da5d25a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7309,7 +7309,7 @@ error: */ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename) + bool for_rename) { /* * when we're logging a file, if it hasn't been renamed diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index bdeb5216718f..a550a8a375cd 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -100,7 +100,7 @@ void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root 
*root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - int for_rename); + bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, From 618d1d7da587b44e60922103ee2dc36949641d4d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 May 2023 12:03:44 +0100 Subject: [PATCH 046/194] btrfs: fix comment referring to no longer existing btrfs_clean_tree_block() There's a comment at btrfs_init_new_buffer() that refers to a function named btrfs_clean_tree_block(), however the function was renamed to btrfs_clear_buffer_dirty() in commit 190a83391bc4 ("btrfs: rename btrfs_clean_tree_block to btrfs_clear_buffer_dirty"). So update the comment to refer to the current name. Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 99b9ab5ccf93..5de5b577e7fd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4797,7 +4797,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, !test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) lockdep_owner = BTRFS_FS_TREE_OBJECTID; - /* btrfs_clean_tree_block() accesses generation field. */ + /* btrfs_clear_buffer_dirty() accesses generation field. */ btrfs_set_header_generation(buf, trans->transid); /* From edc728814f9a73cec501e4e5dd677dc9d65841cb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 12 May 2023 07:42:05 +0800 Subject: [PATCH 047/194] btrfs: trigger orphan inode cleanup during START_SYNC ioctl There is an internal error report that scrub found an error in an orphan inode's data. However there are very limited ways to cleanup such orphan inodes: - btrfs_start_pre_rw_mount() This happens at either mount, or RO->RW switch. This is not a viable solution for root fs which may not be unmounted or RO mounted. Furthermore this doesn't cover every subvolume, it only covers the currently cached subvolumes. - btrfs_lookup_dentry() This happens when we first lookup the subvolume dentry. But dentry can be cached thus it's not ensured to be triggered every time. - create_snapshot() This only happens for the created snapshot, not the source one. This means if we didn't trigger orphan items cleanup, there is really no other way to manually trigger it. Add this step to the START_SYNC ioctl. This is a slight change in the semantics of the ioctl but as sync can be potentially slow and is usually paired with WAIT_SYNC ioctl. The errors are not handled because the main point of the ioctl is the async commit, orphan cleanup is a side effect. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index df7603c30686..edbbd5cf23fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3112,6 +3112,13 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, struct btrfs_trans_handle *trans; u64 transid; + /* + * Start orphan cleanup here for the given root in case it hasn't been + * started already by other means. Errors are handled in the other + * functions during transaction commit. 
+ */ + btrfs_orphan_cleanup(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT) From 7f26fb1c13f7445adf7890bf27458081655c9a4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:21 +0200 Subject: [PATCH 048/194] btrfs: mark extent_buffer_under_io static extent_buffer_under_io is only used in extent_io.c, so mark it static. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d8becf1cdbc0..9827403ce2fb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3421,7 +3421,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -int extent_buffer_under_io(const struct extent_buffer *eb) +static int extent_buffer_under_io(const struct extent_buffer *eb) { return (atomic_read(&eb->io_pages) || test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6f3cfadd232c..e11ee09ab26b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -264,7 +264,6 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); -int extent_buffer_under_io(const struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, From 243984b3b99199678b6ec74787f8fed60c58042d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:22 +0200 Subject: [PATCH 049/194] btrfs: subpage: fix error handling in end_bio_subpage_eb_writepage Call btrfs_page_clear_uptodate instead of ClearPageUptodate to properly manage the uptodate bit for the subpage case. Reported-by: Qu Wenruo Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9827403ce2fb..a75e05c4f496 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1915,7 +1915,8 @@ static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); + btrfs_page_clear_uptodate(fs_info, page, + eb->start, eb->len); set_btree_ioerr(page, eb); } From aebcc1596b5c37095385ecd930cf335254828538 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:23 +0200 Subject: [PATCH 050/194] btrfs: move setting the buffer uptodate out of validate_extent_buffer Setting the buffer uptodate in a function that is named as a validation helper is a it confusing. Move the call from validate_extent_buffer to the one of its two callers that didn't already have a duplicate call to set_extent_buffer_uptodate. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 16224fd82987..41045c900c2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -533,9 +533,7 @@ static int validate_extent_buffer(struct extent_buffer *eb, if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; - if (!ret) - set_extent_buffer_uptodate(eb); - else + if (ret) btrfs_err(fs_info, "read time tree block corruption detected on logical %llu mirror %u", eb->start, eb->read_mirror); @@ -627,6 +625,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, goto err; } ret = validate_extent_buffer(eb, &bbio->parent_check); + if (!ret) + set_extent_buffer_uptodate(eb); err: if (ret) { /* From d87e6575e9d1c9d43e223c3fe858e4e453265707 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:24 +0200 Subject: [PATCH 051/194] btrfs: merge verify_parent_transid and btrfs_buffer_uptodate verify_parent_transid is only called by btrfs_buffer_uptodate, which confusingly inverts the return value. Merge the two functions and reflow the parent_transid so that error handling is in a branch. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 45 ++++++++++++++------------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41045c900c2c..d4f7578cda00 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -110,32 +110,32 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. 
*/ -static int verify_parent_transid(struct extent_io_tree *io_tree, - struct extent_buffer *eb, u64 parent_transid, - int atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { + struct inode *btree_inode = eb->pages[0]->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; struct extent_state *cached_state = NULL; - int ret; + int ret = 1; + + if (!extent_buffer_uptodate(eb)) + return 0; if (!parent_transid || btrfs_header_generation(eb) == parent_transid) - return 0; + return 1; if (atomic) return -EAGAIN; lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (extent_buffer_uptodate(eb) && - btrfs_header_generation(eb) == parent_transid) { - ret = 0; - goto out; - } - btrfs_err_rl(eb->fs_info, + if (!extent_buffer_uptodate(eb) || + btrfs_header_generation(eb) != parent_transid) { + btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); - ret = 1; - clear_extent_buffer_uptodate(eb); -out: + clear_extent_buffer_uptodate(eb); + ret = 0; + } unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); return ret; @@ -4600,23 +4600,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_close_devices(fs_info->fs_devices); } -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, - int atomic) -{ - int ret; - struct inode *btree_inode = buf->pages[0]->mapping->host; - - ret = extent_buffer_uptodate(buf); - if (!ret) - return ret; - - ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, - parent_transid, atomic); - if (ret == -EAGAIN) - return ret; - return !ret; -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; From e95382834cf885b478dbe14a66451b863eb35c94 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:25 +0200 Subject: [PATCH 052/194] btrfs: always read the entire extent_buffer Currently read_extent_buffer_pages skips pages that are already uptodate when reading in an extent_buffer. While this reduces the amount of data read, it increases the number of I/O operations as we now need to do multiple I/Os when reading an extent buffer with one or more uptodate pages in the middle of it. On any modern storage device, be that hard drives or SSDs this actually decreases I/O performance. Fortunately this case is pretty rare as the pages are always initially read together and then aged the same way. Besides simplifying the code a bit as-is this will allow for major simplifications to the I/O completion handler later on. Note that the case where all pages are uptodate is still handled by an optimized fast path that does not read any data from disk. 
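To make the I/O count argument concrete, here is a small standalone illustration (not kernel code, names invented for the example): with a 16K node spread over four pages, skipping a single already-uptodate page in the middle turns one contiguous read into two separate I/Os, while reading everything keeps it at one.

#include <stdbool.h>
#include <stdio.h>

/* Count contiguous runs of pages that still need to be read. */
static int count_read_ios(const bool *uptodate, int num_pages)
{
	int ios = 0;

	for (int i = 0; i < num_pages; i++) {
		/* A new I/O starts at the first page of every not-uptodate run. */
		if (!uptodate[i] && (i == 0 || uptodate[i - 1]))
			ios++;
	}
	return ios;
}

int main(void)
{
	bool middle_page_cached[4] = { false, false, true, false };
	bool nothing_cached[4]     = { false, false, false, false };

	printf("skip uptodate middle page: %d I/Os\n",
	       count_read_ios(middle_page_cached, 4));	/* prints 2 */
	printf("read the whole buffer:     %d I/Os\n",
	       count_read_ios(nothing_cached, 4));	/* prints 1 */
	return 0;
}
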
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a75e05c4f496..ede7c88e829e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4315,7 +4315,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, int locked_pages = 0; int all_uptodate = 1; int num_pages; - unsigned long num_reads = 0; struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ, .mirror_num = mirror_num, @@ -4361,10 +4360,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, */ for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (!PageUptodate(page)) { - num_reads++; + if (!PageUptodate(page)) all_uptodate = 0; - } } if (all_uptodate) { @@ -4374,7 +4371,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_reads); + atomic_set(&eb->io_pages, num_pages); /* * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. @@ -4384,13 +4381,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (!PageUptodate(page)) { - ClearPageError(page); - submit_extent_page(&bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0); - } else { - unlock_page(page); - } + ClearPageError(page); + submit_extent_page(&bio_ctrl, page_offset(page), page, + PAGE_SIZE, 0); } submit_one_bio(&bio_ctrl); From b78b98e06fb7f9860da5a7c28e1edbaefc2f7be1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:26 +0200 Subject: [PATCH 053/194] btrfs: don't use btrfs_bio_ctrl for extent buffer reading The btrfs_bio_ctrl machinery is overkill for reading extent_buffers as we always operate on PAGE_SIZE chunks (or one smaller one for the subpage case) that are contiguous and are guaranteed to fit into a single bio. Replace it with open coded btrfs_bio_alloc, __bio_add_page and btrfs_submit_bio calls in a helper function shared between the subpage and node size >= PAGE_SIZE cases. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 99 ++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ede7c88e829e..681e6774124f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -98,22 +98,12 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; - int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; - /* - * This is for metadata read, to provide the extra needed verification - * info. This has to be provided for submit_one_bio(), as - * submit_one_bio() can submit a bio if it ends at stripe boundary. If - * no such parent_check is provided, the metadata can hit false alert at - * endio time. 
- */ - struct btrfs_tree_parent_check *parent_check; - /* * Tell writepage not to lock the state bits for this range, it still * does the unlocking. @@ -124,7 +114,6 @@ struct btrfs_bio_ctrl { static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_bio *bbio = bio_ctrl->bbio; - int mirror_num = bio_ctrl->mirror_num; if (!bbio) return; @@ -132,25 +121,14 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(&bbio->inode->vfs_inode)) { - if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { - /* - * For metadata read, we should have the parent_check, - * and copy it to bbio for metadata verification. - */ - ASSERT(bio_ctrl->parent_check); - memcpy(&bbio->parent_check, - bio_ctrl->parent_check, - sizeof(struct btrfs_tree_parent_check)); - } + if (!is_data_inode(&bbio->inode->vfs_inode)) bbio->bio.bi_opf |= REQ_META; - } if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(bbio, mirror_num); + btrfs_submit_compressed_read(bbio, 0); else - btrfs_submit_bio(bbio, mirror_num); + btrfs_submit_bio(bbio, 0); /* The bbio is owned by the end_io handler now */ bio_ctrl->bbio = NULL; @@ -4243,6 +4221,36 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, + struct btrfs_tree_parent_check *check) +{ + int num_pages = num_extent_pages(eb), i; + struct btrfs_bio *bbio; + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, num_pages); + check_buffer_tree_ref(eb); + + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_READ | REQ_META, eb->fs_info, + end_bio_extent_readpage, NULL); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + memcpy(&bbio->parent_check, check, sizeof(*check)); + if (eb->fs_info->nodesize < PAGE_SIZE) { + __bio_add_page(&bbio->bio, eb->pages[0], eb->len, + eb->start - page_offset(eb->pages[0])); + } else { + for (i = 0; i < num_pages; i++) { + ClearPageError(eb->pages[i]); + __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + } + } + btrfs_submit_bio(bbio, mirror_num); +} + static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) @@ -4251,11 +4259,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; struct extent_state *cached_state = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; int ret; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -4283,18 +4286,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = 0; - atomic_set(&eb->io_pages, 1); - check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; - btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); - btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - submit_extent_page(&bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); - submit_one_bio(&bio_ctrl); + + __read_extent_buffer_pages(eb, mirror_num, check); if (wait != WAIT_COMPLETE) { free_extent_state(cached_state); 
return 0; @@ -4315,11 +4310,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, int locked_pages = 0; int all_uptodate = 1; int num_pages; - struct btrfs_bio_ctrl bio_ctrl = { - .opf = REQ_OP_READ, - .mirror_num = mirror_num, - .parent_check = check, - }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -4369,24 +4359,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, goto unlock_exit; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_pages); - /* - * It is possible for release_folio to clear the TREE_REF bit before we - * set io_pages. See check_buffer_tree_ref for a more detailed comment. - */ - check_buffer_tree_ref(eb); - bio_ctrl.end_io_func = end_bio_extent_readpage; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - - ClearPageError(page); - submit_extent_page(&bio_ctrl, page_offset(page), page, - PAGE_SIZE, 0); - } - - submit_one_bio(&bio_ctrl); + __read_extent_buffer_pages(eb, mirror_num, check); if (wait != WAIT_COMPLETE) return 0; From e19493107675d48eb207b91d9a775c811f247e27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:27 +0200 Subject: [PATCH 054/194] btrfs: remove the mirror_num argument to btrfs_submit_compressed_read Given that read recovery for data I/O is handled in the storage layer, the mirror_num argument to btrfs_submit_compressed_read is always 0, so remove it. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/compression.h | 2 +- fs/btrfs/extent_io.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 14f5f25049a0..04cd5de4f00f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -472,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) +void btrfs_submit_compressed_read(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -538,7 +538,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) if (memstall) psi_memstall_leave(&pflags); - btrfs_submit_bio(&cb->bbio, mirror_num); + btrfs_submit_bio(&cb->bbio, 0); return; out_free_compressed_pages: diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 19ab2abeddc0..b462ab106f43 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -93,7 +93,7 @@ void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int nr_pages, blk_opf_t write_flags, bool writeback); -void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); +void btrfs_submit_compressed_read(struct btrfs_bio *bbio); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 681e6774124f..ea99b2a3755e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -126,7 +126,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) - btrfs_submit_compressed_read(bbio, 0); + 
btrfs_submit_compressed_read(bbio); else btrfs_submit_bio(bbio, 0); From 046b562b20a5cfe205fb78c6f8a1a8b74f01d303 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:28 +0200 Subject: [PATCH 055/194] btrfs: use a separate end_io handler for read_extent_buffer Now that we always use a single bio to read an extent_buffer, the buffer can be passed to the end_io handler as private data. This allows implementing a much simplified dedicated end I/O handler for metadata reads. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 105 +------------------------------------------ fs/btrfs/disk-io.h | 5 +-- fs/btrfs/extent_io.c | 80 +++++++++++++++------------------ 3 files changed, 41 insertions(+), 149 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d4f7578cda00..db04a957b445 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -427,8 +427,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb, - struct btrfs_tree_parent_check *check) +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -541,107 +541,6 @@ out: return ret; } -static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror, struct btrfs_tree_parent_check *check) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - struct extent_buffer *eb; - bool reads_done; - int ret = 0; - - ASSERT(check); - - /* - * We don't allow bio merge for subpage metadata read, so we should - * only get one eb for each endio hook. - */ - ASSERT(end == start + fs_info->nodesize - 1); - ASSERT(PagePrivate(page)); - - eb = find_extent_buffer(fs_info, start); - /* - * When we are reading one tree block, eb must have been inserted into - * the radix tree. If not, something is wrong. - */ - ASSERT(eb); - - reads_done = atomic_dec_and_test(&eb->io_pages); - /* Subpage read must finish in page read */ - ASSERT(reads_done); - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, check); - if (ret < 0) - goto err; - - set_extent_buffer_uptodate(eb); - - free_extent_buffer(eb); - return ret; -err: - /* - * end_bio_extent_readpage decrements io_pages in case of error, - * make sure it has something to decrement. - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - free_extent_buffer(eb); - return ret; -} - -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror) -{ - struct extent_buffer *eb; - int ret = 0; - int reads_done; - - ASSERT(page->private); - - if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror, - &bbio->parent_check); - - eb = (struct extent_buffer *)page->private; - - /* - * The pending IO might have been the only thing that kept this buffer - * in memory. 
Make sure we have a ref for all this other checks - */ - atomic_inc(&eb->refs); - - reads_done = atomic_dec_and_test(&eb->io_pages); - if (!reads_done) - goto err; - - eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { - ret = -EIO; - goto err; - } - ret = validate_extent_buffer(eb, &bbio->parent_check); - if (!ret) - set_extent_buffer_uptodate(eb); -err: - if (ret) { - /* - * our io error hook is going to dec the io pages - * again, we have to make sure it has something - * to decrement - */ - atomic_inc(&eb->io_pages); - clear_extent_buffer_uptodate(eb); - } - free_extent_buffer(eb); - - return ret; -} - #ifdef CONFIG_MIGRATION static int btree_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a26ab3cbb449..b03767f4d7ed 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -82,9 +82,8 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, - struct page *page, u64 start, u64 end, - int mirror); +int btrfs_validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea99b2a3755e..4581ee5a0835 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -663,35 +663,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } -/* - * Find extent buffer for a givne bytenr. - * - * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking - * in endio context. - */ -static struct extent_buffer *find_extent_buffer_readpage( - struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) -{ - struct extent_buffer *eb; - - /* - * For regular sectorsize, we can use page->private to grab extent - * buffer - */ - if (fs_info->nodesize >= PAGE_SIZE) { - ASSERT(PagePrivate(page) && page->private); - return (struct extent_buffer *)page->private; - } - - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); - ASSERT(eb); - return eb; -} - /* * after a readpage IO is done, we need to: * clear the uptodate bits on error @@ -713,7 +684,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * larger than UINT_MAX, u32 here is enough. 
*/ u32 bio_offset = 0; - int mirror; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -753,11 +723,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = bbio->mirror_num; - if (uptodate && !is_data_inode(inode) && - btrfs_validate_metadata_buffer(bbio, page, start, end, mirror)) - uptodate = false; - if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -778,13 +743,6 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } - } else if (!is_data_inode(inode)) { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); } /* Update page status and unlock. */ @@ -4221,6 +4179,42 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +{ + struct extent_buffer *eb = bbio->private; + bool uptodate = !bbio->bio.bi_status; + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + u32 bio_offset = 0; + + atomic_inc(&eb->refs); + eb->read_mirror = bbio->mirror_num; + + if (uptodate && + btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) + uptodate = false; + + if (uptodate) { + set_extent_buffer_uptodate(eb); + } else { + clear_extent_buffer_uptodate(eb); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + } + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + atomic_dec(&eb->io_pages); + end_page_read(bvec->bv_page, uptodate, eb->start + bio_offset, + bvec->bv_len); + bio_offset += bvec->bv_len; + } + + unlock_extent(&bbio->inode->io_tree, eb->start, + eb->start + bio_offset - 1, NULL); + free_extent_buffer(eb); + + bio_put(&bbio->bio); +} + static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, struct btrfs_tree_parent_check *check) { @@ -4234,7 +4228,7 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - end_bio_extent_readpage, NULL); + extent_buffer_read_end_io, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; From 3d66b4b27d2b7f5e74071256acea38108f9dbeb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:29 +0200 Subject: [PATCH 056/194] btrfs: do not try to unlock the extent for non-subpage metadata reads Only subpage metadata reads lock the extent. Don't try to unlock it and waste cycles in the extent tree lookup for PAGE_SIZE or larger metadata. 
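The shape of the change is easy to model outside the kernel: the range lock is taken only on the subpage submission path, so the completion side mirrors the exact same nodesize check before dropping it. The snippet below is a minimal standalone userspace sketch of that pairing; it is an illustration only, not btrfs code, and all model_* names are invented for the example (a plain mutex stands in for the extent lock):

/* Minimal userspace sketch (not kernel code): the extent lock is modeled
 * by a plain mutex, taken only for "subpage" reads and therefore dropped
 * under the very same condition when the read completes. */
#include <pthread.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

static pthread_mutex_t model_extent_lock = PTHREAD_MUTEX_INITIALIZER;

static void model_submit_read(unsigned int nodesize)
{
	if (nodesize < MODEL_PAGE_SIZE)	/* subpage metadata only */
		pthread_mutex_lock(&model_extent_lock);
	printf("read submitted, nodesize=%u\n", nodesize);
}

static void model_read_done(unsigned int nodesize)
{
	printf("read completed, nodesize=%u\n", nodesize);
	if (nodesize < MODEL_PAGE_SIZE)	/* mirror the submission-side check */
		pthread_mutex_unlock(&model_extent_lock);
}

int main(void)
{
	model_submit_read(16384);	/* nodesize >= page size: no locking at all */
	model_read_done(16384);
	model_submit_read(2048);	/* subpage: lock taken and released */
	model_read_done(2048);
	return 0;
}

In the actual end_io handler the guard is eb->fs_info->nodesize < PAGE_SIZE, exactly as in the hunk below.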
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4581ee5a0835..e35dfae57e50 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4208,8 +4208,10 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) bio_offset += bvec->bv_len; } - unlock_extent(&bbio->inode->io_tree, eb->start, - eb->start + bio_offset - 1, NULL); + if (eb->fs_info->nodesize < PAGE_SIZE) { + unlock_extent(&bbio->inode->io_tree, eb->start, + eb->start + bio_offset - 1, NULL); + } free_extent_buffer(eb); bio_put(&bbio->bio); From 9fdd160160f002ac2604c5b8f90598febad44ae7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:30 +0200 Subject: [PATCH 057/194] btrfs: return bool from lock_extent_buffer_for_io lock_extent_buffer_for_io never returns a negative error value, so switch the return value to a simple bool. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ keep noinline_for_stack ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e35dfae57e50..898c608c9b0b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1629,18 +1629,17 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * * May try to flush write bio if we can't get the lock. * - * Return 0 if the extent buffer doesn't need to be submitted. - * (E.g. the extent buffer is not dirty) - * Return >0 is the extent buffer is submitted to bio. - * Return <0 if something went wrong, no page is locked. + * Return %false if the extent buffer doesn't need to be submitted (e.g. the + * extent buffer is not dirty) + * Return %true is the extent buffer is submitted to bio. 
*/ -static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, +static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; int flush = 0; - int ret = 0; + bool ret = false; if (!btrfs_try_tree_write_lock(eb)) { submit_write_bio(bio_ctrl, 0); @@ -1651,7 +1650,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) - return 0; + return false; if (!flush) { submit_write_bio(bio_ctrl, 0); flush = 1; @@ -1678,7 +1677,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - ret = 1; + ret = true; } else { spin_unlock(&eb->refs_lock); } @@ -2012,7 +2011,6 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) u64 page_start = page_offset(page); int bit_start = 0; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - int ret; /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { @@ -2058,25 +2056,13 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret == 0) { - free_extent_buffer(eb); - continue; + if (lock_extent_buffer_for_io(eb, bio_ctrl)) { + write_one_subpage_eb(eb, bio_ctrl); + submitted++; } - if (ret < 0) { - free_extent_buffer(eb); - goto cleanup; - } - write_one_subpage_eb(eb, bio_ctrl); free_extent_buffer(eb); - submitted++; } return submitted; - -cleanup: - /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(bio_ctrl, ret); - return ret; } /* @@ -2155,8 +2141,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, bio_ctrl); - if (ret <= 0) { + if (!lock_extent_buffer_for_io(eb, bio_ctrl)) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) btrfs_put_block_group(cache); From 50b21d7a066f9a702b1d54ca11fc577302e875cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:31 +0200 Subject: [PATCH 058/194] btrfs: submit a writeback bio per extent_buffer Stop trying to cluster writes of multiple extent_buffers into a single bio. There is no need for that as the blk_plug mechanism used all the way up in writeback_inodes_wb gives us the same I/O pattern even with multiple bios. Removing the clustering simplifies lock_extent_buffer_for_io a lot and will also allow passing the eb as private data to the end I/O handler. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 102 ++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 898c608c9b0b..40b612cf80a9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1627,41 +1627,24 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) /* * Lock extent buffer status and pages for writeback. * - * May try to flush write bio if we can't get the lock. - * * Return %false if the extent buffer doesn't need to be submitted (e.g. 
the * extent buffer is not dirty) * Return %true is the extent buffer is submitted to bio. */ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages; - int flush = 0; bool ret = false; + int i; - if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - btrfs_tree_lock(eb); - } - - if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_lock(eb); + while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) + if (wbc->sync_mode != WB_SYNC_ALL) return false; - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - while (1) { - wait_on_extent_buffer_writeback(eb); - btrfs_tree_lock(eb); - if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) - break; - btrfs_tree_unlock(eb); - } + wait_on_extent_buffer_writeback(eb); + btrfs_tree_lock(eb); } /* @@ -1693,19 +1676,8 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (!trylock_page(p)) { - if (!flush) { - submit_write_bio(bio_ctrl, 0); - flush = 1; - } - lock_page(p); - } - } - + for (i = 0; i < num_extent_pages(eb); i++) + lock_page(eb->pages[i]); return ret; } @@ -1936,11 +1908,16 @@ static void prepare_eb_write(struct extent_buffer *eb) * Page locking is only utilized at minimum to keep the VMM code happy. */ static void write_one_subpage_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) + struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; bool no_dirty_ebs = false; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .end_io_func = end_bio_subpage_eb_writepage, + }; prepare_eb_write(eb); @@ -1954,40 +1931,43 @@ static void write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; - - submit_extent_page(bio_ctrl, eb->start, page, eb->len, + submit_extent_page(&bio_ctrl, eb->start, page, eb->len, eb->start - page_offset(page)); unlock_page(page); + submit_one_bio(&bio_ctrl); /* * Submission finished without problem, if no range of the page is * dirty anymore, we have submitted a page. Update nr_written in wbc. 
*/ if (no_dirty_ebs) - bio_ctrl->wbc->nr_to_write--; + wbc->nr_to_write--; } static noinline_for_stack void write_one_eb(struct extent_buffer *eb, - struct btrfs_bio_ctrl *bio_ctrl) + struct writeback_control *wbc) { u64 disk_bytenr = eb->start; int i, num_pages; + struct btrfs_bio_ctrl bio_ctrl = { + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .end_io_func = end_bio_extent_buffer_writepage, + }; prepare_eb_write(eb); - bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; - num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); - submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); + submit_extent_page(&bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); disk_bytenr += PAGE_SIZE; - bio_ctrl->wbc->nr_to_write--; + wbc->nr_to_write--; unlock_page(p); } + submit_one_bio(&bio_ctrl); } /* @@ -2004,7 +1984,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, * Return >=0 for the number of submitted extent buffers. * Return <0 for fatal error. */ -static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) +static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2056,8 +2036,8 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) if (!eb) continue; - if (lock_extent_buffer_for_io(eb, bio_ctrl)) { - write_one_subpage_eb(eb, bio_ctrl); + if (lock_extent_buffer_for_io(eb, wbc)) { + write_one_subpage_eb(eb, wbc); submitted++; } free_extent_buffer(eb); @@ -2085,7 +2065,7 @@ static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, +static int submit_eb_page(struct page *page, struct writeback_control *wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2097,7 +2077,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, bio_ctrl); + return submit_eb_subpage(page, wbc); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2130,8 +2110,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, * If for_sync, this hole will be filled with * trasnsaction commit. 
*/ - if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && - !bio_ctrl->wbc->for_sync) + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) ret = -EAGAIN; else ret = 0; @@ -2141,12 +2120,12 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, *eb_context = eb; - if (!lock_extent_buffer_for_io(eb, bio_ctrl)) { + if (!lock_extent_buffer_for_io(eb, wbc)) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) btrfs_put_block_group(cache); free_extent_buffer(eb); - return ret; + return 0; } if (cache) { /* @@ -2155,7 +2134,7 @@ static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - write_one_eb(eb, bio_ctrl); + write_one_eb(eb, wbc); free_extent_buffer(eb); return 1; } @@ -2164,11 +2143,6 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct btrfs_bio_ctrl bio_ctrl = { - .wbc = wbc, - .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, - }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -2210,7 +2184,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); + ret = submit_eb_page(&folio->page, wbc, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -2271,8 +2245,6 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&bio_ctrl, ret); - btrfs_zoned_meta_io_unlock(fs_info); return ret; } From 81a79b6ae4519a97c5924d68e1fb77d283cb051d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:32 +0200 Subject: [PATCH 059/194] btrfs: move page locking from lock_extent_buffer_for_io to write_one_eb Locking the pages in lock_extent_buffer_for_io only for the non-subpage case is very confusing. Move it to write_one_eb to mirror the subpage case and simplify the code. Now lock_extent_buffer_for_io does not leave all the pages locked and each is individually locked/unlocked in write_one_eb. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40b612cf80a9..6284dfd43c91 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1636,7 +1636,6 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e { struct btrfs_fs_info *fs_info = eb->fs_info; bool ret = false; - int i; btrfs_tree_lock(eb); while (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { @@ -1664,20 +1663,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e } else { spin_unlock(&eb->refs_lock); } - btrfs_tree_unlock(eb); - - /* - * Either we don't need to submit any tree block, or we're submitting - * subpage eb. - * Subpage metadata doesn't use page locking at all, so we can skip - * the page locking. 
- */ - if (!ret || fs_info->nodesize < PAGE_SIZE) - return ret; - - for (i = 0; i < num_extent_pages(eb); i++) - lock_page(eb->pages[i]); return ret; } @@ -1960,6 +1946,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; + lock_page(p); clear_page_dirty_for_io(p); set_page_writeback(p); submit_extent_page(&bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); From b51e6b4bda5bddbd002ace239eb5eadd4d729039 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:33 +0200 Subject: [PATCH 060/194] btrfs: don't use btrfs_bio_ctrl for extent buffer writing The btrfs_bio_ctrl machinery is overkill for writing extent_buffers as we always operate on PAGE_SIZE chunks (or one smaller one for the subpage case) that are contiguous and are guaranteed to fit into a single bio. Replace it with open coded btrfs_bio_alloc, __bio_add_page and btrfs_submit_bio calls. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 45 ++++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6284dfd43c91..3d60dde197f1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -121,9 +121,6 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) /* Caller should ensure the bio has at least some range added */ ASSERT(bbio->bio.bi_iter.bi_size); - if (!is_data_inode(&bbio->inode->vfs_inode)) - bbio->bio.bi_opf |= REQ_META; - if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) btrfs_submit_compressed_read(bbio); @@ -1899,11 +1896,7 @@ static void write_one_subpage_eb(struct extent_buffer *eb, struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; bool no_dirty_ebs = false; - struct btrfs_bio_ctrl bio_ctrl = { - .wbc = wbc, - .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .end_io_func = end_bio_subpage_eb_writepage, - }; + struct btrfs_bio *bbio; prepare_eb_write(eb); @@ -1917,10 +1910,17 @@ static void write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - submit_extent_page(&bio_ctrl, eb->start, page, eb->len, - eb->start - page_offset(page)); + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), + eb->fs_info, end_bio_subpage_eb_writepage, NULL); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + __bio_add_page(&bbio->bio, page, eb->len, eb->start - page_offset(page)); + wbc_account_cgroup_owner(wbc, page, eb->len); unlock_page(page); - submit_one_bio(&bio_ctrl); + btrfs_submit_bio(bbio, 0); + /* * Submission finished without problem, if no range of the page is * dirty anymore, we have submitted a page. Update nr_written in wbc. 
@@ -1932,16 +1932,21 @@ static void write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack void write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc) { - u64 disk_bytenr = eb->start; + struct btrfs_bio *bbio; int i, num_pages; - struct btrfs_bio_ctrl bio_ctrl = { - .wbc = wbc, - .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .end_io_func = end_bio_extent_buffer_writepage, - }; prepare_eb_write(eb); + bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, + REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), + eb->fs_info, end_bio_extent_buffer_writepage, + NULL); + bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; + bio_set_dev(&bbio->bio, eb->fs_info->fs_devices->latest_dev->bdev); + wbc_init_bio(wbc, &bbio->bio); + bbio->inode = BTRFS_I(eb->fs_info->btree_inode); + bbio->file_offset = eb->start; + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -1949,12 +1954,12 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, lock_page(p); clear_page_dirty_for_io(p); set_page_writeback(p); - submit_extent_page(&bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); - disk_bytenr += PAGE_SIZE; + __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); + wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); wbc->nr_to_write--; unlock_page(p); } - submit_one_bio(&bio_ctrl); + btrfs_submit_bio(bbio, 0); } /* From cd88a4fdbf1e3d55d852dd67c38155ee3ee3469f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:34 +0200 Subject: [PATCH 061/194] btrfs: use a separate end_io handler for extent_buffer writing Now that we always use a single bio to write an extent_buffer, the buffer can be passed to the end_io handler as private data. This allows to simplify the metadata write end I/O handler, and merge the subpage end_io handler into the main one. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 128 +++++++++---------------------------------- 1 file changed, 27 insertions(+), 101 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3d60dde197f1..eebcef557bd8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1614,13 +1614,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - /* * Lock extent buffer status and pages for writeback. * @@ -1664,13 +1657,11 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e return ret; } -static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) +static void set_btree_ioerr(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - btrfs_page_set_error(fs_info, page, eb->start, eb->len); - if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) - return; + set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); /* * A read may stumble upon this buffer later, make sure that it gets an @@ -1684,7 +1675,7 @@ static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) * return a 0 because we are readonly if we don't modify the err seq for * the superblock. 
*/ - mapping_set_error(page->mapping, -EIO); + mapping_set_error(eb->fs_info->btree_inode->i_mapping, -EIO); /* * If writeback for a btree extent that doesn't belong to a log tree @@ -1759,102 +1750,38 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -/* - * The endio function for subpage extent buffer write. - * - * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() - * after all extent buffers in the page has finished their writeback. - */ -static void end_bio_subpage_eb_writepage(struct btrfs_bio *bbio) +static void extent_buffer_write_end_io(struct btrfs_bio *bbio) { - struct bio *bio = &bbio->bio; - struct btrfs_fs_info *fs_info; - struct bio_vec *bvec; + struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; + bool uptodate = !bbio->bio.bi_status; struct bvec_iter_all iter_all; - - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->nodesize < PAGE_SIZE); - - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 bvec_end = bvec_start + bvec->bv_len - 1; - u64 cur_bytenr = bvec_start; - - ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); - - /* Iterate through all extent buffers in the range */ - while (cur_bytenr <= bvec_end) { - struct extent_buffer *eb; - int done; - - /* - * Here we can't use find_extent_buffer(), as it may - * try to lock eb->refs_lock, which is not safe in endio - * context. - */ - eb = find_extent_buffer_nolock(fs_info, cur_bytenr); - ASSERT(eb); - - cur_bytenr = eb->start + eb->len; - - ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); - done = atomic_dec_and_test(&eb->io_pages); - ASSERT(done); - - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); - set_btree_ioerr(page, eb); - } - - btrfs_subpage_clear_writeback(fs_info, page, eb->start, - eb->len); - end_extent_buffer_writeback(eb); - /* - * free_extent_buffer() will grab spinlock which is not - * safe in endio context. Thus here we manually dec - * the ref. 
- */ - atomic_dec(&eb->refs); - } - } - bio_put(bio); -} - -static void end_bio_extent_buffer_writepage(struct btrfs_bio *bbio) -{ - struct bio *bio = &bbio->bio; struct bio_vec *bvec; - struct extent_buffer *eb; - int done; - struct bvec_iter_all iter_all; + u32 bio_offset = 0; - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + if (!uptodate) + set_btree_ioerr(eb); + + bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + u64 start = eb->start + bio_offset; struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - done = atomic_dec_and_test(&eb->io_pages); + atomic_dec(&eb->io_pages); - if (bio->bi_status || - test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { - ClearPageUptodate(page); - set_btree_ioerr(page, eb); + if (!uptodate) { + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); } - - end_page_writeback(page); - - if (!done) - continue; - - end_extent_buffer_writeback(eb); + btrfs_page_clear_writeback(fs_info, page, start, len); + bio_offset += len; } - bio_put(bio); + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); + + bio_put(&bbio->bio); } static void prepare_eb_write(struct extent_buffer *eb) @@ -1912,7 +1839,7 @@ static void write_one_subpage_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, end_bio_subpage_eb_writepage, NULL); + eb->fs_info, extent_buffer_write_end_io, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; @@ -1939,8 +1866,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, end_bio_extent_buffer_writepage, - NULL); + eb->fs_info, extent_buffer_write_end_io, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, eb->fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); From 31d89399dad0cf5524bd0e4e2c2827ae7004b2ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:35 +0200 Subject: [PATCH 062/194] btrfs: remove the extent_buffer lookup in btree block checksumming The checksumming of btree blocks always operates on the entire extent_buffer, and because btree blocks are always allocated contiguously on disk they are never split by btrfs_submit_bio. Simplify the checksumming code by finding the extent_buffer in the btrfs_bio private data instead of trying to search through the bio_vec. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 120 +++++++++------------------------------------ 1 file changed, 24 insertions(+), 96 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index db04a957b445..384c8110ef2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -254,12 +254,34 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, return ret; } -static int csum_one_extent_buffer(struct extent_buffer *eb) +/* + * Checksum a dirty tree block before IO. 
+ */ +blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) { + struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; + u64 found_start = btrfs_header_bytenr(eb); u8 result[BTRFS_CSUM_SIZE]; int ret; + /* Btree blocks are always contiguous on disk. */ + if (WARN_ON_ONCE(bbio->file_offset != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) + return BLK_STS_IOERR; + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON_ONCE(found_start != 0); + return BLK_STS_OK; + } + + if (WARN_ON_ONCE(found_start != eb->start)) + return BLK_STS_IOERR; + if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start, + eb->len))) + return BLK_STS_IOERR; + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE) == 0); @@ -286,8 +308,7 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return BLK_STS_OK; error: btrfs_print_tree(eb, 0); @@ -301,99 +322,6 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return ret; -} - -/* Checksum all dirty extent buffers in one bio_vec */ -static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, - struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 bvec_start = page_offset(page) + bvec->bv_offset; - u64 cur; - int ret = 0; - - for (cur = bvec_start; cur < bvec_start + bvec->bv_len; - cur += fs_info->nodesize) { - struct extent_buffer *eb; - bool uptodate; - - eb = find_extent_buffer(fs_info, cur); - uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, - fs_info->nodesize); - - /* A dirty eb shouldn't disappear from buffer_radix */ - if (WARN_ON(!eb)) - return -EUCLEAN; - - if (WARN_ON(cur != btrfs_header_bytenr(eb))) { - free_extent_buffer(eb); - return -EUCLEAN; - } - if (WARN_ON(!uptodate)) { - free_extent_buffer(eb); - return -EUCLEAN; - } - - ret = csum_one_extent_buffer(eb); - free_extent_buffer(eb); - if (ret < 0) - return ret; - } - return ret; -} - -/* - * Checksum a dirty tree block before IO. This has extra checks to make sure - * we only fill in the checksum field in the first page of a multi-page block. - * For subpage extent buffers we need bvec to also read the offset in the page. - */ -static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - u64 start = page_offset(page); - u64 found_start; - struct extent_buffer *eb; - - if (fs_info->nodesize < PAGE_SIZE) - return csum_dirty_subpage_buffers(fs_info, bvec); - - eb = (struct extent_buffer *)page->private; - if (page != eb->pages[0]) - return 0; - - found_start = btrfs_header_bytenr(eb); - - if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { - WARN_ON(found_start != 0); - return 0; - } - - /* - * Please do not consolidate these warnings into a single if. - * It is useful to know what went wrong. 
- */ - if (WARN_ON(found_start != start)) - return -EUCLEAN; - if (WARN_ON(!PageUptodate(page))) - return -EUCLEAN; - - return csum_one_extent_buffer(eb); -} - -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) -{ - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - struct bvec_iter iter; - struct bio_vec bv; - int ret = 0; - - bio_for_each_segment(bv, &bbio->bio, iter) { - ret = csum_dirty_buffer(fs_info, &bv); - if (ret) - break; - } - return errno_to_blk_status(ret); } From 113fa05c2fa1a9ee45a31909e6d78666e426adca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:36 +0200 Subject: [PATCH 063/194] btrfs: remove the io_pages field in struct extent_buffer No need to track the number of pages under I/O now that each extent_buffer is read and written using a single bio. For the read side we need to grab an extra reference for the duration of the I/O to prevent eviction, though. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 17 +++++------------ fs/btrfs/extent_io.h | 1 - 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eebcef557bd8..d8e6c8d5ed98 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1767,8 +1767,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - atomic_dec(&eb->io_pages); - if (!uptodate) { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); @@ -1791,7 +1789,6 @@ static void prepare_eb_write(struct extent_buffer *eb) unsigned long end; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - atomic_set(&eb->io_pages, num_extent_pages(eb)); /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); @@ -3235,8 +3232,7 @@ static void __free_extent_buffer(struct extent_buffer *eb) static int extent_buffer_under_io(const struct extent_buffer *eb) { - return (atomic_read(&eb->io_pages) || - test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || + return (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) || test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } @@ -3372,7 +3368,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - atomic_set(&eb->io_pages, 0); ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); @@ -3489,9 +3484,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * adequately protected by the refcount, but the TREE_REF bit and * its corresponding reference are not. To protect against this * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io after they set eb->io_pages. Note that once io is - * initiated, TREE_REF can no longer be cleared, so that is the - * moment at which any such race is best fixed. + * which trigger io. Note that once io is initiated, TREE_REF can no + * longer be cleared, so that is the moment at which any such race is + * best fixed. 
*/ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -4062,7 +4057,6 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) struct bio_vec *bvec; u32 bio_offset = 0; - atomic_inc(&eb->refs); eb->read_mirror = bbio->mirror_num; if (uptodate && @@ -4077,7 +4071,6 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) } bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - atomic_dec(&eb->io_pages); end_page_read(bvec->bv_page, uptodate, eb->start + bio_offset, bvec->bv_len); bio_offset += bvec->bv_len; @@ -4100,8 +4093,8 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; - atomic_set(&eb->io_pages, num_pages); check_buffer_tree_ref(eb); + atomic_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index e11ee09ab26b..217c433bb999 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,7 +79,6 @@ struct extent_buffer { struct btrfs_fs_info *fs_info; spinlock_t refs_lock; atomic_t refs; - atomic_t io_pages; int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; From 011134f444dcc4192b9945b963d4b8340db3667d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:37 +0200 Subject: [PATCH 064/194] btrfs: stop using PageError for extent_buffers PageError is only used to limit the uptodate check in assert_eb_page_uptodate. But we have a much more useful flag indicating the exact condition we are about with the EXTENT_BUFFER_WRITE_ERR flag, so use that instead and help the kernel toward eventually removing PageError. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d8e6c8d5ed98..16e439fb3d80 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1767,10 +1767,8 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - if (!uptodate) { + if (!uptodate) btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } btrfs_page_clear_writeback(fs_info, page, start, len); bio_offset += len; } @@ -3948,7 +3946,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, continue; lock_page(page); btree_clear_page_dirty(page); - ClearPageError(page); unlock_page(page); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -4107,10 +4104,8 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, __bio_add_page(&bbio->bio, eb->pages[0], eb->len, eb->start - page_offset(eb->pages[0])); } else { - for (i = 0; i < num_pages; i++) { - ClearPageError(eb->pages[i]); + for (i = 0; i < num_pages; i++) __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); - } } btrfs_submit_bio(bbio, mirror_num); } @@ -4150,7 +4145,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return 0; } - btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); __read_extent_buffer_pages(eb, mirror_num, check); @@ -4392,18 +4386,16 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * looked up. 
We don't want to complain in this case, as the page was * valid before, we just didn't write it out. Instead we want to catch * the case where we didn't actually read the block properly, which - * would have !PageUptodate && !PageError, as we clear PageError before - * reading. + * would have !PageUptodate and !EXTENT_BUFFER_WRITE_ERR. */ - if (fs_info->nodesize < PAGE_SIZE) { - bool uptodate, error; + if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; - uptodate = btrfs_subpage_test_uptodate(fs_info, page, - eb->start, eb->len); - error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate && !error); + if (fs_info->nodesize < PAGE_SIZE) { + WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len)); } else { - WARN_ON(!PageUptodate(page) && !PageError(page)); + WARN_ON(!PageUptodate(page)); } } From f3d315eb9372868bce92e7b24f2b92e061fe6fcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:38 +0200 Subject: [PATCH 065/194] btrfs: don't check for uptodate pages in read_extent_buffer_pages The only place that reads in pages and thus marks them uptodate for the btree inode is read_extent_buffer_pages. Which means that either pages are already uptodate from an old buffer when creating a new one in alloc_extent_buffer, or they will be updated by ca call to read_extent_buffer_pages. This means the checks for uptodate pages in read_extent_buffer_pages and read_extent_buffer_subpage are superfluous and can be removed. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 16e439fb3d80..40f06392a7cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4136,10 +4136,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || - PageUptodate(page) || - btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) { unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); return 0; @@ -4166,7 +4163,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, int i; struct page *page; int locked_pages = 0; - int all_uptodate = 1; int num_pages; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -4201,21 +4197,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, } locked_pages++; } - /* - * We need to firstly lock all pages to make sure that - * the uptodate bit of our pages won't be affected by - * clear_extent_buffer_uptodate(). - */ - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageUptodate(page)) - all_uptodate = 0; - } - - if (all_uptodate) { - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - goto unlock_exit; - } __read_extent_buffer_pages(eb, mirror_num, check); From 9e2aff90fc2ad2b94933762f5442fef9ee7a692e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:39 +0200 Subject: [PATCH 066/194] btrfs: stop using lock_extent in btrfs_buffer_uptodate The only other place that locks extents on the btree inode is read_extent_buffer_subpage while reading in the partial page for a buffer. 
This means locking the extent in btrfs_buffer_uptodate does not synchronize with anything on non-subpage file systems, and on subpage file systems it only waits for a parallel read(-ahead) to finish, which seems to be counter to what the callers actually expect. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 384c8110ef2a..a3366fe546b1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -112,11 +112,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) */ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic) { - struct inode *btree_inode = eb->pages[0]->mapping->host; - struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; - struct extent_state *cached_state = NULL; - int ret = 1; - if (!extent_buffer_uptodate(eb)) return 0; @@ -126,7 +121,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atom if (atomic) return -EAGAIN; - lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (!extent_buffer_uptodate(eb) || btrfs_header_generation(eb) != parent_transid) { btrfs_err_rl(eb->fs_info, @@ -134,11 +128,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atom eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); clear_extent_buffer_uptodate(eb); - ret = 0; + return 0; } - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return ret; + return 1; } static bool btrfs_supported_super_csum(u16 csum_type) From d7172f52e9933b6ec9305e7fe6e829e3939dba04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:40 +0200 Subject: [PATCH 067/194] btrfs: use per-buffer locking for extent_buffer reading Instead of locking and unlocking every page or the extent, just add a new EXTENT_BUFFER_READING bit that mirrors EXTENT_BUFFER_WRITEBACK for synchronizing threads trying to read an extent_buffer and to wait for I/O completion. 
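The synchronization scheme itself is easy to prototype in userspace: whoever wins the race to claim the READING flag submits the I/O, and every other caller (including WAIT_COMPLETE readers) simply sleeps until the completion handler clears the flag and wakes the waiters. The following is a minimal, self-contained model of that scheme; it is an illustration only, not kernel code, the model_* names are invented, and a mutex/condvar pair stands in for the kernel's atomic bitops and bit wait-queues:

/* Userspace model of the EXTENT_BUFFER_READING scheme (illustration only):
 * the first caller to claim the flag owns the read, everyone else waits
 * for it to be cleared again. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct model_eb {
	pthread_mutex_t lock;
	pthread_cond_t done;
	bool reading;	/* models EXTENT_BUFFER_READING */
	bool uptodate;	/* models EXTENT_BUFFER_UPTODATE */
};

/* Models the read end_io: record the result, drop the flag, wake waiters. */
static void model_read_end_io(struct model_eb *eb, bool ok)
{
	pthread_mutex_lock(&eb->lock);
	eb->uptodate = ok;
	eb->reading = false;
	pthread_cond_broadcast(&eb->done);
	pthread_mutex_unlock(&eb->lock);
}

/* Models read_extent_buffer_pages(): claim the flag or wait for the owner. */
static int model_read_pages(struct model_eb *eb, bool wait_complete)
{
	int ret = 0;

	pthread_mutex_lock(&eb->lock);
	if (!eb->uptodate && !eb->reading) {
		/* The winner of the claim submits the read. */
		eb->reading = true;
		pthread_mutex_unlock(&eb->lock);
		model_read_end_io(eb, true);	/* pretend the I/O finished */
		pthread_mutex_lock(&eb->lock);
	}
	if (wait_complete) {
		while (eb->reading)		/* wait_on_bit_io() equivalent */
			pthread_cond_wait(&eb->done, &eb->lock);
		if (!eb->uptodate)
			ret = -EIO;
	}
	pthread_mutex_unlock(&eb->lock);
	return ret;
}

int main(void)
{
	struct model_eb eb = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
	};

	printf("first read: %d\n", model_read_pages(&eb, true));
	printf("repeat read: %d\n", model_read_pages(&eb, true));
	return 0;
}

In the kernel patch the same roles are played by test_and_set_bit(), wait_on_bit_io() and clear_bit()/wake_up_bit() on EXTENT_BUFFER_READING, as the diff below shows.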
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 150 ++++++++++--------------------------------- fs/btrfs/extent_io.h | 2 + 2 files changed, 37 insertions(+), 115 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40f06392a7cf..650cb68dcfb6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4049,6 +4049,7 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) static void extent_buffer_read_end_io(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; + struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; struct bvec_iter_all iter_all; struct bio_vec *bvec; @@ -4068,26 +4069,47 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) } bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { - end_page_read(bvec->bv_page, uptodate, eb->start + bio_offset, - bvec->bv_len); - bio_offset += bvec->bv_len; + u64 start = eb->start + bio_offset; + struct page *page = bvec->bv_page; + u32 len = bvec->bv_len; + + if (uptodate) + btrfs_page_set_uptodate(fs_info, page, start, len); + else + btrfs_page_clear_uptodate(fs_info, page, start, len); + + bio_offset += len; } - if (eb->fs_info->nodesize < PAGE_SIZE) { - unlock_extent(&bbio->inode->io_tree, eb->start, - eb->start + bio_offset - 1, NULL); - } + clear_bit(EXTENT_BUFFER_READING, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING); free_extent_buffer(eb); bio_put(&bbio->bio); } -static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, - struct btrfs_tree_parent_check *check) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int num_pages = num_extent_pages(eb), i; struct btrfs_bio *bbio; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + return 0; + + /* + * We could have had EXTENT_BUFFER_UPTODATE cleared by the write + * operation, which could potentially still be in flight. In this case + * we simply want to return an error. + */ + if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) + return -EIO; + + /* Someone else is already reading the buffer, just wait for it. 
*/ + if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) + goto done; + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); @@ -4108,117 +4130,15 @@ static void __read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); } btrfs_submit_bio(bbio, mirror_num); -} -static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num, - struct btrfs_tree_parent_check *check) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; - struct page *page = eb->pages[0]; - struct extent_state *cached_state = NULL; - int ret; - - ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); - ASSERT(PagePrivate(page)); - ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; - - if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state)) - return -EAGAIN; - } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - if (ret < 0) - return ret; - } - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) { - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, - &cached_state); - return 0; - } - - btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - - __read_extent_buffer_pages(eb, mirror_num, check); - if (wait != WAIT_COMPLETE) { - free_extent_state(cached_state); - return 0; - } - - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, &cached_state); - if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return -EIO; - return 0; -} - -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, - struct btrfs_tree_parent_check *check) -{ - int i; - struct page *page; - int locked_pages = 0; - int num_pages; - - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) - return 0; - - /* - * We could have had EXTENT_BUFFER_UPTODATE cleared by the write - * operation, which could potentially still be in flight. In this case - * we simply want to return an error. - */ - if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) - return -EIO; - - if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num, check); - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (wait == WAIT_NONE) { - /* - * WAIT_NONE is only utilized by readahead. If we can't - * acquire the lock atomically it means either the eb - * is being read out or under modification. - * Either way the eb will be or has been cached, - * readahead can exit safely. 
- */ - if (!trylock_page(page)) - goto unlock_exit; - } else { - lock_page(page); - } - locked_pages++; - } - - __read_extent_buffer_pages(eb, mirror_num, check); - - if (wait != WAIT_COMPLETE) - return 0; - - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - wait_on_page_locked(page); - if (!PageUptodate(page)) +done: + if (wait == WAIT_COMPLETE) { + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return -EIO; } return 0; - -unlock_exit: - while (locked_pages > 0) { - locked_pages--; - page = eb->pages[locked_pages]; - unlock_page(page); - } - return 0; } static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 217c433bb999..2d91ca91dca5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,6 +29,8 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + /* Indicate that extent buffer pages a being read */ + EXTENT_BUFFER_READING, }; /* these are flags for __process_pages_contig */ From 46672a44b023461d13f063bc95bf832f8124177f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:24:41 +0200 Subject: [PATCH 068/194] btrfs: merge write_one_subpage_eb into write_one_eb Most of the code in write_one_subpage_eb and write_one_eb is shared, so merge the two functions into one. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 80 ++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 650cb68dcfb6..b38fbdd70a6c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1808,54 +1808,11 @@ static void prepare_eb_write(struct extent_buffer *eb) } } -/* - * Unlike the work in write_one_eb(), we rely completely on extent locking. - * Page locking is only utilized at minimum to keep the VMM code happy. - */ -static void write_one_subpage_eb(struct extent_buffer *eb, - struct writeback_control *wbc) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; - bool no_dirty_ebs = false; - struct btrfs_bio *bbio; - - prepare_eb_write(eb); - - /* clear_page_dirty_for_io() in subpage helper needs page locked */ - lock_page(page); - btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); - - /* Check if this is the last dirty bit to update nr_written */ - no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, - eb->start, eb->len); - if (no_dirty_ebs) - clear_page_dirty_for_io(page); - - bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, - REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, extent_buffer_write_end_io, eb); - bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; - __bio_add_page(&bbio->bio, page, eb->len, eb->start - page_offset(page)); - wbc_account_cgroup_owner(wbc, page, eb->len); - unlock_page(page); - btrfs_submit_bio(bbio, 0); - - /* - * Submission finished without problem, if no range of the page is - * dirty anymore, we have submitted a page. Update nr_written in wbc. 
- */ - if (no_dirty_ebs) - wbc->nr_to_write--; -} - static noinline_for_stack void write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_bio *bbio; - int i, num_pages; prepare_eb_write(eb); @@ -1863,22 +1820,35 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), eb->fs_info, extent_buffer_write_end_io, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; - bio_set_dev(&bbio->bio, eb->fs_info->fs_devices->latest_dev->bdev); + bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; + if (fs_info->nodesize < PAGE_SIZE) { + struct page *p = eb->pages[0]; lock_page(p); - clear_page_dirty_for_io(p); - set_page_writeback(p); - __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); - wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); - wbc->nr_to_write--; + btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + eb->len)) { + clear_page_dirty_for_io(p); + wbc->nr_to_write--; + } + __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); + wbc_account_cgroup_owner(wbc, p, eb->len); unlock_page(p); + } else { + for (int i = 0; i < num_extent_pages(eb); i++) { + struct page *p = eb->pages[i]; + + lock_page(p); + clear_page_dirty_for_io(p); + set_page_writeback(p); + __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); + wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); + wbc->nr_to_write--; + unlock_page(p); + } } btrfs_submit_bio(bbio, 0); } @@ -1950,7 +1920,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) continue; if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_subpage_eb(eb, wbc); + write_one_eb(eb, wbc); submitted++; } free_extent_buffer(eb); From 4693893bf8d01c37a8952a3ea2a1bdfeb4106277 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:35 +0800 Subject: [PATCH 069/194] btrfs: reduce struct btrfs_fs_devices size by moving fsid_change Pack bool fsid_change and bool seeding with other bool declarations in the struct btrfs_fs_devices, approximately 6 bytes is saved, depending on the config. before: 512 bytes after: 496 bytes Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5cbbee32748c..236ae696c984 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -281,7 +281,6 @@ enum btrfs_read_policy { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; - bool fsid_change; struct list_head fs_list; /* @@ -337,7 +336,6 @@ struct btrfs_fs_devices { struct list_head alloc_list; struct list_head seed_list; - bool seeding; int opened; @@ -347,6 +345,8 @@ struct btrfs_fs_devices { bool rotating; /* Devices support TRIM/discard commands */ bool discardable; + bool fsid_change; + bool seeding; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ From 19c4c49ca9d0eb14987386598e44a533a367f745 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:36 +0800 Subject: [PATCH 070/194] btrfs: streamline fsid checks in alloc_fs_devices We currently have redundant checks for the non-null value of fsid simplify it. 
And, no one is using alloc_fs_devices() with a NULL metadata_uuid while fsid is not NULL, add an assert() to verify this condition. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1a7620680f50..c98b45c350db 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -370,6 +370,8 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, { struct btrfs_fs_devices *fs_devs; + ASSERT(fsid || !metadata_fsid); + fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -380,13 +382,12 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); - if (fsid) - memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - if (metadata_fsid) - memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); - else if (fsid) - memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); + if (fsid) { + memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, + metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + } return fs_devs; } From c6930d7d11e30f55dee1e306bc3a3105fde56f93 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:37 +0800 Subject: [PATCH 071/194] btrfs: merge calls to alloc_fs_devices in device_list_add Simplify has_metadata_uuid checks - by localizing the has_metadata_uuid checked within alloc_fs_devices()'s second argument, it improves the code readability. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c98b45c350db..de60a55dd7d3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -791,12 +791,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { - if (has_metadata_uuid) - fs_devices = alloc_fs_devices(disk_super->fsid, - disk_super->metadata_uuid); - else - fs_devices = alloc_fs_devices(disk_super->fsid, NULL); - + fs_devices = alloc_fs_devices(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); From f62c302e6dfe7bb03b35157e1c1b7cdaeabd54f9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:38 +0800 Subject: [PATCH 072/194] btrfs: add comment about metadata_uuid in btrfs_fs_devices Add comment about metadata_uuid in btrfs_fs_devices. No functional change. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 236ae696c984..24f30542ef7c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -280,7 +280,19 @@ enum btrfs_read_policy { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* + * UUID written into the btree blocks: + * + * - If metadata_uuid != fsid then super block must have + * BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag set. 
+ * + * - Following shall be true at all times: + * - metadata_uuid == btrfs_header::fsid + * - metadata_uuid == btrfs_dev_item::fsid + */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + struct list_head fs_list; /* From 413fb1bc1d3224e508c698cd82e5ff1dc7d16d64 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:39 +0800 Subject: [PATCH 073/194] btrfs: return bool from check_tree_block_fsid instead of int Simplify the return type of check_tree_block_fsid() from int (1 or 0) to bool. Its only user is interested in knowing the success or failure. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a3366fe546b1..8102d2aa3f62 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -317,7 +317,7 @@ error: return errno_to_blk_status(ret); } -static int check_tree_block_fsid(struct extent_buffer *eb) +static bool check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; @@ -337,13 +337,13 @@ static int check_tree_block_fsid(struct extent_buffer *eb) metadata_uuid = fs_devices->fsid; if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) - return 0; + return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) - return 0; + return false; - return 1; + return true; } /* Do basic extent buffer checks at read time */ From 1a8983450090c97bdc498d021fd74fc7e42cc38c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:40 +0800 Subject: [PATCH 074/194] btrfs: simplify fsid and metadata_uuid comparisons Refactor the functions find_fsid() and find_fsid_with_metadata_uuid(), as they currently share a common set of code to compare the fsid and metadata_uuid. Create a common helper function, match_fsid_fs_devices(). Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index de60a55dd7d3..1affe63a84d6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -427,6 +427,21 @@ void __exit btrfs_cleanup_fs_uuids(void) } } +static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid, const u8 *metadata_fsid) +{ + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0) + return false; + + if (!metadata_fsid) + return true; + + if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) + return false; + + return true; +} + static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -436,15 +451,8 @@ static noinline struct btrfs_fs_devices *find_fsid( /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (metadata_fsid) { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 - && memcmp(metadata_fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } else { - if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) - return fs_devices; - } + if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid)) + return fs_devices; } return NULL; } @@ -462,14 +470,14 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * at all and the CHANGING_FSID_V2 flag set. 
*/ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(disk_super->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, + fs_devices->fsid)) return fs_devices; - } } + /* * Handle scanned device having completed its fsid change but * belonging to a fs_devices that was created by a device that From a3c54b0be1a218bdd8e42abf9ecfd9bddf1eadee Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:41 +0800 Subject: [PATCH 075/194] btrfs: simplify how changed fsid and metadata_uuid is checked We often check if the metadata_uuid is not the same as fsid, and then we check if the given fsid matches the metadata_uuid. This patch refactors this logic into function match_fsid_changed and utilize it. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 47 +++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1affe63a84d6..a4bfec088617 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -457,6 +457,19 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } +/* + * First check if the metadata_uuid is different from the fsid in the given + * fs_devices. Then check if the given fsid is the same as the metadata_uuid + * in the fs_devices. If it is, return true; otherwise, return false. + */ +static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, + const u8 *fsid) +{ + return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; +} + static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( struct btrfs_super_block *disk_super) { @@ -485,13 +498,11 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( * CHANGING_FSID_V2 flag set. 
*/ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(fs_devices->metadata_uuid, - fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && - memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) return fs_devices; - } } return find_fsid(disk_super->fsid, disk_super->metadata_uuid); @@ -682,18 +693,16 @@ static struct btrfs_fs_devices *find_fsid_inprogress( struct btrfs_fs_devices *fs_devices; list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { + if (fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; - } } return find_fsid(disk_super->fsid, NULL); } - static struct btrfs_fs_devices *find_fsid_changed( struct btrfs_super_block *disk_super) { @@ -710,10 +719,7 @@ static struct btrfs_fs_devices *find_fsid_changed( */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { /* Changed UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0 && + if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && memcmp(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE) != 0) return fs_devices; @@ -744,11 +750,10 @@ static struct btrfs_fs_devices *find_fsid_reverted_metadata( * fs_devices equal to the FSID of the disk. */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, disk_super->fsid, - BTRFS_FSID_SIZE) == 0 && - fs_devices->fsid_change) + if (!fs_devices->fsid_change) + continue; + + if (check_fsid_changed(fs_devices, disk_super->fsid)) return fs_devices; } From 25984a5ae8f144e823468d39e483face534e45d1 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:42 +0800 Subject: [PATCH 076/194] btrfs: consolidate uuid comparisons in btrfs_validate_super There are three ways the fsid is validated in btrfs_validate_super(): - verify that super_copy::fsid is the same as fs_devices::fsid - if the metadata_uuid flag is set, verify if super_copy::metadata_uuid and fs_devices::metadata_uuid are the same. - a few lines below, often missed out, verify if dev_item::fsid is the same as fs_devices::metadata_uuid. The function btrfs_validate_super() contains multiple if-statements with memcmp() to check UUIDs. This patch consolidates them into a single location. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8102d2aa3f62..6e0d4f99ab04 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2399,6 +2399,14 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0) { + btrfs_err(fs_info, + "dev_item UUID does not match metadata fsid: %pU != %pU", + fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); + ret = -EINVAL; + } + /* * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. 
@@ -2411,14 +2419,6 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, - BTRFS_FSID_SIZE) != 0) { - btrfs_err(fs_info, - "dev_item UUID does not match metadata fsid: %pU != %pU", - fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); - ret = -EINVAL; - } - /* * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later From d85512d54e15c3b8f17dbab44b3d41c1d956902a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 24 May 2023 20:02:43 +0800 Subject: [PATCH 077/194] btrfs: add and fix comments in btrfs_fs_devices Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 24f30542ef7c..16fc640cabd3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -330,34 +330,31 @@ struct btrfs_fs_devices { */ struct btrfs_device *latest_dev; - /* all of the devices in the FS, protected by a mutex - * so we can safely walk it to write out the supers without - * worrying about add/remove by the multi-device code. - * Scrubbing super can kick off supers writing by holding - * this mutex lock. + /* + * All of the devices in the filesystem, protected by a mutex so we can + * safely walk it to write out the super blocks without worrying about + * adding/removing by the multi-device code. Scrubbing super block can + * kick off supers writing by holding this mutex lock. */ struct mutex device_list_mutex; /* List of all devices, protected by device_list_mutex */ struct list_head devices; - /* - * Devices which can satisfy space allocation. Protected by - * chunk_mutex - */ + /* Devices which can satisfy space allocation. Protected by * chunk_mutex. */ struct list_head alloc_list; struct list_head seed_list; + /* Count fs-devices opened. */ int opened; - /* set when we find or add a device that doesn't have the - * nonrot flag set - */ + /* Set when we find or add a device that doesn't have the nonrot flag set. */ bool rotating; - /* Devices support TRIM/discard commands */ + /* Devices support TRIM/discard commands. */ bool discardable; bool fsid_change; + /* The filesystem is a seed filesystem. */ bool seeding; struct btrfs_fs_info *fs_info; @@ -369,7 +366,7 @@ struct btrfs_fs_devices { enum btrfs_chunk_allocation_policy chunk_alloc_policy; - /* Policy used to read the mirrored stripes */ + /* Policy used to read the mirrored stripes. */ enum btrfs_read_policy read_policy; }; From 85724171b302914bb8999b9df091fd4616a36eb7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:18 +0200 Subject: [PATCH 078/194] btrfs: fix the btrfs_get_global_root return value btrfs_grab_root returns either the root or NULL, and the callers of btrfs_get_global_root expect it to return the same. But all the more recently added roots instead return an ERR_PTR, so fix this. 
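For illustration, a minimal sketch of the convention this restores; the names
(my_root, my_grab_root) are invented for the example and are not the btrfs
helpers. A grab-style lookup either takes a reference and returns the pointer,
or returns NULL, never an ERR_PTR, so callers get by with a plain NULL check:

    #include <linux/refcount.h>

    struct my_root {
            refcount_t refs;
    };

    /* Take a reference if the root is present, otherwise return NULL. */
    static struct my_root *my_grab_root(struct my_root *root)
    {
            if (root && refcount_inc_not_zero(&root->refs))
                    return root;
            return NULL;
    }

A caller written for this convention only tests for NULL, so a branch that
instead hands back ERR_PTR(-ENOENT) would be treated as a valid pointer, which
is why the recently added cases need fixing.
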
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6e0d4f99ab04..4d347995fcaa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1183,19 +1183,13 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, if (objectid == BTRFS_CSUM_TREE_OBJECTID) return btrfs_grab_root(btrfs_global_root(fs_info, &key)); if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->quota_root); if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); + return btrfs_grab_root(fs_info->uuid_root); if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) - return btrfs_grab_root(fs_info->block_group_root) ? - fs_info->block_group_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) { - struct btrfs_root *root = btrfs_global_root(fs_info, &key); - - return btrfs_grab_root(root) ? root : ERR_PTR(-ENOENT); - } + return btrfs_grab_root(fs_info->block_group_root); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return btrfs_grab_root(btrfs_global_root(fs_info, &key)); return NULL; } From e91909aace336a182da20bcbe507995ac24f74bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:19 +0200 Subject: [PATCH 079/194] btrfs: convert btrfs_get_global_root to use a switch statement Use a switch statement instead of an endless chain of if statements to make the code a little cleaner. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4d347995fcaa..c53c82cd0885 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1172,25 +1172,28 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, .offset = 0, }; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) + switch (objectid) { + case BTRFS_ROOT_TREE_OBJECTID: return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + case BTRFS_EXTENT_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + case BTRFS_CHUNK_TREE_OBJECTID: return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) + case BTRFS_DEV_TREE_OBJECTID: return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) + case BTRFS_CSUM_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) + case BTRFS_QUOTA_TREE_OBJECTID: return btrfs_grab_root(fs_info->quota_root); - if (objectid == BTRFS_UUID_TREE_OBJECTID) + case BTRFS_UUID_TREE_OBJECTID: return btrfs_grab_root(fs_info->uuid_root); - if (objectid == BTRFS_BLOCK_GROUP_TREE_OBJECTID) + case BTRFS_BLOCK_GROUP_TREE_OBJECTID: return btrfs_grab_root(fs_info->block_group_root); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); - return NULL; + default: + return NULL; + } } int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, From 
25ac047c9d3df373de35ba4ee3f0c902874a1fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 23 May 2023 10:40:20 +0200 Subject: [PATCH 080/194] btrfs: remove a pointless NULL check in btrfs_lookup_fs_root btrfs_grab_root already checks for a NULL root itself. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c53c82cd0885..06dab9d018c2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1157,8 +1157,7 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)root_id); - if (root) - root = btrfs_grab_root(root); + root = btrfs_grab_root(root); spin_unlock(&fs_info->fs_roots_radix_lock); return root; } From dc5646c15cd6f320b8881781336041fcdb58ff97 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:21 +0200 Subject: [PATCH 081/194] btrfs: open code set_extent_defrag The helper is used only once. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 4 +++- fs/btrfs/extent-io-tree.h | 8 -------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 8065341d831a..4e7a1e0a0441 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1040,7 +1040,9 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, clear_extent_bit(&inode->io_tree, start, start + len - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, + cached_state, GFP_NOFS); /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 21766e49ec02..ea344e5ca24f 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -202,14 +202,6 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, cached_state, GFP_NOFS); } -static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DEFRAG, - cached_state, GFP_NOFS); -} - static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { From 66240ab115907677a9b9954e936a9d6211783cbe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:23 +0200 Subject: [PATCH 082/194] btrfs: open code set_extent_delalloc The helper is used once in fs code and a few times in the self test code. 
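The pattern applied across these open-coding patches can be sketched as
follows; the names (set_bits, my_tree, BIT_DELALLOC) are placeholders for the
example, not the btrfs API. A one-line static inline wrapper whose only job is
to supply default arguments is dropped, and its handful of callers spell the
underlying call out directly:

    /* Before: a thin wrapper that only fills in the extra bit and gfp mask. */
    static inline int set_bits_delalloc(struct my_tree *tree, u64 start, u64 end,
                                        struct my_state **cached)
    {
            return set_bits(tree, start, end, BIT_DELALLOC, cached, GFP_NOFS);
    }

    /* After: the single caller passes the bit and mask itself. */
    ret = set_bits(tree, start, end, BIT_DELALLOC, cached, GFP_NOFS);

This keeps the full argument list visible at the call site at the cost of a
slightly longer line.
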
Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 9 --------- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tests/extent-io-tests.c | 6 +++--- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ea344e5ca24f..e5289d67b6b7 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -193,15 +193,6 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, u32 extra_bits, - struct extent_state **cached_state) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | extra_bits, - cached_state, GFP_NOFS); -} - static inline int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa781fc863fc..8dd979b2f084 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2922,8 +2922,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, - cached_state); + return set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state, GFP_NOFS); } /* see btrfs_writepage_start_hook for details on why this is required */ diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index dfc5c7fa6038..b9de94a50868 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -159,7 +159,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); + set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -190,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); + set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, From eea8686e6830a487d9f09dd12673ac374351f803 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:26 +0200 Subject: [PATCH 083/194] btrfs: open code set_extent_new The helper is used only once. 
Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 6 ------ fs/btrfs/extent-tree.c | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index e5289d67b6b7..293e49377104 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -193,12 +193,6 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5de5b577e7fd..5c7c72042193 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4832,8 +4832,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else - set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL, GFP_NOFS); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, From fe1a598c42a02eb9b3efec0001369d3153eb6ef2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:28 +0200 Subject: [PATCH 084/194] btrfs: open code set_extent_dirty The helper is used a few times, that it's setting the DIRTY extent bit is still clear. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 ++++--- fs/btrfs/extent-io-tree.h | 6 ------ fs/btrfs/extent-tree.c | 15 +++++++++------ 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 590b03560265..ec463f8d83ec 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3521,9 +3521,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_dirty(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, + bytenr, bytenr + num_bytes - 1, + EXTENT_DIRTY, NULL, + GFP_NOFS | __GFP_NOFAIL); } spin_lock(&trans->transaction->dirty_bgs_lock); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 293e49377104..0fc54546a63d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -175,12 +175,6 @@ static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, cached_state, GFP_NOFS, NULL); } -static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask); -} - static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c7c72042193..47cfb694cdbf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2507,8 +2507,9 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - 
set_extent_dirty(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL, + GFP_NOFS | __GFP_NOFAIL); return 0; } @@ -4829,16 +4830,18 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL, GFP_NOFS); else set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, EXTENT_NEW, NULL, GFP_NOFS); } else { buf->log_index = -1; - set_extent_dirty(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL, + GFP_NOFS); } /* this returns a buffer locked for blocking */ return buf; From e85de967bca42070aa841b1ae41f7702120a0e97 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:30 +0200 Subject: [PATCH 085/194] btrfs: open code set_extent_bits_nowait The helper only passes GFP_NOWAIT as gfp flags and is used two times. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 6 ------ fs/btrfs/extent_map.c | 5 +++-- fs/btrfs/zoned.c | 4 ++-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0fc54546a63d..ef9d54cdee5c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -156,12 +156,6 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask); -static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT); -} - static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 138afa955370..918ce12ea412 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -364,8 +364,9 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bits_nowait(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits); + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + stripe_size - 1, bits, NULL, + GFP_NOWAIT); } } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index bda301a55cbe..b82a350c4c59 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1611,8 +1611,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, memzero_extent_buffer(eb, 0, eb->len); set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); - set_extent_bits_nowait(&trans->dirty_pages, eb->start, - eb->start + eb->len - 1, EXTENT_DIRTY); + set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, + EXTENT_DIRTY, NULL, GFP_NOWAIT); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) From 0acd32c294cf51e22ecf31a5b9065047568e5d0d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:32 +0200 
Subject: [PATCH 086/194] btrfs: open code set_extent_bits This helper calls set_extent_bit with two more parameters set to default values, but otherwise it's purpose is not clear. Reviewed-by: Christoph Hellwig Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/extent-io-tree.h | 6 ------ fs/btrfs/extent-tree.c | 10 +++++----- fs/btrfs/file-item.c | 10 +++++----- fs/btrfs/relocation.c | 11 ++++++----- fs/btrfs/tests/extent-io-tests.c | 11 ++++++----- 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 78696d331639..3a0fc57d5db9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -795,8 +795,8 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, while (!find_first_extent_bit(&srcdev->alloc_state, start, &found_start, &found_end, CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bits(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED); + ret = set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL, GFP_NOFS); if (ret) break; start = found_end + 1; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index ef9d54cdee5c..5a53a4558366 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -156,12 +156,6 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask); -static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) -{ - return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS); -} - static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47cfb694cdbf..03b2a7c508b9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -73,8 +73,8 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, u64 start, u64 num_bytes) { u64 end = start + num_bytes - 1; - set_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); + set_extent_bit(&fs_info->excluded_extents, start, end, + EXTENT_UPTODATE, NULL, GFP_NOFS); return 0; } @@ -5981,9 +5981,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bits(&device->alloc_state, start, - start + bytes - 1, - CHUNK_TRIMMED); + set_extent_bit(&device->alloc_state, start, + start + bytes - 1, + CHUNK_TRIMMED, NULL, GFP_NOFS); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index e74b9804bcde..1e364a7b74aa 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -94,8 +94,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bits(&inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY); + return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL, GFP_NOFS); } /* @@ -438,9 +438,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bits(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM); + set_extent_bit(&inode->io_tree, file_offset, 
+ file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL, GFP_NOFS); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 38cfbd38a819..1ed8b132fe2a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -174,8 +174,9 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY); + set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + NULL, GFP_NOFS); } node->processed = 1; } @@ -3051,9 +3052,9 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bits(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY); + set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL, GFP_NOFS); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index b9de94a50868..acaaddf7181a 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -503,8 +503,8 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bits(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL, GFP_NOFS); find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); @@ -516,8 +516,8 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bits(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL, GFP_NOFS); /* * Request first hole starting at 12M, we should get 4M-32M @@ -548,7 +548,8 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL, + GFP_NOFS); find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); From 7dde7a8ab32417f28c4ac0111f8a8389fc0501ff Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:34 +0200 Subject: [PATCH 087/194] btrfs: drop NOFAIL from set_extent_bit allocation masks The __GFP_NOFAIL passed to set_extent_bit first appeared in 2010 (commit f0486c68e4bd9a ("Btrfs: Introduce contexts for metadata reservation")), without any explanation why it would be needed. Meanwhile we've updated the semantics of set_extent_bit to handle failed allocations and do unlock, sleep and retry if needed. The use of the NOFAIL flag is also an outlier, we never want any of the set/clear extent bit helpers to fail, they're used for many critical changes like extent locking, besides the extent state bit changes. 
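The failure handling mentioned above follows roughly this shape, shown here as
a simplified sketch with invented names (my_state_cache, tree) rather than the
actual extent-io-tree code: a preallocation attempted under the tree lock may
fail, in which case the lock is dropped, the allocation is retried in a context
that can block and reclaim, and the operation then continues instead of
bubbling up -ENOMEM.

    state = kmem_cache_zalloc(my_state_cache, GFP_NOWAIT);
    if (!state) {
            /* Never block while holding the tree lock. */
            spin_unlock(&tree->lock);
            /* GFP_NOFS may sleep and trigger reclaim before failing. */
            state = kmem_cache_zalloc(my_state_cache, GFP_NOFS);
            spin_lock(&tree->lock);
    }

With handling like this in the helpers themselves, a separate __GFP_NOFAIL at
the call sites is not needed for correctness.
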
Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +-- fs/btrfs/extent-tree.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ec463f8d83ec..202e2aa949c5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3523,8 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL, - GFP_NOFS | __GFP_NOFAIL); + EXTENT_DIRTY, NULL, GFP_NOFS); } spin_lock(&trans->transaction->dirty_bgs_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 03b2a7c508b9..6e319100e3a3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2508,8 +2508,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->space_info->lock); set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL, - GFP_NOFS | __GFP_NOFAIL); + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL, GFP_NOFS); return 0; } From 62bc60473ad202aa414edf304a78ddd7bc10ac49 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:37 +0200 Subject: [PATCH 088/194] btrfs: pass NOWAIT for set/clear extent bits as another bit The only flags we now pass to set_extent_bit/__clear_extent_bit are GFP_NOFS and GFP_NOWAIT (a few functions handling mappings). This requires an extra parameter to be passed everywhere but is almost always the same. Encode the GFP_NOWAIT as an artificial extent bit and extract the real bits and gfp mask in the lowest level helpers. Now the passed gfp mask is not actually used and can be removed. Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 12 ++++++++++++ fs/btrfs/extent-io-tree.h | 9 +++++++++ fs/btrfs/extent_map.c | 7 ++++--- fs/btrfs/zoned.c | 2 +- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 29a225836e28..83e40c02f62e 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -532,6 +532,16 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, return next; } +/* + * Detect if extent bits request NOWAIT semantics and set the gfp mask accordingly, + * unset the EXTENT_NOWAIT bit. + */ +static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) +{ + *mask = (*bits & EXTENT_NOWAIT ? GFP_NOWAIT : GFP_NOFS); + *bits &= EXTENT_NOWAIT - 1; +} + /* * Clear some bits on a range in the tree. This may require splitting or * inserting elements in the tree, so the gfp mask is used to indicate which @@ -557,6 +567,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int wake; int delete = (bits & EXTENT_CLEAR_ALL_BITS); + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits); @@ -979,6 +990,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_end; u32 exclusive_bits = (bits & EXTENT_LOCKED); + set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 5a53a4558366..d7f5afeb5ce7 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -43,6 +43,15 @@ enum { * want the extent states to go away. 
*/ ENUM_BIT(EXTENT_CLEAR_ALL_BITS), + + /* + * This must be last. + * + * Bit not representing a state but a request for NOWAIT semantics, + * e.g. when allocating memory, and must be masked out from the other + * bits. + */ + ENUM_BIT(EXTENT_NOWAIT) }; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 918ce12ea412..4c8c87524d62 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -365,8 +365,8 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) struct btrfs_device *device = stripe->dev; set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits, NULL, - GFP_NOWAIT); + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, NULL, GFP_NOWAIT); } } @@ -381,7 +381,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, bits, + stripe->physical + stripe_size - 1, + bits | EXTENT_NOWAIT, NULL, GFP_NOWAIT, NULL); } } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b82a350c4c59..fb90e2b20614 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1612,7 +1612,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY, NULL, GFP_NOWAIT); + EXTENT_DIRTY | EXTENT_NOWAIT, NULL, GFP_NOWAIT); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) From 1d12680044301760485b7f056812cd0d33dca2c6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 May 2023 01:04:39 +0200 Subject: [PATCH 089/194] btrfs: drop gfp from parameter extent state helpers Now that all extent state bit helpers effectively take the GFP_NOFS mask (and GFP_NOWAIT is encoded in the bits) we can remove the parameter. This reduces stack consumption in many functions and simplifies a lot of code. 
Net effect on module on a release build: text data bss dec hex filename 1250432 20985 16088 1287505 13a551 pre/btrfs.ko 1247074 20985 16088 1284147 139833 post/btrfs.ko DELTA: -3358 Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/defrag.c | 3 +-- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/extent-io-tree.c | 25 +++++++++++++------------ fs/btrfs/extent-io-tree.h | 12 +++++------- fs/btrfs/extent-tree.c | 14 ++++++-------- fs/btrfs/extent_io.c | 3 +-- fs/btrfs/extent_map.c | 4 ++-- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/inode.c | 7 +++---- fs/btrfs/relocation.c | 5 ++--- fs/btrfs/tests/extent-io-tests.c | 13 ++++++------- fs/btrfs/zoned.c | 2 +- 13 files changed, 44 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 202e2aa949c5..618ba7670e66 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3523,7 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL, GFP_NOFS); + EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 4e7a1e0a0441..f2ff4cbe8656 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1041,8 +1041,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached_state); set_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DEFRAG, - cached_state, GFP_NOFS); + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); /* Update the page status */ for (i = start_index - first_index; i <= last_index - first_index; i++) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 3a0fc57d5db9..dc3f30c79320 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -796,7 +796,7 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, &found_start, &found_end, CHUNK_ALLOCATED, &cached_state)) { ret = set_extent_bit(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED, NULL, GFP_NOFS); + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 83e40c02f62e..a2315a4b8b75 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -556,7 +556,7 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) */ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, - gfp_t mask, struct extent_changeset *changeset) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -566,6 +566,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear = 0; int wake; int delete = (bits & EXTENT_CLEAR_ALL_BITS); + gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); @@ -964,7 +965,8 @@ out: /* * Set some bits on a range in the tree. This may require allocations or - * sleeping, so the gfp mask is used to indicate what is allowed. + * sleeping. By default all allocations use GFP_NOFS, use EXTENT_NOWAIT for + * GFP_NOWAIT. * * If any of the exclusive bits are set, this will fail with -EEXIST if some * part of the range already has the desired bits set. 
The extent_state of the @@ -979,7 +981,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, struct extent_state **failed_state, struct extent_state **cached_state, - struct extent_changeset *changeset, gfp_t mask) + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -989,6 +991,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; u32 exclusive_bits = (bits & EXTENT_LOCKED); + gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); btrfs_debug_check_extent_io_range(tree, start, end); @@ -1200,10 +1203,10 @@ out: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask) + u32 bits, struct extent_state **cached_state) { return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL, mask); + cached_state, NULL); } /* @@ -1699,8 +1702,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, - changeset, GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1712,8 +1714,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __clear_extent_bit(tree, start, end, bits, NULL, GFP_NOFS, - changeset); + return __clear_extent_bit(tree, start, end, bits, NULL, changeset); } int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, @@ -1723,7 +1724,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, cached, NULL, GFP_NOFS); + NULL, cached, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1745,7 +1746,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - &failed_state, cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, @@ -1755,7 +1756,7 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, &failed_state, - cached_state, NULL, GFP_NOFS); + cached_state, NULL); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index d7f5afeb5ce7..fbd3b275ab1c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -136,22 +136,20 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, gfp_t mask, + u32 bits, struct extent_state **cached, struct extent_changeset *changeset); static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, bits, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, bits, cached, NULL); } static 
inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_NOFS, NULL); + return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, @@ -163,13 +161,13 @@ static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, gfp_t mask); + u32 bits, struct extent_state **cached_state); static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, GFP_NOFS, NULL); + cached_state, NULL); } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6e319100e3a3..71aaf28fafc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,7 +74,7 @@ int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, { u64 end = start + num_bytes - 1; set_extent_bit(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE, NULL, GFP_NOFS); + EXTENT_UPTODATE, NULL); return 0; } @@ -2508,7 +2508,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->space_info->lock); set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL, GFP_NOFS); + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -4831,16 +4831,15 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (buf->log_index == 0) set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_DIRTY, NULL, GFP_NOFS); + EXTENT_DIRTY, NULL); else set_extent_bit(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, - EXTENT_NEW, NULL, GFP_NOFS); + EXTENT_NEW, NULL); } else { buf->log_index = -1; set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL, - GFP_NOFS); + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5981,8 +5980,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) &bytes); if (!ret) set_extent_bit(&device->alloc_state, start, - start + bytes - 1, - CHUNK_TRIMMED, NULL, GFP_NOFS); + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b38fbdd70a6c..3c7773995d63 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2456,8 +2456,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, * The delalloc new bit will be cleared by ordered extent * completion. */ - ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, - mask, NULL); + ret = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); /* if clear_extent_bit failed for enomem reasons, * we can't allow the release to continue. 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4c8c87524d62..4d86d4739217 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -366,7 +366,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) set_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, NULL, GFP_NOWAIT); + bits | EXTENT_NOWAIT, NULL); } } @@ -383,7 +383,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) __clear_extent_bit(&device->alloc_state, stripe->physical, stripe->physical + stripe_size - 1, bits | EXTENT_NOWAIT, - NULL, GFP_NOWAIT, NULL); + NULL, NULL); } } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1e364a7b74aa..594b69d94241 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -95,7 +95,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY, NULL, GFP_NOFS); + EXTENT_DIRTY, NULL); } /* @@ -440,7 +440,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) set_extent_bit(&inode->io_tree, file_offset, file_offset + sectorsize - 1, - EXTENT_NODATASUM, NULL, GFP_NOFS); + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8dd979b2f084..40b5f2df9ecc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2888,8 +2888,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state, - GFP_NOFS); + EXTENT_DELALLOC_NEW, cached_state); next: search_start = extent_map_end(em); free_extent_map(em); @@ -2923,7 +2922,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, } return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state, GFP_NOFS); + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -5000,7 +4999,7 @@ again: if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL, GFP_NOFS); + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1ed8b132fe2a..0bda67ad349e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -175,8 +175,7 @@ static void mark_block_processed(struct reloc_control *rc, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; set_extent_bit(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - NULL, GFP_NOFS); + node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); } node->processed = 1; } @@ -3054,7 +3053,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, set_extent_bit(&BTRFS_I(inode)->io_tree, boundary_start, boundary_end, - EXTENT_BOUNDARY, NULL, GFP_NOFS); + EXTENT_BOUNDARY, NULL); } unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, &cached_state); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index acaaddf7181a..f6bc6d738555 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -159,7 +159,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - 
set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); + set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -190,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); + set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -245,7 +245,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL, GFP_NOFS); + set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, @@ -504,7 +504,7 @@ static int test_find_first_clear_extent_bit(void) * 4M-32M */ set_extent_bit(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL, GFP_NOFS); + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); @@ -517,7 +517,7 @@ static int test_find_first_clear_extent_bit(void) /* Now add 32M-64M so that we have a hole between 4M-32M */ set_extent_bit(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL, GFP_NOFS); + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M @@ -548,8 +548,7 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL, - GFP_NOFS); + set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fb90e2b20614..98d6b8cc3874 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1612,7 +1612,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY | EXTENT_NOWAIT, NULL, GFP_NOWAIT); + EXTENT_DIRTY | EXTENT_NOWAIT, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) From 58e814fcacc1f652d2b794c82b7c9d96ee3c3bab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 25 May 2023 13:33:08 -1000 Subject: [PATCH 090/194] btrfs: use alloc_ordered_workqueue() to create ordered workqueues BACKGROUND ========== When multiple work items are queued to a workqueue, their execution order doesn't match the queueing order. They may get executed in any order and simultaneously. When fully serialized execution - one by one in the queueing order - is needed, an ordered workqueue should be used which can be created with alloc_ordered_workqueue(). However, alloc_ordered_workqueue() was a later addition. Before it, an ordered workqueue could be obtained by creating an UNBOUND workqueue with @max_active==1. This originally was an implementation side-effect which was broken by 4c16bd327c74 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered"). 
Because there were users that depended on the ordered execution, 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") made the workqueue allocation path implicitly promote UNBOUND workqueues w/ @max_active==1 to ordered workqueues.

While this has worked okay, overloading the UNBOUND allocation interface this way creates other issues. It's difficult to tell whether a given workqueue actually needs to be ordered, and users that legitimately want a min concurrency level wq unexpectedly get an ordered one instead. With planned UNBOUND workqueue updates to improve execution locality and the growing prevalence of chiplet designs which can benefit from such improvements, this isn't a state we want to be in forever.

This patch series audits all call sites that create an UNBOUND workqueue w/ @max_active==1 and converts them to alloc_ordered_workqueue() as necessary.

BTRFS
=====

* fs_info->scrub_workers initialized in scrub_workers_get() was setting @max_active to 1 when @is_dev_replace is set, and it seems that the workqueue actually needs to be ordered if @is_dev_replace. Update the code so that alloc_ordered_workqueue() is used if @is_dev_replace.

* fs_info->discard_ctl.discard_workers initialized in btrfs_init_workqueues() was directly using alloc_workqueue() w/ @max_active==1. Converted to alloc_ordered_workqueue().

* fs_info->fixup_workers and fs_info->qgroup_rescan_workers initialized in btrfs_queue_work() use btrfs's workqueue wrapper, btrfs_workqueue, which is allocated with btrfs_alloc_workqueue(). btrfs_workqueue implements automatic @max_active adjustment which is disabled when the specified max limit is below a certain threshold, so calling btrfs_alloc_workqueue() with @limit_active==1 yields an ordered workqueue whose @max_active won't be changed as the auto-tuning is disabled. This is rather brittle in that nothing clearly indicates that the two workqueues should be ordered or that btrfs_alloc_workqueue() must disable auto-tuning when @limit_active==1. This patch factors out the common btrfs_workqueue init code into btrfs_init_workqueue() and adds an explicit btrfs_alloc_ordered_workqueue(). The two workqueues are converted to use the new ordered allocation interface.
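For illustration, the general shape of such a conversion looks like the following simplified sketch (not code from this patch; the workqueue name is made up):

    #include <linux/workqueue.h>

    static struct workqueue_struct *make_example_wq(void)
    {
            /*
             * Before: UNBOUND workqueue w/ @max_active==1, implicitly promoted
             * to an ordered workqueue by the workqueue core:
             *
             *     return alloc_workqueue("btrfs-example", WQ_UNBOUND | WQ_FREEZABLE, 1);
             *
             * After: explicitly ordered, so the intent is visible at the call site.
             */
            return alloc_ordered_workqueue("btrfs-example", WQ_FREEZABLE);
    }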
Signed-off-by: Tejun Heo Signed-off-by: David Sterba --- fs/btrfs/async-thread.c | 44 ++++++++++++++++++++++++++++++++++++----- fs/btrfs/async-thread.h | 3 +++ fs/btrfs/disk-io.c | 8 +++++--- fs/btrfs/scrub.c | 6 ++++-- 4 files changed, 51 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index aac240430efe..ce083e99ef68 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -71,6 +71,16 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->pending) > wq->thresh * 2; } +static void btrfs_init_workqueue(struct btrfs_workqueue *wq, + struct btrfs_fs_info *fs_info) +{ + wq->fs_info = fs_info; + atomic_set(&wq->pending, 0); + INIT_LIST_HEAD(&wq->ordered_list); + spin_lock_init(&wq->list_lock); + spin_lock_init(&wq->thres_lock); +} + struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -80,9 +90,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, if (!ret) return NULL; - ret->fs_info = fs_info; + btrfs_init_workqueue(ret, fs_info); + ret->limit_active = limit_active; - atomic_set(&ret->pending, 0); if (thresh == 0) thresh = DFT_THRESHOLD; /* For low threshold, disabling threshold is a better choice */ @@ -106,9 +116,33 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, return NULL; } - INIT_LIST_HEAD(&ret->ordered_list); - spin_lock_init(&ret->list_lock); - spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name); + return ret; +} + +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags) +{ + struct btrfs_workqueue *ret; + + ret = kzalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return NULL; + + btrfs_init_workqueue(ret, fs_info); + + /* Ordered workqueues don't allow @max_active adjustments. 
*/ + ret->limit_active = 1; + ret->current_active = 1; + ret->thresh = NO_THRESHOLD; + + ret->normal_wq = alloc_ordered_workqueue("btrfs-%s", flags, name); + if (!ret->normal_wq) { + kfree(ret); + return NULL; + } + trace_btrfs_workqueue_alloc(ret, name); return ret; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 6e2596ddae10..30f66c5e2e6e 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -31,6 +31,9 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, unsigned int flags, int limit_active, int thresh); +struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( + struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 06dab9d018c2..1ad7037c6631 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1939,6 +1939,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) { u32 max_active = fs_info->thread_pool_size; unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE; fs_info->workers = btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); @@ -1955,7 +1956,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); fs_info->fixup_workers = - btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags); fs_info->endio_workers = alloc_workqueue("btrfs-endio", flags, max_active); @@ -1974,9 +1975,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, max_active, 0); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", + ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); + alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d7a14458c4a7..316c5a0e3a44 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2740,8 +2740,10 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = alloc_workqueue("btrfs-scrub", flags, - is_dev_replace ? 1 : max_active); + if (is_dev_replace) + scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); + else + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) goto fail_scrub_workers; From 75258f20fb70ce9e1fa4fa0cb27c0bdee05b2701 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 26 May 2023 20:30:53 +0800 Subject: [PATCH 091/194] btrfs: subpage: dump extra subpage bitmaps for debug There is a bug report that assert_eb_page_uptodate() gets triggered for free space tree metadata. Without proper dump for the subpage bitmaps it's much harder to debug. Thus this patch would dump all the subpage bitmaps (split them into their own bitmaps) for a easier debugging. 
The output would look like this: (Dumped after a tree block got read from disk) page:000000006e34bf49 refcount:4 mapcount:0 mapping:0000000067661ac4 index:0x1d1 pfn:0x110e9 memcg:ffff0000d7d62000 aops:btree_aops [btrfs] ino:1 flags: 0x8000000000002002(referenced|private|zone=2) page_type: 0xffffffff() raw: 8000000000002002 0000000000000000 dead000000000122 ffff00000188bed0 raw: 00000000000001d1 ffff0000c7992700 00000004ffffffff ffff0000d7d62000 page dumped because: btrfs subpage dump BTRFS warning (device dm-1): start=30490624 len=16384 page=30474240 bitmaps: uptodate=4-7 error= dirty= writeback= ordered= checked= Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/subpage.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 2 ++ 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3c7773995d63..87ee376b3cc8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4262,8 +4262,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, return; if (fs_info->nodesize < PAGE_SIZE) { - WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, - eb->start, eb->len)); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len))) + btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); } else { WARN_ON(!PageUptodate(page)); } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 045117ca0ddc..74bc43040531 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -745,3 +745,45 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, /* Have writers, use proper subpage helper to end it */ btrfs_page_end_writer_lock(fs_info, page, start, len); } + +#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ + bitmap_cut(dst, subpage->bitmaps, 0, \ + subpage_info->name##_offset, subpage_info->bitmap_nr_bits) + +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; + struct btrfs_subpage *subpage; + unsigned long uptodate_bitmap; + unsigned long error_bitmap; + unsigned long dirty_bitmap; + unsigned long writeback_bitmap; + unsigned long ordered_bitmap; + unsigned long checked_bitmap; + unsigned long flags; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage_info); + subpage = (struct btrfs_subpage *)page->private; + + spin_lock_irqsave(&subpage->lock, flags); + GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, error, &error_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); + spin_unlock_irqrestore(&subpage->lock, flags); + + dump_page(page, "btrfs subpage dump"); + btrfs_warn(fs_info, +"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", + start, len, page_offset(page), + subpage_info->bitmap_nr_bits, &uptodate_bitmap, + subpage_info->bitmap_nr_bits, &error_bitmap, + subpage_info->bitmap_nr_bits, &dirty_bitmap, + subpage_info->bitmap_nr_bits, &writeback_bitmap, + subpage_info->bitmap_nr_bits, &ordered_bitmap, + 
subpage_info->bitmap_nr_bits, &checked_bitmap); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0e80ad336904..5905caea8409 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -154,5 +154,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif From b831306b3b7d9202b300f7f6ec32f5be2b4926a3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 3 May 2023 21:08:16 +0200 Subject: [PATCH 092/194] btrfs: print assertion failure report and stack trace from the same line Assertions reports are split into two parts, the exact file and location of the condition and then the stack trace printed from btrfs_assertfail(). This means all the stack traces report the same line and this is what's typically reported by various tools, making it harder to distinguish the reports. [403.2467] assertion failed: refcount_read(&block_group->refs) == 1, in fs/btrfs/block-group.c:4259 [403.2479] ------------[ cut here ]------------ [403.2484] kernel BUG at fs/btrfs/messages.c:259! [403.2488] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [403.2493] CPU: 2 PID: 23202 Comm: umount Not tainted 6.2.0-rc4-default+ #67 [403.2499] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552-rebuilt.opensuse.org 04/01/2014 [403.2509] RIP: 0010:btrfs_assertfail+0x19/0x1b [btrfs] ... [403.2595] Call Trace: [403.2598] [403.2601] btrfs_free_block_groups.cold+0x52/0xae [btrfs] [403.2608] close_ctree+0x6c2/0x761 [btrfs] [403.2613] ? __wait_for_common+0x2b8/0x360 [403.2618] ? btrfs_cleanup_one_transaction.cold+0x7a/0x7a [btrfs] [403.2626] ? mark_held_locks+0x6b/0x90 [403.2630] ? lockdep_hardirqs_on_prepare+0x13d/0x200 [403.2636] ? __call_rcu_common.constprop.0+0x1ea/0x3d0 [403.2642] ? trace_hardirqs_on+0x2d/0x110 [403.2646] ? __call_rcu_common.constprop.0+0x1ea/0x3d0 [403.2652] generic_shutdown_super+0xb0/0x1c0 [403.2657] kill_anon_super+0x1e/0x40 [403.2662] btrfs_kill_super+0x25/0x30 [btrfs] [403.2668] deactivate_locked_super+0x4c/0xc0 By making btrfs_assertfail a macro we'll get the same line number for the BUG output: [63.5736] assertion failed: 0, in fs/btrfs/super.c:1572 [63.5758] ------------[ cut here ]------------ [63.5782] kernel BUG at fs/btrfs/super.c:1572! 
[63.5807] invalid opcode: 0000 [#2] PREEMPT SMP KASAN [63.5831] CPU: 0 PID: 859 Comm: mount Tainted: G D 6.3.0-rc7-default+ #2062 [63.5868] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 [63.5905] RIP: 0010:btrfs_mount+0x24/0x30 [btrfs] [63.5964] RSP: 0018:ffff88800e69fcd8 EFLAGS: 00010246 [63.5982] RAX: 000000000000002d RBX: ffff888008fc1400 RCX: 0000000000000000 [63.6004] RDX: 0000000000000000 RSI: ffffffffb90fd868 RDI: ffffffffbcc3ff20 [63.6026] RBP: ffffffffc081b200 R08: 0000000000000001 R09: ffff88800e69fa27 [63.6046] R10: ffffed1001cd3f44 R11: 0000000000000001 R12: ffff888005a3c370 [63.6062] R13: ffffffffc058e830 R14: 0000000000000000 R15: 00000000ffffffff [63.6081] FS: 00007f7b3561f800(0000) GS:ffff88806c600000(0000) knlGS:0000000000000000 [63.6105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [63.6120] CR2: 00007fff83726e10 CR3: 0000000002a9e000 CR4: 00000000000006b0 [63.6137] Call Trace: [63.6143] [63.6148] legacy_get_tree+0x80/0xd0 [63.6158] vfs_get_tree+0x43/0x120 [63.6166] do_new_mount+0x1f3/0x3d0 [63.6176] ? do_add_mount+0x140/0x140 [63.6187] ? cap_capable+0xa4/0xe0 [63.6197] path_mount+0x223/0xc10 This comes at a cost of bloating the final btrfs.ko module due all the inlining, as long as assertions are compiled in. This is a must for debugging builds but this is often enabled on release builds too. Release build: text data bss dec hex filename 1251676 20317 16088 1288081 13a791 pre/btrfs.ko 1260612 29473 16088 1306173 13ee3d post/btrfs.ko DELTA: +8936 CC: Josh Poimboeuf Signed-off-by: David Sterba --- fs/btrfs/messages.c | 8 -------- fs/btrfs/messages.h | 8 +++++++- tools/objtool/check.c | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 310a05cf95ef..23fc11af498a 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -252,14 +252,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -#ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) { btrfs_err(fs_info, diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 99143bbf78a5..deedc1a168e2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,6 +4,8 @@ #define BTRFS_MESSAGES_H #include +#include +#include struct btrfs_fs_info; @@ -167,7 +169,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); + +#define btrfs_assertfail(expr, file, line) ({ \ + pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ + BUG(); \ +}) #define ASSERT(expr) \ (likely(expr) ? 
(void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0fcf99c91400..c19b1103e4ec 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -204,7 +204,6 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "__ubsan_handle_builtin_unreachable", "arch_call_rest_init", "arch_cpu_idle_dead", - "btrfs_assertfail", "cpu_bringup_and_idle", "cpu_startup_entry", "do_exit", From 5a96341927b097601e1fefc292f0a6e5273e7554 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 30 May 2023 09:45:27 +0800 Subject: [PATCH 093/194] btrfs: subpage: make alloc_extent_buffer() handle previously uptodate range efficiently Currently alloc_extent_buffer() would make the extent buffer uptodate if the corresponding pages are also uptodate. But this check is only checking PageUptodate, which is fine for regular cases, but not for subpage cases, as we can have multiple extent buffers in the same page. So here we go btrfs_page_test_uptodate() instead. The old code doesn't cause any problem, but is not efficient, as it would cause extra metadata read even if the range is already uptodate. Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 87ee376b3cc8..f578813be749 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3712,7 +3712,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; - if (!PageUptodate(p)) + if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) uptodate = 0; /* From 31dd8c81ddfa142dab53a0837eed88e8febe7b1e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 30 May 2023 09:45:28 +0800 Subject: [PATCH 094/194] btrfs: use the same uptodate variable for end_bio_extent_readpage() In function end_bio_extent_readpage() we call endio_readpage_release_extent() to unlock the extent io tree. However we pass PageUptodate(page) as @uptodate parameter for it, while for previous end_page_read() call, we use a dedicated @uptodate local variable. This is not a big deal, as even for subpage cases, either the bio only covers part of the page, then the @uptodate is always false, and the subpage ranges can still be merged. But for the sake of consistency, always use @uptodate variable when possible. Reviewed-by: Christoph Hellwig Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f578813be749..d5b3b15f7ab2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -745,7 +745,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) /* Update page status and unlock. 
*/ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); + start, end, uptodate); ASSERT(bio_offset + len > bio_offset); bio_offset += len; From 315dd5cc75ddccb6ebd863366b7a7d0488057637 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:16:57 +0100 Subject: [PATCH 095/194] btrfs: reorder some members of struct btrfs_delayed_ref_head Currently struct delayed_ref_head has its 'bytenr' and 'href_node' members in different cache lines (even on a release, non-debug, kernel). This is not optimal because when iterating the red black tree of delayed ref heads for inserting a new delayed ref head (htree_insert()) we have to pull in 2 cache lines of delayed ref heads we find in a patch, one for the tree node (struct rb_node) and another one for the 'bytenr' field. The same applies when searching for an existing delayed ref head (find_ref_head()). On a release (non-debug) kernel, the structure also has two 4 bytes holes, which makes it 8 bytes longer than necessary. Its current layout is the following: struct btrfs_delayed_ref_head { u64 bytenr; /* 0 8 */ u64 num_bytes; /* 8 8 */ refcount_t refs; /* 16 4 */ /* XXX 4 bytes hole, try to pack */ struct mutex mutex; /* 24 32 */ spinlock_t lock; /* 56 4 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct rb_root_cached ref_tree; /* 64 16 */ struct list_head ref_add_list; /* 80 16 */ struct rb_node href_node __attribute__((__aligned__(8))); /* 96 24 */ struct btrfs_delayed_extent_op * extent_op; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ int total_ref_mod; /* 128 4 */ int ref_mod; /* 132 4 */ unsigned int must_insert_reserved:1; /* 136: 0 4 */ unsigned int is_data:1; /* 136: 1 4 */ unsigned int is_system:1; /* 136: 2 4 */ unsigned int processing:1; /* 136: 3 4 */ /* size: 144, cachelines: 3, members: 15 */ /* sum members: 128, holes: 2, sum holes: 8 */ /* sum bitfield members: 4 bits (0 bytes) */ /* padding: 4 */ /* bit_padding: 28 bits */ /* forced alignments: 1 */ /* last cacheline: 16 bytes */ } __attribute__((__aligned__(8))); This change reorders the 'href_node' and 'refs' members so that we have the 'href_node' in the same cache line as the 'bytenr' field, while also eliminating the two holes and reducing the structure size from 144 bytes down to 136 bytes, so we can now have 30 ref heads per 4K page (on x86_64) instead of 28. The new structure layout after this change is now: struct btrfs_delayed_ref_head { u64 bytenr; /* 0 8 */ u64 num_bytes; /* 8 8 */ struct rb_node href_node __attribute__((__aligned__(8))); /* 16 24 */ struct mutex mutex; /* 40 32 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ refcount_t refs; /* 72 4 */ spinlock_t lock; /* 76 4 */ struct rb_root_cached ref_tree; /* 80 16 */ struct list_head ref_add_list; /* 96 16 */ struct btrfs_delayed_extent_op * extent_op; /* 112 8 */ int total_ref_mod; /* 120 4 */ int ref_mod; /* 124 4 */ /* --- cacheline 2 boundary (128 bytes) --- */ unsigned int must_insert_reserved:1; /* 128: 0 4 */ unsigned int is_data:1; /* 128: 1 4 */ unsigned int is_system:1; /* 128: 2 4 */ unsigned int processing:1; /* 128: 3 4 */ /* size: 136, cachelines: 3, members: 15 */ /* padding: 4 */ /* bit_padding: 28 bits */ /* forced alignments: 1 */ /* last cacheline: 8 bytes */ } __attribute__((__aligned__(8))); Running the following fs_mark test shows some significant improvement. 
$ cat test.sh #!/bin/bash # 15G null block device DEV=/dev/nullb0 MNT=/mnt/nullb0 FILES=100000 THREADS=$(nproc --all) FILE_SIZE=0 echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $DEV mount -o ssd $DEV $MNT OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k" for ((i = 1; i <= $THREADS; i++)); do OPTS="$OPTS -d $MNT/d$i" done fs_mark $OPTS umount $MNT Before this change: FSUse% Count Size Files/sec App Overhead 10 1200000 0 112631.3 11928055 16 2400000 0 189943.8 12140777 23 3600000 0 150719.2 13178480 50 4800000 0 99137.3 12504293 53 6000000 0 111733.9 12670836 Total files/sec: 664165.5 After this change: FSUse% Count Size Files/sec App Overhead 10 1200000 0 148589.5 11565889 16 2400000 0 227743.8 11561596 23 3600000 0 191590.5 12550755 30 4800000 0 179812.3 12629610 53 6000000 0 92471.4 12352383 Total files/sec: 840207.5 Measuring the execution times of htree_insert(), in nanoseconds, during those fs_mark runs: Before this change: Range: 0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231 Percentiles: 90th: 980.000; 95th: 1208.000; 99th: 2090.000 0.000 - 6.384: 257 | 6.384 - 26.259: 977 | 26.259 - 99.635: 4963 | 99.635 - 370.526: 136800 ############# 370.526 - 1370.603: 566110 ##################################################### 1370.603 - 5062.704: 24945 ## 5062.704 - 18693.248: 944 | 18693.248 - 69014.670: 211 | 69014.670 - 254791.959: 30 | 254791.959 - 940647.000: 4 | After this change: Range: 0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422 Percentiles: 90th: 918.000; 95th: 1113.000; 99th: 1987.000 0.000 - 5.585: 163 | 5.585 - 20.678: 452 | 20.678 - 70.369: 1806 | 70.369 - 233.965: 26268 #### 233.965 - 772.564: 333519 ##################################################### 772.564 - 2545.771: 91820 ############### 2545.771 - 8383.615: 2238 | 8383.615 - 27603.280: 170 | 27603.280 - 90879.297: 68 | 90879.297 - 299200.000: 12 | Mean, percentiles, maximum times are all better, as well as a lower standard deviation. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b54261fe509b..9b28e800a604 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -70,20 +70,26 @@ struct btrfs_delayed_extent_op { struct btrfs_delayed_ref_head { u64 bytenr; u64 num_bytes; - refcount_t refs; + /* + * For insertion into struct btrfs_delayed_ref_root::href_root. + * Keep it in the same cache line as 'bytenr' for more efficient + * searches in the rbtree. + */ + struct rb_node href_node; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. */ struct mutex mutex; + refcount_t refs; + + /* Protects 'ref_tree' and 'ref_add_list'. */ spinlock_t lock; struct rb_root_cached ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; - struct rb_node href_node; - struct btrfs_delayed_extent_op *extent_op; /* From 53499d5f6b68766317a62b1513e9386c2584e97c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:16:58 +0100 Subject: [PATCH 096/194] btrfs: remove unused is_head field from struct btrfs_delayed_ref_node The 'is_head' field of struct btrfs_delayed_ref_node is no longer after commit d278850eff30 ("btrfs: remove delayed_ref_node from ref_head"), so remove it. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 1 - fs/btrfs/delayed-ref.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0b32432d7d56..be1d18ec5cef 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -853,7 +853,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, ref->num_bytes = num_bytes; ref->ref_mod = 1; ref->action = action; - ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; ref->type = ref_type; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 9b28e800a604..77b3e735772d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -48,8 +48,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - /* is this node still in the rbtree? */ - unsigned int is_head:1; unsigned int in_tree:1; }; From 4d34ad34d7cc5390ec03b25f2a7f2fd5041cb7d8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:16:59 +0100 Subject: [PATCH 097/194] btrfs: remove pointless in_tree field from struct btrfs_delayed_ref_node The 'in_tree' field is really not needed in struct btrfs_delayed_ref_node, as we can check whether a reference is in the tree or not simply by checking its red black tree node member with RB_EMPTY_NODE(), as when we remove it from the tree we always call RB_CLEAR_NODE(). So remove that field and use RB_EMPTY_NODE(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 2 -- fs/btrfs/delayed-ref.h | 3 +-- fs/btrfs/disk-io.c | 1 - fs/btrfs/extent-tree.c | 1 - 4 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index be1d18ec5cef..d6dce5792c0f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -407,7 +407,6 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); - ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); } @@ -853,7 +852,6 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info, ref->num_bytes = num_bytes; ref->ref_mod = 1; ref->action = action; - ref->in_tree = 1; ref->seq = seq; ref->type = ref_type; RB_CLEAR_NODE(&ref->ref_node); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 77b3e735772d..9b103bf27185 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -48,7 +48,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int in_tree:1; }; struct btrfs_delayed_extent_op { @@ -341,7 +340,7 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { - WARN_ON(ref->in_tree); + WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: case BTRFS_SHARED_BLOCK_REF_KEY: diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ad7037c6631..55cbbe7b0dba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4602,7 +4602,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((n = rb_first_cached(&head->ref_tree)) != NULL) { ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node); - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &head->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) diff --git 
a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71aaf28fafc9..81a673506077 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1913,7 +1913,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, return -EAGAIN; } - ref->in_tree = 0; rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) From 293f8197a4902227e39e7288aa06f7f2b0e29cc8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:00 +0100 Subject: [PATCH 098/194] btrfs: use a bool to track qgroup record insertion when adding ref head We are using an integer as a boolean to track the qgroup record insertion status when adding a delayed reference head. Since all we need is a boolean, switch the type from int to bool to make it more obvious. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d6dce5792c0f..0bcec428766c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -762,11 +762,11 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret) + int action, bool *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; - int qrecord_inserted = 0; + bool qrecord_inserted = false; delayed_refs = &trans->transaction->delayed_refs; @@ -776,7 +776,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs, qrecord)) kfree(qrecord); else - qrecord_inserted = 1; + qrecord_inserted = true; } trace_add_delayed_ref_head(trans->fs_info, head_ref, action); @@ -872,7 +872,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; bool is_system; int action = generic_ref->action; int level = generic_ref->tree_ref.level; @@ -965,7 +965,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - int qrecord_inserted; + bool qrecord_inserted; int action = generic_ref->action; int ret; u64 bytenr = generic_ref->bytenr; From f38462c4476cc57ef0e739bc8b8bc8f0a5754b3b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:01 +0100 Subject: [PATCH 099/194] btrfs: make insert_delayed_ref() return a bool instead of an int insert_delayed_ref() can only return 0 or 1, to indicate if the given delayed reference was added to the head reference or if it was merged into an existing delayed ref, respectively. So just make it return a boolean instead. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 0bcec428766c..c3da7c3185de 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -555,16 +555,17 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, /* * Helper to insert the ref_node to the tail or merge with tail. * - * Return 0 for insert. - * Return >0 for merge. + * Return false if the ref was inserted. + * Return true if the ref was merged into an existing one (and therefore can be + * freed by the caller). */ -static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; - int ret = 0; + bool ret = false; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); @@ -572,7 +573,7 @@ static int insert_delayed_ref(struct btrfs_delayed_ref_root *root, goto inserted; /* Now we are sure we can merge */ - ret = 1; + ret = true; if (exist->action == ref->action) { mod = ref->ref_mod; } else { @@ -874,9 +875,9 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *record = NULL; bool qrecord_inserted; bool is_system; + bool merged; int action = generic_ref->action; int level = generic_ref->tree_ref.level; - int ret; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -932,7 +933,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -944,7 +945,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, trace_add_delayed_tree_ref(fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); if (qrecord_inserted) @@ -967,7 +968,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *record = NULL; bool qrecord_inserted; int action = generic_ref->action; - int ret; + bool merged; u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; @@ -1024,7 +1025,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - ret = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1036,7 +1037,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref, action == BTRFS_ADD_DELAYED_EXTENT ? 
BTRFS_ADD_DELAYED_REF : action); - if (ret > 0) + if (merged) kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); From 798f4d95db0d71a2803c61b5e1fd5b739569c3da Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:02 +0100 Subject: [PATCH 100/194] btrfs: get rid of label and goto at insert_delayed_ref() At insert_delayed_ref() there's no point of having a label and goto in the case we were able to insert the delayed ref head. We can just add the code under label to the if statement's body and return immediately, and also there is no need to track the return value in a variable, we can just return a literal true or false value directly. So do those changes. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c3da7c3185de..e4579e66a57a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -565,15 +565,18 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, { struct btrfs_delayed_ref_node *exist; int mod; - bool ret = false; spin_lock(&href->lock); exist = tree_insert(&href->ref_tree, ref); - if (!exist) - goto inserted; + if (!exist) { + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&ref->add_list, &href->ref_add_list); + atomic_inc(&root->num_entries); + spin_unlock(&href->lock); + return false; + } /* Now we are sure we can merge */ - ret = true; if (exist->action == ref->action) { mod = ref->ref_mod; } else { @@ -600,13 +603,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, if (exist->ref_mod == 0) drop_delayed_ref(root, href, exist); spin_unlock(&href->lock); - return ret; -inserted: - if (ref->action == BTRFS_ADD_DELAYED_REF) - list_add_tail(&ref->add_list, &href->ref_add_list); - atomic_inc(&root->num_entries); - spin_unlock(&href->lock); - return ret; + return true; } /* From 1e6b71c34bbbbde920357265abd161cb727487b8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:03 +0100 Subject: [PATCH 101/194] btrfs: assert correct lock is held at btrfs_select_ref_head() The function btrfs_select_ref_head() iterates over the red black tree of delayed reference heads, which is protected by the spinlock in the delayed refs root. The function doesn't take the lock, it's taken by its single caller, btrfs_obtain_ref_head(), because it needs to call that function and btrfs_delayed_ref_lock() in the same critical section (delimited by that spinlock). So assert at btrfs_select_ref_head() that we are holding the expected lock. 
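For context, the calling pattern is roughly the following (a simplified sketch of the critical section in btrfs_obtain_ref_head(), not a verbatim copy):

    spin_lock(&delayed_refs->lock);
    head = btrfs_select_ref_head(delayed_refs);   /* the new assertion checks this lock */
    if (head)
            btrfs_delayed_ref_lock(delayed_refs, head);
    spin_unlock(&delayed_refs->lock);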
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e4579e66a57a..c61af9012a82 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -506,6 +506,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( { struct btrfs_delayed_ref_head *head; + lockdep_assert_held(&delayed_refs->lock); again: head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start, true); From 61c681fef7dd44521070fd4198abede2077afb8e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:04 +0100 Subject: [PATCH 102/194] btrfs: use bool type for delayed ref head fields that are used as booleans There's no point in have several fields defined as 1 bit unsigned int in struct btrfs_delayed_ref_head, we can instead use a bool type, it makes the code a bit more readable and it doesn't change the structure size. So switch them to proper booleans. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 12 ++++++------ fs/btrfs/delayed-ref.h | 8 ++++---- fs/btrfs/extent-tree.c | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c61af9012a82..bf8c2ac6c95e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -531,7 +531,7 @@ again: href_node); } - head->processing = 1; + head->processing = true; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; delayed_refs->run_delayed_start = head->bytenr + @@ -549,7 +549,7 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); delayed_refs->num_heads--; - if (head->processing == 0) + if (!head->processing) delayed_refs->num_heads_ready--; } @@ -697,7 +697,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, bool is_system) { int count_mod = 1; - int must_insert_reserved = 0; + bool must_insert_reserved = false; /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); @@ -722,9 +722,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, * BTRFS_ADD_DELAYED_REF because other special casing is not required. */ if (action == BTRFS_ADD_DELAYED_EXTENT) - must_insert_reserved = 1; + must_insert_reserved = true; else - must_insert_reserved = 0; + must_insert_reserved = false; refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; @@ -736,7 +736,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->ref_tree = RB_ROOT_CACHED; INIT_LIST_HEAD(&head_ref->ref_add_list); RB_CLEAR_NODE(&head_ref->href_node); - head_ref->processing = 0; + head_ref->processing = false; head_ref->total_ref_mod = count_mod; spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 9b103bf27185..b8e14b0ba5f1 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -116,10 +116,10 @@ struct btrfs_delayed_ref_head { * we need to update the in ram accounting to properly reflect * the free has happened. 
*/ - unsigned int must_insert_reserved:1; - unsigned int is_data:1; - unsigned int is_system:1; - unsigned int processing:1; + bool must_insert_reserved; + bool is_data; + bool is_system; + bool processing; }; struct btrfs_delayed_tree_ref { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 81a673506077..911908ea5f6f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1497,7 +1497,7 @@ out: static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; @@ -1647,7 +1647,7 @@ out: static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; struct btrfs_delayed_tree_ref *ref; @@ -1687,7 +1687,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, - int insert_reserved) + bool insert_reserved) { int ret = 0; @@ -1745,7 +1745,7 @@ static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_ref struct btrfs_delayed_ref_head *head) { spin_lock(&delayed_refs->lock); - head->processing = 0; + head->processing = false; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); btrfs_delayed_ref_unlock(head); @@ -1897,7 +1897,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_extent_op *extent_op; struct btrfs_delayed_ref_node *ref; - int must_insert_reserved = 0; + bool must_insert_reserved; int ret; delayed_refs = &trans->transaction->delayed_refs; @@ -1939,7 +1939,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; + locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; locked_ref->extent_op = NULL; @@ -3211,7 +3211,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out; btrfs_delete_ref_head(delayed_refs, head); - head->processing = 0; + head->processing = false; spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); From f1ed785a5b9ca944771a08d0a2cdca961ac24329 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:05 +0100 Subject: [PATCH 103/194] btrfs: use a single switch statement when initializing delayed ref head At init_delayed_ref_head(), we are using two separate if statements to check the delayed ref head action, and initializing 'must_insert_reserved' to false twice, once when the variable is declared and once again in an else branch. Make this simpler and more straightforward by having a single switch statement, also moving the comment about a drop action to the corresponding switch case to make it more clear and eliminating the duplicated initialization of 'must_insert_reserved' to false. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 44 +++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index bf8c2ac6c95e..6a13cf00218b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -702,29 +702,33 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, /* If reserved is provided, it must be a data extent. */ BUG_ON(!is_data && reserved); - /* - * The head node stores the sum of all the mods, so dropping a ref - * should drop the sum in the head node by one. - */ - if (action == BTRFS_UPDATE_DELAYED_HEAD) + switch (action) { + case BTRFS_UPDATE_DELAYED_HEAD: count_mod = 0; - else if (action == BTRFS_DROP_DELAYED_REF) + break; + case BTRFS_DROP_DELAYED_REF: + /* + * The head node stores the sum of all the mods, so dropping a ref + * should drop the sum in the head node by one. + */ count_mod = -1; - - /* - * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved - * accounting when the extent is finally added, or if a later - * modification deletes the delayed ref without ever inserting the - * extent into the extent allocation tree. ref->must_insert_reserved - * is the flag used to record that accounting mods are required. - * - * Once we record must_insert_reserved, switch the action to - * BTRFS_ADD_DELAYED_REF because other special casing is not required. - */ - if (action == BTRFS_ADD_DELAYED_EXTENT) + break; + case BTRFS_ADD_DELAYED_EXTENT: + /* + * BTRFS_ADD_DELAYED_EXTENT means that we need to update the + * reserved accounting when the extent is finally added, or if a + * later modification deletes the delayed ref without ever + * inserting the extent into the extent allocation tree. + * ref->must_insert_reserved is the flag used to record that + * accounting mods are required. + * + * Once we record must_insert_reserved, switch the action to + * BTRFS_ADD_DELAYED_REF because other special casing is not + * required. + */ must_insert_reserved = true; - else - must_insert_reserved = false; + break; + } refcount_set(&head_ref->refs, 1); head_ref->bytenr = bytenr; From 184533e3618f4d0b382c1ef3de0ce34e849005d7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 May 2023 16:17:06 +0100 Subject: [PATCH 104/194] btrfs: remove unnecessary prototype declarations at disk-io.c We have a few static functions at disk-io.c for which we have a forward declaration of their prototype, but it's not needed because all those functions are defined before they are called, so remove them. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 55cbbe7b0dba..c22f8f58c951 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,15 +60,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void btrfs_destroy_ordered_extents(struct btrfs_root *root); -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info); -static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *pinned_extents); static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); From 99f09ce309b8307ce8dca209f936e99a7c332214 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Jun 2023 12:19:42 +0100 Subject: [PATCH 105/194] btrfs: make btrfs_destroy_delayed_refs() return void btrfs_destroy_delayed_refs() always returns 0 and its single caller does not check its return value, as it also returns void, and so does the callers' caller and so on. This is because we are in the transaction abort path, where we have no way to deal with errors (we are in a critical situation) and all cleanup of resources works in a best effort fashion. So make btrfs_destroy_delayed_refs() return void. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c22f8f58c951..4328c6b1120e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4562,13 +4562,12 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } -static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_fs_info *fs_info) +static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_fs_info *fs_info) { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; - int ret = 0; delayed_refs = &trans->delayed_refs; @@ -4576,7 +4575,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (atomic_read(&delayed_refs->num_entries) == 0) { spin_unlock(&delayed_refs->lock); btrfs_debug(fs_info, "delayed_refs has NO entry"); - return ret; + return; } while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { @@ -4637,8 +4636,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, btrfs_qgroup_destroy_extent_records(trans); spin_unlock(&delayed_refs->lock); - - return ret; } static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) From dd8b7b0416704efb0dcd74801a1a48aa221f1cf5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:04 +0200 Subject: [PATCH 106/194] btrfs: optimize out btrfs_is_zoned for !CONFIG_BLK_DEV_ZONED Add an IS_ENABLED check for CONFIG_BLK_DEV_ZONED in addition to the run-time check for the zone size. This will allow to make use of compiler dead code elimination for code guarded by btrfs_is_zoned, and for example provide just a dangling prototype for a function instead of adding a stub. 
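As a rough illustration of the effect (the helper name below is hypothetical, not from this patch): with CONFIG_BLK_DEV_ZONED=n, btrfs_is_zoned() now folds to a compile-time false, so a zoned-only helper can be declared without a !CONFIG stub because the compiler eliminates the guarded call:

    /* Hypothetical zoned-only helper: prototype only, no stub definition needed. */
    void btrfs_zoned_only_helper(struct btrfs_fs_info *fs_info);

    static void example(struct btrfs_fs_info *fs_info)
    {
            /* Constant false when zoned support is compiled out ... */
            if (btrfs_is_zoned(fs_info))
                    /* ... so this call is removed as dead code and never linked. */
                    btrfs_zoned_only_helper(fs_info);
    }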
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 840e4def18b5..5dd24c2916a1 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -853,7 +853,7 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zone_size > 0; + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0; } /* From e9cb93b9fbd0bfcef2774e82d095362ad16acf6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:05 +0200 Subject: [PATCH 107/194] btrfs: don't call btrfs_record_physical_zoned for failed append When a zoned append command fails there is no written address reported, so don't try to record it. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 2e2e1abd5838..ea6c81f9d1a3 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -348,7 +348,7 @@ static void btrfs_simple_end_io(struct bio *bio) INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } else { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) btrfs_record_physical_zoned(bbio); btrfs_orig_bbio_end_io(bbio); } From 6e4b2479ab38b3f949a85964da212295d32102f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:06 +0200 Subject: [PATCH 108/194] btrfs: mark the len field in struct btrfs_ordered_sum as unsigned len can't ever be negative, so mark it as an u32 instead of int. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 2 +- fs/btrfs/ordered-data.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 594b69d94241..24b974bb828a 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -561,7 +561,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, } sums->bytenr = start; - sums->len = (int)size; + sums->len = size; offset = bytes_to_csum_size(fs_info, start - key.offset); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f0f1138d23c3..2e54820a5e6f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -20,7 +20,7 @@ struct btrfs_ordered_sum { /* * this is the length in bytes covered by the sums array below. */ - int len; + u32 len; struct list_head list; /* last field is a variable length array of csums */ u8 sums[]; From 5cfe76f846d5034058c0c4813259d5d831757c36 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:07 +0200 Subject: [PATCH 109/194] btrfs: rename the bytenr field in struct btrfs_ordered_sum to logical btrfs_ordered_sum::bytendr stores a logical address. Make that clear by renaming it to ->logical. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 8 ++++---- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.h | 8 ++++---- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/tree-log.c | 12 ++++++------ fs/btrfs/zoned.c | 4 ++-- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 24b974bb828a..415e50904db3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -560,7 +560,7 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, goto fail; } - sums->bytenr = start; + sums->logical = start; sums->len = size; offset = bytes_to_csum_size(fs_info, start - key.offset); @@ -749,7 +749,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - sums->bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; index = 0; shash->tfm = fs_info->csum_shash; @@ -799,7 +799,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) ordered = btrfs_lookup_ordered_extent(inode, offset); ASSERT(ordered); /* Logic error */ - sums->bytenr = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + sums->logical = (bio->bi_iter.bi_sector << SECTOR_SHIFT) + total_bytes; index = 0; } @@ -1086,7 +1086,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, again: next_offset = (u64)-1; found_next = 0; - bytenr = sums->bytenr + total_bytes; + bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 40b5f2df9ecc..ad8196f31cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2850,7 +2850,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, - sum->bytenr); + sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 2e54820a5e6f..ebc980ac967a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -14,13 +14,13 @@ struct btrfs_ordered_inode_tree { }; struct btrfs_ordered_sum { - /* bytenr is the start of this extent on disk */ - u64 bytenr; - /* - * this is the length in bytes covered by the sums array below. + * Logical start address and length for of the blocks covered by + * the sums array. */ + u64 logical; u32 len; + struct list_head list; /* last field is a variable length array of csums */ u8 sums[]; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0bda67ad349e..1333c8e2ddad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4379,8 +4379,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. 
*/ - new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; - sums->bytenr = new_bytenr; + new_bytenr = ordered->disk_bytenr + sums->logical - disk_bytenr; + sums->logical = new_bytenr; btrfs_add_ordered_sum(ordered, sums); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ecb73da5d25a..f91a6175fd11 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -859,10 +859,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); csum_root = btrfs_csum_root(fs_info, - sums->bytenr); + sums->logical); if (!ret) ret = btrfs_del_csums(trans, csum_root, - sums->bytenr, + sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, @@ -4221,7 +4221,7 @@ static int log_csums(struct btrfs_trans_handle *trans, struct btrfs_root *log_root, struct btrfs_ordered_sum *sums) { - const u64 lock_end = sums->bytenr + sums->len - 1; + const u64 lock_end = sums->logical + sums->len - 1; struct extent_state *cached_state = NULL; int ret; @@ -4239,7 +4239,7 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); if (ret) return ret; @@ -4252,11 +4252,11 @@ static int log_csums(struct btrfs_trans_handle *trans, * some checksums missing in the fs/subvolume tree. So just delete (or * trim and adjust) any existing csum items in the log for this range. */ - ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->bytenr, lock_end, + unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 98d6b8cc3874..eca49e6e0e5f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1711,9 +1711,9 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) list_for_each_entry(sum, &ordered->list, list) { if (logical < orig_logical) - sum->bytenr -= orig_logical - logical; + sum->logical -= orig_logical - logical; else - sum->bytenr += logical - orig_logical; + sum->logical += logical - orig_logical; } } From cbfce4c7fbde23cc8bcba44822a58c728caf6ec9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:08 +0200 Subject: [PATCH 110/194] btrfs: optimize the logical to physical mapping for zoned writes The current code to store the final logical to physical mapping for a zone append write in the extent tree is rather inefficient. It first has to split the ordered extent so that there is one ordered extent per bio, so that it can look up the ordered extent on I/O completion in btrfs_record_physical_zoned and store the physical LBA returned by the block driver in the ordered extent. btrfs_rewrite_logical_zoned then has to do a lookup in the chunk tree to see what physical address the logical address for this bio / ordered extent is mapped to, and then rewrite it in the extent tree. 
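The remapping is needed in the first place because of how REQ_OP_ZONE_APPEND works: the bio is submitted against the start of the target zone, and the device only reports the sector it actually wrote at completion time. Roughly, using the two lines already present in the code:

  /* Submission: aim the bio at the start of the zone. */
  bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;

  /* Completion: the device has filled in the real write position. */
  physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;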
To optimize this process, we can store the physical address assigned in the chunk tree to the original logical address and a pointer to btrfs_ordered_sum structure the in the btrfs_bio structure, and then use this information to rewrite the logical address in the btrfs_ordered_sum structure directly at I/O completion time in btrfs_record_physical_zoned. btrfs_rewrite_logical_zoned then simply updates the logical address in the extent tree and the ordered_extent itself. The code in btrfs_rewrite_logical_zoned now runs for all data I/O completions in zoned file systems, which is fine as there is no remapping to do for non-append writes to conventional zones or for relocation, and the overhead for quickly breaking out of the loop is very low. Because zoned file systems now need the ordered_sums structure to record the actual write location returned by zone append, allocate dummy structures without the csum array for them when the I/O doesn't use checksums, and free them when completing the ordered_extent. Note that the btrfs_bio doesn't grow as the new field are places into a union that is so far not used for data writes and has plenty of space left in it. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 5 ++++ fs/btrfs/bio.h | 17 ++++++++++-- fs/btrfs/file-item.c | 30 ++++++++++++++++++++ fs/btrfs/file-item.h | 1 + fs/btrfs/inode.c | 6 +--- fs/btrfs/ordered-data.c | 1 - fs/btrfs/ordered-data.h | 6 ---- fs/btrfs/zoned.c | 61 +++++++++++++++-------------------------- 8 files changed, 73 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index ea6c81f9d1a3..54cfab7394d3 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -431,6 +431,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) u64 zone_start = round_down(physical, dev->fs_info->zone_size); ASSERT(btrfs_dev_is_sequential(dev, physical)); + btrfs_bio(bio)->orig_physical = physical; bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, @@ -685,6 +686,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) goto fail_put_bio; + } else if (use_append) { + ret = btrfs_alloc_dummy_sum(bbio); + if (ret) + goto fail_put_bio; } } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index a8eca3a65673..8a29980159b4 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -39,8 +39,8 @@ struct btrfs_bio { union { /* - * Data checksumming and original I/O information for internal - * use in the btrfs_submit_bio machinery. + * For data reads: checksumming and original I/O information. + * (for internal use in the btrfs_submit_bio machinery only) */ struct { u8 *csum; @@ -48,7 +48,18 @@ struct btrfs_bio { struct bvec_iter saved_iter; }; - /* For metadata parentness verification. */ + /* + * For data writes: + * - pointer to the checksums for this bio + * - original physical address from the allocator + * (for zone append only) + */ + struct { + struct btrfs_ordered_sum *sums; + u64 orig_physical; + }; + + /* For metadata reads: parentness verification. 
*/ struct btrfs_tree_parent_check parent_check; }; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 415e50904db3..0cb4a9921d21 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -818,11 +818,41 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) } this_sum_bytes = 0; + + /* + * The ->sums assignment is for zoned writes, where a bio never spans + * ordered extents and is only done unconditionally because that's cheaper + * than a branch. + */ + bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); return 0; } +/* + * Nodatasum I/O on zoned file systems still requires an btrfs_ordered_sum to + * record the updated logical address on Zone Append completion. + * Allocate just the structure with an empty sums array here for that case. + */ +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +{ + struct btrfs_ordered_extent *ordered = + btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); + + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + + bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); + if (!bbio->sums) + return BLK_STS_RESOURCE; + bbio->sums->len = bbio->bio.bi_iter.bi_size; + bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_add_ordered_sum(ordered, bbio->sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + /* * Remove one checksum overlapping a range. * diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 6be8725cd574..4ec669b69008 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); +blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ad8196f31cdb..31c5b7c176d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3301,14 +3301,10 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - /* A valid ->physical implies a write on a sequential zone. 
*/ - if (ordered_extent->physical != (u64)-1) { + if (btrfs_is_zoned(fs_info)) { btrfs_rewrite_logical_zoned(ordered_extent); btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); - } else if (btrfs_is_data_reloc_root(inode->root)) { - btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes); } if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a9778a91511e..324a5a8c844a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -209,7 +209,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - entry->physical = (u64)-1; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); entry->flags = flags; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ebc980ac967a..dc700aa515b5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,12 +151,6 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; - - /* - * Used to reverse-map physical address returned from ZONE_APPEND write - * command in a workqueue context - */ - u64 physical; }; static inline void diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index eca49e6e0e5f..b55b0d4ee86f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1657,51 +1657,28 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_sum *sum = bbio->sums; - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON(!ordered)) - return; - - ordered->physical = physical; - btrfs_put_ordered_extent(ordered); + if (physical < bbio->orig_physical) + sum->logical -= bbio->orig_physical - physical; + else + sum->logical += physical - bbio->orig_physical; } void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_map_tree *em_tree; + struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; - struct btrfs_ordered_sum *sum; - u64 orig_logical = ordered->disk_bytenr; - struct map_lookup *map; - u64 physical = ordered->physical; - u64 chunk_start_phys; - u64 logical; + struct btrfs_ordered_sum *sum = + list_first_entry(&ordered->list, typeof(*sum), list); + u64 logical = sum->logical; - em = btrfs_get_chunk_map(fs_info, orig_logical, 1); - if (IS_ERR(em)) - return; - map = em->map_lookup; - chunk_start_phys = map->stripes[0].physical; - - if (WARN_ON_ONCE(map->num_stripes > 1) || - WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) || - WARN_ON_ONCE(physical < chunk_start_phys) || - WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) { - free_extent_map(em); - return; - } - logical = em->start + (physical - map->stripes[0].physical); - free_extent_map(em); - - if (orig_logical == logical) - return; + if (ordered->disk_bytenr == logical) + goto out; ordered->disk_bytenr = logical; - em_tree = &inode->extent_tree; write_lock(&em_tree->lock); em = search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); @@ -1709,11 +1686,17 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) 
free_extent_map(em); write_unlock(&em_tree->lock); - list_for_each_entry(sum, &ordered->list, list) { - if (logical < orig_logical) - sum->logical -= orig_logical - logical; - else - sum->logical += logical - orig_logical; +out: + /* + * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures + * were allocated by btrfs_alloc_dummy_sum only to record the logical + * addresses and don't contain actual checksums. We thus must free them + * here so that we don't attempt to log the csums later. + */ + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &inode->root->fs_info->fs_state)) { + list_del(&sum->list); + kfree(sum); } } From 3887653c44ec1de089b8624bd53af05b946939e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jun 2023 07:27:04 +0200 Subject: [PATCH 111/194] btrfs: record orig_physical only for the original bio btrfs_submit_dev_bio is also called for clone bios that aren't embedded into a btrfs_bio structure, but previous commit "btrfs: optimize the logical to physical mapping for zoned writes" added code to assign btrfs_bio.orig_physical in it. This is harmless right now as only the single data profile can be used on zoned devices, but will blow up when the RAID stripe tree is added. Move it out into the single I/O specific branch in the caller. Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 54cfab7394d3..b1cdd145aada 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -431,7 +431,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) u64 zone_start = round_down(physical, dev->fs_info->zone_size); ASSERT(btrfs_dev_is_sequential(dev, physical)); - btrfs_bio(bio)->orig_physical = physical; bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; } btrfs_debug_in_rcu(dev->fs_info, @@ -480,6 +479,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_private = smap->dev; bio->bi_end_io = btrfs_simple_end_io; btrfs_submit_dev_bio(smap->dev, bio); From a6f3e205e4916e4f29ce2c4c38d520616e6b0080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:09 +0200 Subject: [PATCH 112/194] btrfs: move split_extent_map to extent_map.c split_extent_map doesn't have anything to do with the other code in inode.c, so move it to extent_map.c. This also allows marking replace_extent_mapping static. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 98 +++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/extent_map.h | 5 +-- fs/btrfs/inode.c | 90 --------------------------------------- 3 files changed, 95 insertions(+), 98 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4d86d4739217..e5124461ea02 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -504,10 +504,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&em->rb_node); } -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified) +static void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) { lockdep_assert_held_write(&tree->lock); @@ -961,3 +961,93 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, return ret; } + +/* + * Split off the first pre bytes from the extent_map at [start, start + len] + * + * This function is used when an ordered_extent needs to be split. + */ +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) +{ + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; + int ret = 0; + unsigned long flags; + + ASSERT(pre != 0); + ASSERT(pre < len); + + split_pre = alloc_extent_map(); + if (!split_pre) + return -ENOMEM; + split_mid = alloc_extent_map(); + if (!split_mid) { + ret = -ENOMEM; + goto out_free_pre; + } + + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + ret = -EIO; + goto out_unlock; + } + + ASSERT(em->len == len); + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); + + flags = em->flags; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; + split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = em->block_start; + split_pre->block_len = split_pre->len; + split_pre->orig_block_len = split_pre->block_len; + split_pre->ram_bytes = split_pre->len; + split_pre->flags = flags; + split_pre->compress_type = em->compress_type; + split_pre->generation = em->generation; + + replace_extent_mapping(em_tree, em, split_pre, 1); + + /* + * Now we only have an extent_map at: + * [em->start, em->start + pre] + */ + + /* Insert the middle extent_map. 
*/ + split_mid->start = em->start + pre; + split_mid->len = em->len - pre; + split_mid->orig_start = split_mid->start; + split_mid->block_start = em->block_start + pre; + split_mid->block_len = split_mid->len; + split_mid->orig_block_len = split_mid->block_len; + split_mid->ram_bytes = split_mid->len; + split_mid->flags = flags; + split_mid->compress_type = em->compress_type; + split_mid->generation = em->generation; + add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); + /* Once for the tree */ + free_extent_map(em); + +out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + free_extent_map(split_mid); +out_free_pre: + free_extent_map(split_pre); + return ret; +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..7df39112388d 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -90,10 +90,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -void replace_extent_mapping(struct extent_map_tree *tree, - struct extent_map *cur, - struct extent_map *new, - int modified); +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 31c5b7c176d3..aba6dba346f4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2714,96 +2714,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -/* - * Split off the first pre bytes from the extent_map at [start, start + len] - * - * This function is intended to be used only for extract_ordered_extent(). - */ -static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - struct extent_map *split_pre = NULL; - struct extent_map *split_mid = NULL; - int ret = 0; - unsigned long flags; - - ASSERT(pre != 0); - ASSERT(pre < len); - - split_pre = alloc_extent_map(); - if (!split_pre) - return -ENOMEM; - split_mid = alloc_extent_map(); - if (!split_mid) { - ret = -ENOMEM; - goto out_free_pre; - } - - lock_extent(&inode->io_tree, start, start + len - 1, NULL); - write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) { - ret = -EIO; - goto out_unlock; - } - - ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); - ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); - ASSERT(!list_empty(&em->list)); - - flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - - /* First, replace the em with a new extent_map starting from * em->start */ - split_pre->start = em->start; - split_pre->len = pre; - split_pre->orig_start = split_pre->start; - split_pre->block_start = em->block_start; - split_pre->block_len = split_pre->len; - split_pre->orig_block_len = split_pre->block_len; - split_pre->ram_bytes = split_pre->len; - split_pre->flags = flags; - split_pre->compress_type = em->compress_type; - split_pre->generation = em->generation; - - replace_extent_mapping(em_tree, em, split_pre, 1); - - /* - * Now we only have an extent_map at: - * [em->start, em->start + pre] - */ - - /* Insert the middle extent_map. 
*/ - split_mid->start = em->start + pre; - split_mid->len = em->len - pre; - split_mid->orig_start = split_mid->start; - split_mid->block_start = em->block_start + pre; - split_mid->block_len = split_mid->len; - split_mid->orig_block_len = split_mid->block_len; - split_mid->ram_bytes = split_mid->len; - split_mid->flags = flags; - split_mid->compress_type = em->compress_type; - split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, 1); - - /* Once for us */ - free_extent_map(em); - /* Once for the tree */ - free_extent_map(em); - -out_unlock: - write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); -out_free_pre: - free_extent_map(split_pre); - return ret; -} - int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, struct btrfs_ordered_extent *ordered) { From ebdb44a00e257641a398fe2484ba9c65a03beea1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:10 +0200 Subject: [PATCH 113/194] btrfs: reorder conditions in btrfs_extract_ordered_extent There is no good reason for doing one before the other in terms of failure implications, but doing the extent_map split first will simplify some upcoming refactoring. Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aba6dba346f4..f7e06839a645 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2719,9 +2719,7 @@ int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; - struct btrfs_inode *inode = bbio->inode; - u64 ordered_len = ordered->num_bytes; - int ret = 0; + int ret; /* Must always be called for the beginning of an ordered extent. */ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) @@ -2731,18 +2729,18 @@ int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, if (ordered->disk_num_bytes == len) return 0; - ret = btrfs_split_ordered_extent(ordered, len); - if (ret) - return ret; - /* * Don't split the extent_map for NOCOW extents, as we're writing into * a pre-existing one. */ - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - return 0; + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len); + if (ret) + return ret; + } - return split_extent_map(inode, bbio->file_offset, ordered_len, len); + return btrfs_split_ordered_extent(ordered, len); } /* From b0307e28642efa3bcc6353b9af213711f6d3848e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:11 +0200 Subject: [PATCH 114/194] btrfs: return the new ordered_extent from btrfs_split_ordered_extent Return the ordered_extent split from the passed in one. This will be needed to be able to store an ordered_extent in the btrfs_bio. 
Reviewed-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 +++++++- fs/btrfs/ordered-data.c | 19 ++++++++++--------- fs/btrfs/ordered-data.h | 3 ++- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f7e06839a645..9f026e632bf7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2719,6 +2719,7 @@ int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_ordered_extent *new; int ret; /* Must always be called for the beginning of an ordered extent. */ @@ -2740,7 +2741,12 @@ int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, return ret; } - return btrfs_split_ordered_extent(ordered, len); + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return PTR_ERR(new); + btrfs_put_ordered_extent(new); + + return 0; } /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 324a5a8c844a..15a410bbbe70 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1116,7 +1116,8 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, } /* Split out a new ordered extent for this first @len bytes of @ordered. */ -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len) { struct inode *inode = ordered->inode; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; @@ -1135,16 +1136,16 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) * reduce the original extent to a zero length either. */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* We cannot split once ordered extent is past end_bio. */ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* We cannot split a compressed ordered extent. */ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) - return -EINVAL; + return ERR_PTR(-EINVAL); /* Checksum list should be empty. */ if (WARN_ON_ONCE(!list_empty(&ordered->list))) - return -EINVAL; + return ERR_PTR(-EINVAL); spin_lock_irq(&tree->lock); /* Remove from tree once */ @@ -1171,13 +1172,13 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) /* * The splitting extent is already counted and will be added again in - * btrfs_add_ordered_extent(). Subtract len to avoid double counting. + * btrfs_alloc_ordered_extent(). Subtract len to avoid double counting. 
*/ percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); + return btrfs_alloc_ordered_extent(BTRFS_I(inode), file_offset, len, len, + disk_bytenr, len, 0, flags, + ordered->compress_type); } int __init ordered_data_init(void) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index dc700aa515b5..0ae0f52c148f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -206,7 +206,8 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, struct extent_state **cached_state); bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); +struct btrfs_ordered_extent *btrfs_split_ordered_extent( + struct btrfs_ordered_extent *ordered, u64 len); int __init ordered_data_init(void); void __cold ordered_data_exit(void); From 53d9981ca20efcded0f6cf6b480d713f490ca9f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:13 +0200 Subject: [PATCH 115/194] btrfs: split btrfs_alloc_ordered_extent to allocation and insertion helpers Split two low-level helpers out of btrfs_alloc_ordered_extent to allocate and insert the logic extent. The pure alloc helper will be used to improve btrfs_split_ordered_extent. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 186 ++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 85 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 15a410bbbe70..ea1c2ec10573 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -146,6 +146,102 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, return ret; } +static struct btrfs_ordered_extent *alloc_ordered_extent( + struct btrfs_inode *inode, u64 file_offset, u64 num_bytes, + u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, + u64 offset, unsigned long flags, int compress_type) +{ + struct btrfs_ordered_extent *entry; + int ret; + + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) + return ERR_PTR(ret); + } else { + /* + * The ordered extent has reserved qgroup space, release now + * and pass the reserved number for qgroup_record to free. 
+ */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) + return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) + return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; + entry->ram_bytes = ram_bytes; + entry->disk_bytenr = disk_bytenr; + entry->disk_num_bytes = disk_num_bytes; + entry->offset = offset; + entry->bytes_left = num_bytes; + entry->inode = igrab(&inode->vfs_inode); + entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; + entry->qgroup_rsv = ret; + entry->flags = flags; + refcount_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->root_extent_list); + INIT_LIST_HEAD(&entry->work_list); + init_completion(&entry->completion); + + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + + return entry; +} + +static void insert_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct btrfs_inode *inode = BTRFS_I(entry->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct rb_node *node; + + trace_btrfs_ordered_extent_add(inode, entry); + + percpu_counter_add_batch(&fs_info->ordered_bytes, entry->num_bytes, + fs_info->delalloc_batch); + + /* One ref for the tree. */ + refcount_inc(&entry->refs); + + spin_lock_irq(&tree->lock); + node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "inconsistency in ordered tree at offset %llu", + entry->file_offset); + spin_unlock_irq(&tree->lock); + + spin_lock(&root->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &root->ordered_extents); + root->nr_ordered_extents++; + if (root->nr_ordered_extents == 1) { + spin_lock(&fs_info->ordered_root_lock); + BUG_ON(!list_empty(&root->ordered_root)); + list_add_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); + } + spin_unlock(&root->ordered_extent_lock); +} + /* * Add an ordered extent to the per-inode tree. * @@ -171,95 +267,15 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type) { - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct rb_node *node; struct btrfs_ordered_extent *entry; - int ret; - - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { - /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); - if (ret < 0) - return ERR_PTR(ret); - ret = 0; - } else { - /* - * The ordered extent has reserved qgroup space, release now - * and pass the reserved number for qgroup_record to free. 
- */ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); - if (ret < 0) - return ERR_PTR(ret); - } - entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) - return ERR_PTR(-ENOMEM); - - entry->file_offset = file_offset; - entry->num_bytes = num_bytes; - entry->ram_bytes = ram_bytes; - entry->disk_bytenr = disk_bytenr; - entry->disk_num_bytes = disk_num_bytes; - entry->offset = offset; - entry->bytes_left = num_bytes; - entry->inode = igrab(&inode->vfs_inode); - entry->compress_type = compress_type; - entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); - entry->flags = flags; - - percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, - fs_info->delalloc_batch); - - /* one ref for the tree */ - refcount_set(&entry->refs, 1); - init_waitqueue_head(&entry->wait); - INIT_LIST_HEAD(&entry->list); - INIT_LIST_HEAD(&entry->log_list); - INIT_LIST_HEAD(&entry->root_extent_list); - INIT_LIST_HEAD(&entry->work_list); - init_completion(&entry->completion); - - trace_btrfs_ordered_extent_add(inode, entry); - - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, file_offset, - &entry->rb_node); - if (node) - btrfs_panic(fs_info, -EEXIST, - "inconsistency in ordered tree at offset %llu", - file_offset); - spin_unlock_irq(&tree->lock); - - spin_lock(&root->ordered_extent_lock); - list_add_tail(&entry->root_extent_list, - &root->ordered_extents); - root->nr_ordered_extents++; - if (root->nr_ordered_extents == 1) { - spin_lock(&fs_info->ordered_root_lock); - BUG_ON(!list_empty(&root->ordered_root)); - list_add_tail(&root->ordered_root, &fs_info->ordered_roots); - spin_unlock(&fs_info->ordered_root_lock); - } - spin_unlock(&root->ordered_extent_lock); - - /* - * We don't need the count_max_extents here, we can assume that all of - * that work has been done at higher layers, so this is truly the - * smallest the extent is going to get. - */ - spin_lock(&inode->lock); - btrfs_mod_outstanding_extents(inode, 1); - spin_unlock(&inode->lock); - - /* One ref for the returned entry to match semantics of lookup. */ - refcount_inc(&entry->refs); + entry = alloc_ordered_extent(inode, file_offset, num_bytes, ram_bytes, + disk_bytenr, disk_num_bytes, offset, flags, + compress_type); + if (!IS_ERR(entry)) + insert_ordered_extent(entry); return entry; } From 816f589b8d43f7c258b0c10a5ce29daf01614651 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:14 +0200 Subject: [PATCH 116/194] btrfs: atomically insert the new extent in btrfs_split_ordered_extent Currently there is a small race window in btrfs_split_ordered_extent, where the reduced old extent can be looked up on the per-inode rbtree or the per-root list while the newly split out one isn't visible yet. Fix this by open coding btrfs_alloc_ordered_extent in btrfs_split_ordered_extent, and holding the tree lock and root->ordered_extent_lock over the entire tree and extent manipulation. Note that this introduces new lock ordering because previously ordered_extent_lock was never held over the tree lock. 
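Spelled out, the nesting used in the hunk below takes the per-root ordered_extent_lock outside the per-inode tree lock, and any other path that ever takes both must now follow the same order:

  spin_lock_irq(&root->ordered_extent_lock);
  spin_lock(&tree->lock);
  /* shrink the old extent and insert both tree nodes atomically */
  spin_unlock(&tree->lock);
  /* the root ordered-extents list is updated under the outer lock */
  spin_unlock_irq(&root->ordered_extent_lock);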
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ea1c2ec10573..0dc2bd0bd675 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1135,15 +1135,17 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { - struct inode *inode = ordered->inode; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; u64 disk_bytenr = ordered->disk_bytenr; unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + struct btrfs_ordered_extent *new; struct rb_node *node; - trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + trace_btrfs_ordered_extent_split(inode, ordered); ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); @@ -1163,7 +1165,16 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( if (WARN_ON_ONCE(!list_empty(&ordered->list))) return ERR_PTR(-EINVAL); - spin_lock_irq(&tree->lock); + new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, + len, 0, flags, ordered->compress_type); + if (IS_ERR(new)) + return new; + + /* One ref for the tree. */ + refcount_inc(&new->refs); + + spin_lock_irq(&root->ordered_extent_lock); + spin_lock(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; rb_erase(node, &tree->tree); @@ -1182,19 +1193,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", - ordered->file_offset); + ordered->file_offset); - spin_unlock_irq(&tree->lock); + node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + new->file_offset); + spin_unlock(&tree->lock); - /* - * The splitting extent is already counted and will be added again in - * btrfs_alloc_ordered_extent(). Subtract len to avoid double counting. - */ - percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); - - return btrfs_alloc_ordered_extent(BTRFS_I(inode), file_offset, len, len, - disk_bytenr, len, 0, flags, - ordered->compress_type); + list_add_tail(&new->root_extent_list, &root->ordered_extents); + root->nr_ordered_extents++; + spin_unlock_irq(&root->ordered_extent_lock); + return new; } int __init ordered_data_init(void) From 52b1fdca23ac0fbcad363a1a5b426bf0d56b715a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:15 +0200 Subject: [PATCH 117/194] btrfs: handle completed ordered extents in btrfs_split_ordered_extent To delay splitting ordered_extents to I/O completion time we need to be able to handle fully completed ordered extents in btrfs_split_ordered_extent. Besides a bit of accounting this primarily involved moving over the csums to the split bio for the range that it covers, which is simple enough because we always have one btrfs_ordered_sum per bio. 
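Because every bio contributes exactly one btrfs_ordered_sum, handing the checksums of the split-off range over to the new extent is a plain list walk, as the hunk below does:

  list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) {
  	if (offset == len)
  		break;
  	list_move_tail(&sum->list, &new->list);
  	offset += sum->len;
  }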
Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0dc2bd0bd675..bb76bf01a332 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1141,9 +1141,11 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; u64 disk_bytenr = ordered->disk_bytenr; - unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; + unsigned long flags = ordered->flags; + struct btrfs_ordered_sum *sum, *tmpsum; struct btrfs_ordered_extent *new; struct rb_node *node; + u64 offset = 0; trace_btrfs_ordered_extent_split(inode, ordered); @@ -1155,15 +1157,15 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( */ if (WARN_ON_ONCE(len >= ordered->num_bytes)) return ERR_PTR(-EINVAL); - /* We cannot split once ordered extent is past end_bio. */ - if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) - return ERR_PTR(-EINVAL); + /* We cannot split partially completed ordered extents. */ + if (ordered->bytes_left) { + ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS)); + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) + return ERR_PTR(-EINVAL); + } /* We cannot split a compressed ordered extent. */ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) return ERR_PTR(-EINVAL); - /* Checksum list should be empty. */ - if (WARN_ON_ONCE(!list_empty(&ordered->list))) - return ERR_PTR(-EINVAL); new = alloc_ordered_extent(inode, file_offset, len, len, disk_bytenr, len, 0, flags, ordered->compress_type); @@ -1186,7 +1188,29 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( ordered->disk_bytenr += len; ordered->num_bytes -= len; ordered->disk_num_bytes -= len; - ordered->bytes_left -= len; + + if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) { + ASSERT(ordered->bytes_left == 0); + new->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags)) { + if (ordered->truncated_len > len) { + ordered->truncated_len -= len; + } else { + new->truncated_len = ordered->truncated_len; + ordered->truncated_len = 0; + } + } + + list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { + if (offset == len) + break; + list_move_tail(&sum->list, &new->list); + offset += sum->len; + } /* Re-insert the node */ node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); From 71df088c1cc090d232eb691d8f42284a2c6409eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:16 +0200 Subject: [PATCH 118/194] btrfs: defer splitting of ordered extents until I/O completion The btrfs zoned completion code currently needs an ordered_extent and extent_map per bio so that it can account for the non-predictable write location from Zone Append. To archive that it currently splits the ordered_extent and extent_map at I/O submission time, and then records the actual physical address in the ->physical field of the ordered_extent. This patch instead switches to record the "original" physical address that the btrfs allocator assigned in spare space in the btrfs_bio, and then rewrites the logical address in the btrfs_ordered_sum structure at I/O completion time. 
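Concretely, btrfs_record_physical_zoned() (reworked earlier in this series) only has to apply the delta between the physical address the device reports and the one originally assigned to the recorded logical address:

  const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
  struct btrfs_ordered_sum *sum = bbio->sums;

  if (physical < bbio->orig_physical)
  	sum->logical -= bbio->orig_physical - physical;
  else
  	sum->logical += physical - bbio->orig_physical;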
This allows the ordered extent completion handler to simply walk the list of ordered csums and split the ordered extent as needed. This removes an extra ordered extent and extent_map lookup and manipulation during the I/O submission path, and instead batches it in the I/O completion path where we need to touch these anyway. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/bio.c | 17 ---------- fs/btrfs/btrfs_inode.h | 2 -- fs/btrfs/inode.c | 18 +++++++---- fs/btrfs/ordered-data.h | 1 + fs/btrfs/zoned.c | 70 ++++++++++++++++++++++++++++++++++------- fs/btrfs/zoned.h | 6 ++-- 6 files changed, 73 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b1cdd145aada..1b4d8d60c66f 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -61,20 +61,6 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, return bbio; } -static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) -{ - struct btrfs_ordered_extent *ordered; - int ret; - - ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - ret = btrfs_extract_ordered_extent(bbio, ordered); - btrfs_put_ordered_extent(ordered); - - return errno_to_blk_status(ret); -} - static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, struct btrfs_bio *orig_bbio, u64 map_length, bool use_append) @@ -668,9 +654,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (use_append) { bio->bi_opf &= ~REQ_OP_WRITE; bio->bi_opf |= REQ_OP_ZONE_APPEND; - ret = btrfs_bio_extract_ordered_extent(bbio); - if (ret) - goto fail_put_bio; } /* diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 08c996023394..8abf96cfea8f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,8 +410,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9f026e632bf7..ff3110a2b4e6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2714,8 +2714,8 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } -int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, - struct btrfs_ordered_extent *ordered) +static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, + struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; @@ -3180,7 +3180,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. 
*/ -int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3215,11 +3215,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (btrfs_is_zoned(fs_info)) { - btrfs_rewrite_logical_zoned(ordered_extent); + if (btrfs_is_zoned(fs_info)) btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); - } if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3387,6 +3385,14 @@ out: return ret; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) +{ + if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + btrfs_finish_ordered_zoned(ordered); + return btrfs_finish_one_ordered(ordered); +} + void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0ae0f52c148f..828bb039634a 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -161,6 +161,7 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b55b0d4ee86f..b4dd018ba332 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "super.h" #include "fs.h" #include "accessors.h" #include "bio.h" @@ -1665,17 +1666,11 @@ void btrfs_record_physical_zoned(struct btrfs_bio *bbio) sum->logical += physical - bbio->orig_physical; } -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, + u64 logical) { - struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree; struct extent_map *em; - struct btrfs_ordered_sum *sum = - list_first_entry(&ordered->list, typeof(*sum), list); - u64 logical = sum->logical; - - if (ordered->disk_bytenr == logical) - goto out; ordered->disk_bytenr = logical; @@ -1685,6 +1680,54 @@ void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) em->block_start = logical; free_extent_map(em); write_unlock(&em_tree->lock); +} + +static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, + u64 logical, u64 len) +{ + struct btrfs_ordered_extent *new; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, + ordered->num_bytes, len)) + return false; + + new = btrfs_split_ordered_extent(ordered, len); + if (IS_ERR(new)) + return false; + + if (new->disk_bytenr != logical) + btrfs_rewrite_logical_zoned(new, logical); + btrfs_finish_one_ordered(new); + return true; +} + +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_sum *sum = + list_first_entry(&ordered->list, 
typeof(*sum), list); + u64 logical = sum->logical; + u64 len = sum->len; + + while (len < ordered->disk_num_bytes) { + sum = list_next_entry(sum, list); + if (sum->logical == logical + len) { + len += sum->len; + continue; + } + if (!btrfs_zoned_split_ordered(ordered, logical, len)) { + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_err(fs_info, "failed to split ordered extent"); + goto out; + } + logical = sum->logical; + len = sum->len; + } + + if (ordered->disk_bytenr != logical) + btrfs_rewrite_logical_zoned(ordered, logical); out: /* @@ -1694,9 +1737,12 @@ out: * here so that we don't attempt to log the csums later. */ if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &inode->root->fs_info->fs_state)) { - list_del(&sum->list); - kfree(sum); + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + while ((sum = list_first_entry_or_null(&ordered->list, + typeof(*sum), list))) { + list_del(&sum->list); + kfree(sum); + } } } diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 3058ef559c98..27322b926038 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -30,6 +30,8 @@ struct btrfs_zoned_device_info { struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; +void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered); + #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); @@ -56,7 +58,6 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret); @@ -188,9 +189,6 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline void btrfs_rewrite_logical_zoned( - struct btrfs_ordered_extent *ordered) { } - static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, struct btrfs_block_group **cache_ret) From f000bc6fe43ce66c55fa1691000115b14ba95b33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 May 2023 17:03:17 +0200 Subject: [PATCH 119/194] btrfs: pass the new logical address to split_extent_map split_extent_map splits off the first chunk of an extent map into a new one. One of the two users is the zoned I/O completion code that wants to rewrite the logical block start address right after this split. Pass in the logical address to be set in the split off first extent_map as an argument to avoid an extra extent tree lookup for this case. Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 8 +++++--- fs/btrfs/extent_map.h | 3 ++- fs/btrfs/inode.c | 3 ++- fs/btrfs/zoned.c | 6 ++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index e5124461ea02..0cdb3e86f29b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -963,11 +963,13 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, } /* - * Split off the first pre bytes from the extent_map at [start, start + len] + * Split off the first pre bytes from the extent_map at [start, start + len], + * and set the block_start for it to new_logical. * * This function is used when an ordered_extent needs to be split. 
*/ -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1010,7 +1012,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) split_pre->start = em->start; split_pre->len = pre; split_pre->orig_start = split_pre->start; - split_pre->block_start = em->block_start; + split_pre->block_start = new_logical; split_pre->block_len = split_pre->len; split_pre->orig_block_len = split_pre->block_len; split_pre->ram_bytes = split_pre->len; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 7df39112388d..35d27c756e08 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -90,7 +90,8 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre); +int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ff3110a2b4e6..b8c64882a1e7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2736,7 +2736,8 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len); + ordered->num_bytes, len, + ordered->disk_bytenr); if (ret) return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b4dd018ba332..f1471cb5fe26 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1689,15 +1689,13 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset, - ordered->num_bytes, len)) + ordered->num_bytes, len, logical)) return false; new = btrfs_split_ordered_extent(ordered, len); if (IS_ERR(new)) return false; - - if (new->disk_bytenr != logical) - btrfs_rewrite_logical_zoned(new, logical); + new->disk_bytenr = logical; btrfs_finish_one_ordered(new); return true; } From efcfcbc6a36195c42d98e0ee697baba36da94dc8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 4 Apr 2023 00:06:02 +0200 Subject: [PATCH 120/194] btrfs: add xxhash to fast checksum implementations The implementation of XXHASH is now CPU only but still fast enough to be considered for the synchronous checksumming, like non-generic crc32c. 
A userspace benchmark comparing it to various implementations (patched hash-speedtest from btrfs-progs): Block size: 4096 Iterations: 1000000 Implementation: builtin Units: CPU cycles NULL-NOP: cycles: 73384294, cycles/i 73 NULL-MEMCPY: cycles: 228033868, cycles/i 228, 61664.320 MiB/s CRC32C-ref: cycles: 24758559416, cycles/i 24758, 567.950 MiB/s CRC32C-NI: cycles: 1194350470, cycles/i 1194, 11773.433 MiB/s CRC32C-ADLERSW: cycles: 6150186216, cycles/i 6150, 2286.372 MiB/s CRC32C-ADLERHW: cycles: 626979180, cycles/i 626, 22427.453 MiB/s CRC32C-PCL: cycles: 466746732, cycles/i 466, 30126.699 MiB/s XXHASH: cycles: 860656400, cycles/i 860, 16338.188 MiB/s Comparing purely software implementation (ref), current outdated accelerated using crc32q instruction (NI), optimized implementations by M. Adler (https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software/17646775#17646775) and the best one that was taken from kernel using the PCLMULQDQ instruction (PCL). Reviewed-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4328c6b1120e..7513388b0567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2011,6 +2011,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) if (!strstr(crypto_shash_driver_name(csum_shash), "generic")) set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); break; + case BTRFS_CSUM_TYPE_XXHASH: + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); + break; default: break; } From 3965a4c793d3b031119ff455c4773441584aee30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:34 +0200 Subject: [PATCH 121/194] btrfs: remove unused BTRFS_MAP_DISCARD BTRFS_MAP_DISCARD is never set, as REQ_OP_DISCARD is never passed to btrfs_op() only only checked in two ASSERTS. Remove it and let the catchall WARN_ON in btrfs_op() deal with accidental REQ_OP_DISCARDs leaked into btrfs_op(). Last use was in a4012f06f188 ("btrfs: split discard handling out of btrfs_map_block"). Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 --- fs/btrfs/volumes.h | 3 --- 2 files changed, 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4bfec088617..c236bfba0cec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6182,8 +6182,6 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) { - ASSERT(op != BTRFS_MAP_DISCARD); - /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. 
@@ -6261,7 +6259,6 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 max_len; ASSERT(bioc_ret); - ASSERT(op != BTRFS_MAP_DISCARD); num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); if (mirror_num > num_copies) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 16fc640cabd3..e960a51abf87 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -556,15 +556,12 @@ struct btrfs_dev_lookup_args { enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, - BTRFS_MAP_DISCARD, BTRFS_MAP_GET_READ_MIRRORS, }; static inline enum btrfs_map_op btrfs_op(struct bio *bio) { switch (bio_op(bio)) { - case REQ_OP_DISCARD: - return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; From 78a213a05df3510200331bfcc232bf0278c6ed50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:35 +0200 Subject: [PATCH 122/194] btrfs: optimize simple reads in btrfsic_map_block Pass a smap into __btrfs_map_block so that the usual case of a read that doesn't require parity raid recovery doesn't need an extra memory allocation for the btrfs_io_context. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b4408037b823..fe1536700014 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1459,13 +1459,13 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_io_context *multi = NULL; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap, *map; struct btrfs_device *device; length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - bytenr, &length, &multi, mirror_num); - + ret = __btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, + NULL, &mirror_num, 0); if (ret) { block_ctx_out->start = 0; block_ctx_out->dev_bytenr = 0; @@ -1478,21 +1478,26 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } - device = multi->stripes[0].dev; + if (bioc) + map = &bioc->stripes[0]; + else + map = &smap; + + device = map->dev; if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || !device->bdev || !device->name) block_ctx_out->dev = NULL; else block_ctx_out->dev = btrfsic_dev_state_lookup( device->bdev->bd_dev); - block_ctx_out->dev_bytenr = multi->stripes[0].physical; + block_ctx_out->dev_bytenr = map->physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; block_ctx_out->datav = NULL; block_ctx_out->pagev = NULL; block_ctx_out->mem_to_free = NULL; - kfree(multi); + kfree(bioc); if (NULL == block_ctx_out->dev) { ret = -ENXIO; pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); From d69d7ffc26f10d7be3768a6762809c57f6443a43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:36 +0200 Subject: [PATCH 123/194] btrfs: remove unused btrfs_map_block There are no users of btrfs_map_block left, so remove it. 
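For reference, the caller-side pattern introduced by the previous patch in btrfsic_map_block() condenses to the sketch below (error handling elided, names taken from the hunk above): the caller keeps a btrfs_io_stripe on the stack and only touches the allocated btrfs_io_context when one was actually returned.

        struct btrfs_io_context *bioc = NULL;
        struct btrfs_io_stripe smap, *map;

        /* ... __btrfs_map_block() was called here; it either allocates a
         * bioc or leaves it NULL and fills the on-stack stripe ... */
        if (bioc)
                map = &bioc->stripes[0];
        else
                map = &smap;
        device = map->dev;

        kfree(bioc);    /* kfree(NULL) is a no-op, so no extra check is needed */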
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 -------- fs/btrfs/volumes.h | 3 --- 2 files changed, 11 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c236bfba0cec..4c6405c4ce04 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6481,14 +6481,6 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num) -{ - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, &mirror_num, 0); -} - /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index e960a51abf87..481f3ace988c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -582,9 +582,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); -int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); From cd4efd210edfb34f8cec3a0184899af751dc2d71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:37 +0200 Subject: [PATCH 124/194] btrfs: rename __btrfs_map_block to btrfs_map_block Now that the old btrfs_map_block is gone, drop the leading underscores from __btrfs_map_block. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 4 ++-- fs/btrfs/check-integrity.c | 4 ++-- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/volumes.c | 16 ++++++++-------- fs/btrfs/volumes.h | 10 +++++----- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1b4d8d60c66f..ead288fa24db 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -623,8 +623,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) int error; btrfs_bio_counter_inc_blocked(fs_info); - error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); if (error) { ret = errno_to_blk_status(error); goto fail; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index fe1536700014..3caf339c4bb3 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1464,8 +1464,8 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = __btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, - NULL, &mirror_num, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, + NULL, &mirror_num, 0); if (ret) { block_ctx_out->start = 0; block_ctx_out->dev_bytenr = 0; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index dc3f30c79320..5e86bea0a950 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -41,7 +41,7 @@ * All new writes will be written to both target and source devices, so even * if replace gets canceled, sources device still contains up-to-date data. 
* - * Location: handle_ops_on_dev_replace() from __btrfs_map_block() + * Location: handle_ops_on_dev_replace() from btrfs_map_block() * Start: btrfs_dev_replace_start() * End: btrfs_dev_replace_finishing() * Content: Latest data/metadata diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4c6405c4ce04..53059ee04f9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6232,11 +6232,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); } -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6486,7 +6486,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, + return btrfs_map_block(fs_info, op, logical, length, bioc_ret, NULL, NULL, 1); } @@ -8066,8 +8066,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); - ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, + &bioc, smap, &mirror_ret, true); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 481f3ace988c..c70805c8d895 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -585,11 +585,11 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); -int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); From 723b8bb17e2e680934f59823ab1e72c1000d88d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:38 +0200 Subject: [PATCH 125/194] btrfs: open code btrfs_map_sblock btrfs_map_sblock just hard codes three arguments and calls btrfs_map_block. Remove it as it doesn't provide any real value, but makes following the btrfs_map_block call chains harder.
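The conversion at the three remaining call sites (two in scrub, one in zoned) is mechanical, as the hunks below show; the wrapper's hard-coded arguments simply become explicit:

        /* before */
        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
                               stripe->logical, &mapped_len, &bioc);

        /* after: no smap, no mirror_num_ret, need_raid_map stays 1 */
        ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
                              stripe->logical, &mapped_len, &bioc,
                              NULL, NULL, 1);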
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +++++---- fs/btrfs/volumes.c | 9 --------- fs/btrfs/volumes.h | 3 --- fs/btrfs/zoned.c | 4 ++-- 4 files changed, 7 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 316c5a0e3a44..0cf8e2f277da 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -888,8 +888,9 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* For scrub, our mirror_num should always start at 1. */ ASSERT(stripe->mirror_num >= 1); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - stripe->logical, &mapped_len, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + stripe->logical, &mapped_len, &bioc, + NULL, NULL, 1); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -1921,8 +1922,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio->bi_end_io = raid56_scrub_wait_endio; btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL, 1); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 53059ee04f9b..6141a9fe5a28 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6481,15 +6481,6 @@ out: return ret; } -/* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret) -{ - return btrfs_map_block(fs_info, op, logical, length, bioc_ret, - NULL, NULL, 1); -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c70805c8d895..3930ee01d696 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -582,9 +582,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f1471cb5fe26..85b8b332add9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1799,8 +1799,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int nmirrors; int i, ret; - ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bioc, NULL, NULL, 1); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; From 8680e58761eba8ea3552146ea005be30dfed2948 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 06:17:39 +0200 Subject: [PATCH 126/194] btrfs: open code need_full_stripe conditions need_full_stripe is just a somewhat complicated way to say "op != BTRFS_MAP_READ". Just spell that explicit check out, which makes a lot of the code currently using the helper easier to understand. 
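Since BTRFS_MAP_DISCARD was removed earlier in the series, the map op enum only has READ, WRITE and GET_READ_MIRRORS left, which is what makes the removed helper and the open-coded test equivalent:

        /* removed */
        static bool need_full_stripe(enum btrfs_map_op op)
        {
                return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
        }

        /* call sites now state the intent directly, e.g. */
        if (op != BTRFS_MAP_READ)
                num_stripes = map->num_stripes;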
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6141a9fe5a28..8137c04f31c9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6173,11 +6173,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static bool need_full_stripe(enum btrfs_map_op op) -{ - return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); -} - static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, u64 offset, u32 *stripe_nr, u64 *stripe_offset, u64 *full_stripe_start) @@ -6290,21 +6285,21 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_index = stripe_nr % map->num_stripes; stripe_nr /= map->num_stripes; - if (!need_full_stripe(op)) + if (op == BTRFS_MAP_READ) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; - else if (mirror_num) + } else if (mirror_num) { stripe_index = mirror_num - 1; - else { + } else { stripe_index = find_live_mirror(fs_info, map, 0, dev_replace_is_ongoing); mirror_num = stripe_index + 1; } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (need_full_stripe(op)) { + if (op != BTRFS_MAP_READ) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -6318,7 +6313,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = (stripe_nr % factor) * map->sub_stripes; stripe_nr /= factor; - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -6331,7 +6326,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { + if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr @@ -6366,7 +6361,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (!need_full_stripe(op) && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num <= 1) mirror_num = 1; } } else { @@ -6406,7 +6401,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ if (smap && num_alloc_stripes == 1 && !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (!need_full_stripe(op) || !dev_replace_is_ongoing || + (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); *mirror_num_ret = mirror_num; @@ -6430,7 +6425,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. 
*/ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && - (need_full_stripe(op) || mirror_num > 1)) { + (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6457,11 +6452,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } - if (need_full_stripe(op)) + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && - need_full_stripe(op)) { + op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, &num_stripes, &max_errors); } From 8ab546bb30bdf975e5ca6024ec6c4f7d324cb11a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 22 May 2023 16:51:10 +0200 Subject: [PATCH 127/194] btrfs: disable allocation warnings for compression workspaces The workspaces for compression are typically much larger than a page and for high zstd levels in the range of megabytes. There's a fallback to vmalloc but this can still fail (see the report). Some of the workspaces are preallocated at module load time so we have a safe fallback, otherwise when a new workspace is needed it's allocated but if this fails then the process waits. Which means the warning is only causing noise and we can use the GFP flag to disable it. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=217466 Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 6 +++--- fs/btrfs/zlib.c | 2 +- fs/btrfs/zstd.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 3a095b9c6373..d3fcfc628a4f 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -88,9 +88,9 @@ struct list_head *lzo_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL); - workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL); + workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); + workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); + workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL | __GFP_NOWARN); if (!workspace->mem || !workspace->buf || !workspace->cbuf) goto fail; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8acb05e176c5..6c231a116a29 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -63,7 +63,7 @@ struct list_head *zlib_alloc_workspace(unsigned int level) workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), zlib_inflate_workspacesize()); - workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL); + workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL | __GFP_NOWARN); workspace->level = level; workspace->buf = NULL; /* diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index f798da267590..e7ac4ec809a4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -356,7 +356,7 @@ struct list_head *zstd_alloc_workspace(unsigned int level) workspace->level = level; workspace->req_level = level; workspace->last_used = jiffies; - workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; From 95c8e349d8e8f190e28854e7ca96de866d2dc5a4 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 1 Jun 2023 11:55:13 -0700 Subject: [PATCH 128/194] btrfs: warn on invalid slot in tree 
mod log rewind The way that tree mod log tracks the ultimate length of the eb, the variable 'n', eventually turns up the correct value, but at intermediate steps during the rewind, n can be inaccurate as a representation of the end of the eb. For example, it doesn't get updated on move rewinds, and it does get updated for add/remove in the middle of the eb. To detect cases with invalid moves, introduce a separate variable called max_slot which tries to track the maximum valid slot in the rewind eb. We can then warn if we do a move whose src range goes beyond the max valid slot. There is a commented caveat that it is possible to have this value be an overestimate due to the challenge of properly handling 'add' operations in the middle of the eb, but in practice it doesn't cause enough of a problem to throw out the max idea in favor of tracking every valid slot. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/tree-mod-log.c | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index a555baa0143a..39545d1d2e9a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -664,10 +664,27 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, unsigned long o_dst; unsigned long o_src; unsigned long p_size = sizeof(struct btrfs_key_ptr); + /* + * max_slot tracks the maximum valid slot of the rewind eb at every + * step of the rewind. This is in contrast with 'n' which eventually + * matches the number of items, but can be wrong during moves or if + * removes overlap on already valid slots (which is probably separately + * a bug). We do this to validate the offsets of memmoves for rewinding + * moves and detect invalid memmoves. + * + * Since a rewind eb can start empty, max_slot is a signed integer with + * a special meaning for -1, which is that no slot is valid to move out + * of. Any other negative value is invalid. + */ + int max_slot; + int move_src_end_slot; + int move_dst_end_slot; n = btrfs_header_nritems(eb); + max_slot = n - 1; read_lock(&fs_info->tree_mod_log_lock); while (tm && tm->seq >= time_seq) { + ASSERT(max_slot >= -1); /* * All the operations are recorded with the operator used for * the modification. As we're going backwards, we do the @@ -684,6 +701,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, btrfs_set_node_ptr_generation(eb, tm->slot, tm->generation); n++; + if (tm->slot > max_slot) + max_slot = tm->slot; break; case BTRFS_MOD_LOG_KEY_REPLACE: BUG_ON(tm->slot >= n); @@ -693,14 +712,37 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, tm->generation); break; case BTRFS_MOD_LOG_KEY_ADD: + /* + * It is possible we could have already removed keys + * behind the known max slot, so this will be an + * overestimate. In practice, the copy operation + * inserts them in increasing order, and overestimating + * just means we miss some warnings, so it's OK. It + * isn't worth carefully tracking the full array of + * valid slots to check against when moving. 
+ */ + if (tm->slot == max_slot) + max_slot--; /* if a move operation is needed it's in the log */ n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: + ASSERT(tm->move.nr_items > 0); + move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1; + move_dst_end_slot = tm->slot + tm->move.nr_items - 1; o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); + if (WARN_ON(move_src_end_slot > max_slot || + tm->move.nr_items <= 0)) { + btrfs_warn(fs_info, +"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d", + eb->start, tm->slot, + tm->move.dst_slot, tm->move.nr_items, + tm->seq, n, max_slot); + } memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); + max_slot = move_dst_end_slot; break; case BTRFS_MOD_LOG_ROOT_REPLACE: /* From 5cead5422a0e3d13b0bcee986c0f5c4ebb94100b Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 1 Jun 2023 11:55:14 -0700 Subject: [PATCH 129/194] btrfs: insert tree mod log move in push_node_left There is a fairly unlikely race condition in tree mod log rewind that can result in a kernel panic which has the following trace: [530.569] BTRFS critical (device sda3): unable to find logical 0 length 4096 [530.585] BTRFS critical (device sda3): unable to find logical 0 length 4096 [530.602] BUG: kernel NULL pointer dereference, address: 0000000000000002 [530.618] #PF: supervisor read access in kernel mode [530.629] #PF: error_code(0x0000) - not-present page [530.641] PGD 0 P4D 0 [530.647] Oops: 0000 [#1] SMP [530.654] CPU: 30 PID: 398973 Comm: below Kdump: loaded Tainted: G S O K 5.12.0-0_fbk13_clang_7455_gb24de3bdb045 #1 [530.680] Hardware name: Quanta Mono Lake-M.2 SATA 1HY9U9Z001G/Mono Lake-M.2 SATA, BIOS F20_3A15 08/16/2017 [530.703] RIP: 0010:__btrfs_map_block+0xaa/0xd00 [530.755] RSP: 0018:ffffc9002c2f7600 EFLAGS: 00010246 [530.767] RAX: ffffffffffffffea RBX: ffff888292e41000 RCX: f2702d8b8be15100 [530.784] RDX: ffff88885fda6fb8 RSI: ffff88885fd973c8 RDI: ffff88885fd973c8 [530.800] RBP: ffff888292e410d0 R08: ffffffff82fd7fd0 R09: 00000000fffeffff [530.816] R10: ffffffff82e57fd0 R11: ffffffff82e57d70 R12: 0000000000000000 [530.832] R13: 0000000000001000 R14: 0000000000001000 R15: ffffc9002c2f76f0 [530.848] FS: 00007f38d64af000(0000) GS:ffff88885fd80000(0000) knlGS:0000000000000000 [530.866] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [530.880] CR2: 0000000000000002 CR3: 00000002b6770004 CR4: 00000000003706e0 [530.896] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [530.912] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [530.928] Call Trace: [530.934] ? btrfs_printk+0x13b/0x18c [530.943] ? btrfs_bio_counter_inc_blocked+0x3d/0x130 [530.955] btrfs_map_bio+0x75/0x330 [530.963] ? kmem_cache_alloc+0x12a/0x2d0 [530.973] ? btrfs_submit_metadata_bio+0x63/0x100 [530.984] btrfs_submit_metadata_bio+0xa4/0x100 [530.995] submit_extent_page+0x30f/0x360 [531.004] read_extent_buffer_pages+0x49e/0x6d0 [531.015] ? submit_extent_page+0x360/0x360 [531.025] btree_read_extent_buffer_pages+0x5f/0x150 [531.037] read_tree_block+0x37/0x60 [531.046] read_block_for_search+0x18b/0x410 [531.056] btrfs_search_old_slot+0x198/0x2f0 [531.066] resolve_indirect_ref+0xfe/0x6f0 [531.076] ? ulist_alloc+0x31/0x60 [531.084] ? kmem_cache_alloc_trace+0x12e/0x2b0 [531.095] find_parent_nodes+0x720/0x1830 [531.105] ? ulist_alloc+0x10/0x60 [531.113] iterate_extent_inodes+0xea/0x370 [531.123] ? btrfs_previous_extent_item+0x8f/0x110 [531.134] ? 
btrfs_search_path_in_tree+0x240/0x240 [531.146] iterate_inodes_from_logical+0x98/0xd0 [531.157] ? btrfs_search_path_in_tree+0x240/0x240 [531.168] btrfs_ioctl_logical_to_ino+0xd9/0x180 [531.179] btrfs_ioctl+0xe2/0x2eb0 This occurs when logical inode resolution takes a tree mod log sequence number, and then while backref walking hits a rewind on a busy node which has the following sequence of tree mod log operations (numbers filled in from a specific example, but they are somewhat arbitrary) REMOVE_WHILE_FREEING slot 532 REMOVE_WHILE_FREEING slot 531 REMOVE_WHILE_FREEING slot 530 ... REMOVE_WHILE_FREEING slot 0 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 ADD slot 455 ADD slot 454 ADD slot 453 ... ADD slot 0 MOVE src slot 0 -> dst slot 456 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 When this sequence gets applied via btrfs_tree_mod_log_rewind, it allocates a fresh rewind eb, and first inserts the correct key info for the 533 elements, then overwrites the first 456 of them, then decrements the count by 456 via the add ops, then rewinds the move by doing a memmove from 456:988->0:532. We have never written anything past 532, so that memmove writes garbage into the 0:532 range. In practice, this results in a lot of fully 0 keys. The rewind then puts valid keys into slots 0:455 with the last removes, but 456:532 are still invalid. When search_old_slot uses this eb, if it uses one of those invalid slots, it can then read the extent buffer and issue a bio for offset 0 which ultimately panics looking up extent mappings. This bad tree mod log sequence gets generated when the node balancing code happens to do a balance_node_right followed by a push_node_left while logging in the tree mod log. Illustrated for ebs L and R (left and right): L R start: [XXX|YYY|...] [ZZZ|...|...] balance_node_right: [XXX|YYY|...] [...|ZZZ|...] move Z to make room for Y [XXX|...|...] [YYY|ZZZ|...] copy Y from L to R push_node_left: [XXX|YYY|...] [...|ZZZ|...] copy Y from R to L [XXX|YYY|...] [ZZZ|...|...] move Z into emptied space (NOT LOGGED!) This is because balance_node_right logs a move, but push_node_left explicitly doesn't. That is because logging the move would remove the overwritten src < dst range in the right eb, which was already logged when we called btrfs_tree_mod_log_eb_copy. The correct sequence would include a move from 456:988 to 0:532 after remove 0:455 and before removing 0:532. Reversing that sequence would entail creating keys for 0:532, then moving those keys out to 456:988, then creating more keys for 0:455. i.e., REMOVE_WHILE_FREEING slot 532 REMOVE_WHILE_FREEING slot 531 REMOVE_WHILE_FREEING slot 530 ... REMOVE_WHILE_FREEING slot 0 MOVE src slot 456 -> dst slot 0 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 ADD slot 455 ADD slot 454 ADD slot 453 ... ADD slot 0 MOVE src slot 0 -> dst slot 456 nritems 533 REMOVE slot 455 REMOVE slot 454 REMOVE slot 453 ... REMOVE slot 0 Fix this to log the move but avoid the double remove by putting all the logging logic in btrfs_tree_mod_log_eb_copy which has enough information to detect these cases and properly log moves, removes, and adds. Leave btrfs_tree_mod_log_insert_move to handle insert_ptr and delete_ptr's tree mod logging. (Un)fortunately, this is quite difficult to reproduce, and I was only able to reproduce it by adding sleeps in btrfs_search_old_slot that would encourage more log rewinding during ino_to_logical ioctls. 
I was able to hit the warning in the previous patch in the series without the fix quite quickly, but not after this patch. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++--- fs/btrfs/tree-mod-log.c | 73 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2f2071d64c52..385524224037 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2785,8 +2785,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call btrfs_tree_mod_log_insert_move() here, key removal - * was already fully logged by btrfs_tree_mod_log_eb_copy() above. + * btrfs_tree_mod_log_eb_copy handles logging the move, so we + * don't need to do an explicit tree mod log operation for it. */ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), btrfs_node_key_ptr_offset(src, push_items), @@ -2847,8 +2847,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); - BUG_ON(ret < 0); + + /* + * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't + * need to do an explicit tree mod log operation for it. + */ memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 39545d1d2e9a..07c086f9e35e 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -248,6 +248,26 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, return ret; } +static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) + return ERR_PTR(-ENOMEM); + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + RB_CLEAR_NODE(&tm->node); + + return tm; +} + int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, int dst_slot, int src_slot, int nr_items) @@ -265,18 +285,13 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; + tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); + if (IS_ERR(tm)) { + ret = PTR_ERR(tm); + tm = NULL; goto free_tms; } - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = BTRFS_MOD_LOG_MOVE_KEYS; - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); @@ -489,6 +504,10 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, struct tree_mod_elem **tm_list_add, **tm_list_rem; int i; bool locked = false; + struct tree_mod_elem *dst_move_tm = NULL; + struct tree_mod_elem *src_move_tm = NULL; + u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset; + u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items); if (!tree_mod_need_log(fs_info, NULL)) return 0; @@ -501,6 +520,26 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (!tm_list) return -ENOMEM; + if (dst_move_nr_items) { + dst_move_tm = 
tree_mod_log_alloc_move(dst, dst_offset + nr_items, + dst_offset, dst_move_nr_items); + if (IS_ERR(dst_move_tm)) { + ret = PTR_ERR(dst_move_tm); + dst_move_tm = NULL; + goto free_tms; + } + } + if (src_move_nr_items) { + src_move_tm = tree_mod_log_alloc_move(src, src_offset, + src_offset + nr_items, + src_move_nr_items); + if (IS_ERR(src_move_tm)) { + ret = PTR_ERR(src_move_tm); + src_move_tm = NULL; + goto free_tms; + } + } + tm_list_add = tm_list; tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { @@ -523,6 +562,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, goto free_tms; locked = true; + if (dst_move_tm) { + ret = tree_mod_log_insert(fs_info, dst_move_tm); + if (ret) + goto free_tms; + } for (i = 0; i < nr_items; i++) { ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); if (ret) @@ -531,6 +575,11 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (ret) goto free_tms; } + if (src_move_tm) { + ret = tree_mod_log_insert(fs_info, src_move_tm); + if (ret) + goto free_tms; + } write_unlock(&fs_info->tree_mod_log_lock); kfree(tm_list); @@ -538,6 +587,12 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, return 0; free_tms: + if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node)) + rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log); + kfree(dst_move_tm); + if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) + rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); + kfree(src_move_tm); for (i = 0; i < nr_items * 2; i++) { if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); From 36614a3beba33a05ad78d4dcb9aa1d00e8a7d01f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:50 +0200 Subject: [PATCH 130/194] btrfs: fix range_end calculation in extent_write_locked_range The range_end field in struct writeback_control is inclusive, just like the end parameter passed to extent_write_locked_range. Not doing this could cause extra writeout, which is harmless but suboptimal. Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads") CC: stable@vger.kernel.org # 5.9+ Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d5b3b15f7ab2..0726c82db309 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2310,7 +2310,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) struct writeback_control wbc_writepages = { .sync_mode = WB_SYNC_ALL, .range_start = start, - .range_end = end + 1, + .range_end = end, .no_cgroup_owner = 1, }; struct btrfs_bio_ctrl bio_ctrl = { From ed9ee98ecb4fdbdfe043ee3eec0a65c0745d8669 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:51 +0200 Subject: [PATCH 131/194] btrfs: factor out a btrfs_verify_page helper Split all the conditionals for the fsverity calls in end_page_read into a btrfs_verify_page helper to keep the code readable and make additional refactoring easier. 
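The extracted helper keeps the exact conditions of the old open-coded check (reproduced from the hunk below); the PageError() test and the error-bit handling around it are then cleaned up by the following patches:

        static bool btrfs_verify_page(struct page *page, u64 start)
        {
                if (!fsverity_active(page->mapping->host) ||
                    PageError(page) || PageUptodate(page) ||
                    start >= i_size_read(page->mapping->host))
                        return true;
                return fsverity_verify_page(page);
        }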
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0726c82db309..8e42ce48b52e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -481,6 +481,15 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, start, end, page_ops, NULL); } +static bool btrfs_verify_page(struct page *page, u64 start) +{ + if (!fsverity_active(page->mapping->host) || + PageError(page) || PageUptodate(page) || + start >= i_size_read(page->mapping->host)) + return true; + return fsverity_verify_page(page); +} + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -489,11 +498,7 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) start + len <= page_offset(page) + PAGE_SIZE); if (uptodate) { - if (fsverity_active(page->mapping->host) && - !PageError(page) && - !PageUptodate(page) && - start < i_size_read(page->mapping->host) && - !fsverity_verify_page(page)) { + if (!btrfs_verify_page(page, start)) { btrfs_page_set_error(fs_info, page, start, len); } else { btrfs_page_set_uptodate(fs_info, page, start, len); From 2c14f0ffdd30bd3d321ad5fe76fcf701746e1df6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:52 +0200 Subject: [PATCH 132/194] btrfs: fix fsverify read error handling in end_page_read Also clear the uptodate bit to make sure the page isn't seen as uptodate in the page cache if fsverity verification fails. Fixes: 146054090b08 ("btrfs: initial fsverity support") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8e42ce48b52e..a943a6622489 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -497,12 +497,8 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); - if (uptodate) { - if (!btrfs_verify_page(page, start)) { - btrfs_page_set_error(fs_info, page, start, len); - } else { - btrfs_page_set_uptodate(fs_info, page, start, len); + if (uptodate && btrfs_verify_page(page, start)) { + btrfs_page_set_uptodate(fs_info, page, start, len); } else { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); From 57201dddd6f820de9ff94472da9b9cda88973c8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:53 +0200 Subject: [PATCH 133/194] btrfs: don't check PageError in btrfs_verify_page btrfs_verify_page is called from the readpage completion handler, which is only used to read pages, or parts of pages that aren't uptodate yet. The only case where PageError could be set on a page in btrfs is if we had a previous writeback error, but in that case we won't call readpage on it, as it has previously been marked uptodate.
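Combined with the previous fix, end_page_read() now only marks the range uptodate when verification succeeds, and a verification failure leaves the range !uptodate (the subpage error bit still set here is removed later in the series):

        if (uptodate && btrfs_verify_page(page, start)) {
                btrfs_page_set_uptodate(fs_info, page, start, len);
        } else {
                btrfs_page_clear_uptodate(fs_info, page, start, len);
                btrfs_page_set_error(fs_info, page, start, len);
        }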
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a943a6622489..178e8230c28a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -484,7 +484,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, static bool btrfs_verify_page(struct page *page, u64 start) { if (!fsverity_active(page->mapping->host) || - PageError(page) || PageUptodate(page) || + PageUptodate(page) || start >= i_size_read(page->mapping->host)) return true; return fsverity_verify_page(page); From 973fb26e81a9f59dfe45b079a698c0e238f7bf69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:54 +0200 Subject: [PATCH 134/194] btrfs: don't fail writeback when allocating the compression context fails If cow_file_range_async fails to allocate the asynchronous writeback context, it currently returns an error and entirely fails the writeback. This is not a good idea as a writeback failure is a non-temporary error condition that will make the file system unusable. Just fall back to synchronous uncompressed writeback instead. This requires us to delay setting the BTRFS_INODE_HAS_ASYNC_EXTENT flag until we've committed to the async writeback. The compression checks INODE_NOCOMPRESS and FORCE_COMPRESS are moved from cow_file_range_async to the preceding checks btrfs_run_delalloc_range(). Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 74 ++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8c64882a1e7..2c7d726067e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1720,58 +1720,36 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_cow); } -static int cow_file_range_async(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) +static bool cow_file_range_async(struct btrfs_inode *inode, + struct writeback_control *wbc, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; - u64 cur_end; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; - bool should_compress; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); - unlock_extent(&inode->io_tree, start, end, NULL); - - if (inode->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(fs_info, FORCE_COMPRESS)) { - num_chunks = 1; - should_compress = false; - } else { - should_compress = true; - } - nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); + if (!ctx) + return false; - if (!ctx) { - unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | - EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR; - - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits, page_ops); - return -ENOMEM; - } + unlock_extent(&inode->io_tree, start, end, 
NULL); + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { - if (should_compress) - cur_end = min(end, start + SZ_512K - 1); - else - cur_end = end; + u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the @@ -1832,7 +1810,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, start = cur_end + 1; } *page_started = 1; - return 0; + return true; } static noinline int run_delalloc_zoned(struct btrfs_inode *inode, @@ -2413,7 +2391,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc) { - int ret; + int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); /* @@ -2434,19 +2412,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); - } else if (!btrfs_inode_can_compress(inode) || - !inode_need_compress(inode, start, end)) { - if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written); - else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); - } else { - set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); - ret = cow_file_range_async(inode, wbc, locked_page, start, end, - page_started, nr_written); + goto out; } + + if (btrfs_inode_can_compress(inode) && + inode_need_compress(inode, start, end) && + cow_file_range_async(inode, wbc, locked_page, start, + end, page_started, nr_written)) + goto out; + + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1, NULL); + +out: ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, From bb7b05fe1b51e2ae529cbbd4c582ec83503f121a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:55 +0200 Subject: [PATCH 135/194] btrfs: rename cow_file_range_async to run_delalloc_compressed cow_file_range_async is only used for compressed writeback. Rename it to run_delalloc_compressed, which also fits in with run_delalloc_nocow and run_delalloc_zoned. 
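After the rename, the dispatch in btrfs_run_delalloc_range() reads as a plain list of delalloc strategies. Assembled from this and the previous patch, the non-nocow part is roughly:

        if (btrfs_inode_can_compress(inode) &&
            inode_need_compress(inode, start, end) &&
            run_delalloc_compressed(inode, wbc, locked_page, start,
                                    end, page_started, nr_written))
                goto out;

        /* compression not wanted, or its async context allocation failed */
        if (zoned)
                ret = run_delalloc_zoned(inode, locked_page, start, end,
                                         page_started, nr_written);
        else
                ret = cow_file_range(inode, locked_page, start, end,
                                     page_started, nr_written, 1, NULL);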
Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2c7d726067e5..4633ccacefac 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1693,7 +1693,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to * always adjust ->async_delalloc_pages as its paired with the init - * happening in cow_file_range_async + * happening in run_delalloc_compressed */ if (async_chunk->inode) submit_compressed_extents(async_chunk); @@ -1720,11 +1720,11 @@ static noinline void async_cow_free(struct btrfs_work *work) kvfree(async_cow); } -static bool cow_file_range_async(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) +static bool run_delalloc_compressed(struct btrfs_inode *inode, + struct writeback_control *wbc, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -2417,8 +2417,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - cow_file_range_async(inode, wbc, locked_page, start, - end, page_started, nr_written)) + run_delalloc_compressed(inode, wbc, locked_page, start, + end, page_started, nr_written)) goto out; From 3e92499e3b004baffb479d61e191b41b604ece9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:56 +0200 Subject: [PATCH 136/194] btrfs: don't check PageError in __extent_writepage __extent_writepage currently sets PageError whenever any error happens, and also checks for PageError to decide whether to call error handling. This leads to very unclear responsibility for cleaning up on errors. In the VM and generic writeback helpers the basic idea is that once I/O is fired off all error handling responsibility is delegated to the end I/O handler. But if that end I/O handler sets the PageError bit, and the submitter checks it, the bit could in some cases leak into the submission context for fast enough I/O. Fix this by simply not checking PageError and just using the local ret variable to check for submission errors. This also fundamentally solves the long-standing problem documented in a comment in __extent_writepage by never leaking the error bit into the submission context.
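The code change itself is a one-line test swap at the end of __extent_writepage(), which also lets the long comment justifying the old check go away:

        /* was: if (PageError(page)) */
        if (ret)
                end_extent_writepage(page, ret, page_start, page_end);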
- * - * For regular page size, we never submit current page, but only add - * current page to current bio. - * The bio submission can only happen in next page. - * Thus if we hit the PageError() branch, @ret is already set to - * non-zero value and will not get updated for regular sectorsize. - * - * But for subpage case, it's possible we submit part of current page, - * thus can get PageError() set by submitted bio of the same page, - * while our @ret is still 0. - * - * So here we unify the behavior and don't set @ret. - * Error can still be properly passed to higher layer as page will - * be set error, here we just don't handle the IO failure. - * - * NOTE: This is just a hotfix for subpage. - * The root fix will be properly ending ordered extent when we hit - * an error during writeback. - * - * But that needs a bigger refactoring, as we not only need to grab the - * submitted OE, but also need to know exactly at which bytenr we hit - * the error. - * Currently the full page based __extent_writepage_io() is not - * capable of that. - */ - if (PageError(page)) + if (ret) end_extent_writepage(page, ret, page_start, page_end); if (bio_ctrl->extent_locked) { struct writeback_control *wbc = bio_ctrl->wbc; From 2b2553f12355577d8de3eaedfa25fce3368d652c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:57 +0200 Subject: [PATCH 137/194] btrfs: stop setting PageError in the data I/O path PageError is not used by the VFS/MM and deprecated because it uses up a page bit and has no coherent rules. Instead read errors are usually propagated by not setting or clearing the uptodate bit, and write errors are propagated through the address_space. Btrfs now only sets the flag and never clears it for data pages, so just remove all places setting it, and the subpage error bit. Note that the error propagation for superblock writes that work on the block device mapping still uses PageError for now, but that will be addressed in a separate series. 
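With PageError out of the picture, a failed data write is reported the same way the rest of the kernel does it: the range loses its uptodate state and the error is recorded on the address_space, as in end_extent_writepage() below (condensed from the hunk):

        btrfs_page_clear_uptodate(fs_info, page, start, len);
        ret = err < 0 ? err : -EIO;
        mapping_set_error(page->mapping, ret);  /* reported at fsync/close time */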
Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 -- fs/btrfs/extent_io.c | 24 +++++------------------- fs/btrfs/inode.c | 3 --- fs/btrfs/subpage.c | 35 ----------------------------------- fs/btrfs/subpage.h | 10 ++++------ 5 files changed, 9 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 04cd5de4f00f..bb2d1a4ceca1 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -211,8 +211,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) for (i = 0; i < ret; i++) { struct folio *folio = fbatch.folios[i]; - if (errno) - folio_set_error(folio); btrfs_page_clamp_clear_writeback(fs_info, &folio->page, cb->start, cb->len); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33e5f0a31c21..b7f26a4bb663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -223,8 +223,6 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_SET_ORDERED) btrfs_page_clamp_set_ordered(fs_info, page, start, len); - if (page_ops & PAGE_SET_ERROR) - btrfs_page_clamp_set_error(fs_info, page, start, len); if (page_ops & PAGE_START_WRITEBACK) { btrfs_page_clamp_clear_dirty(fs_info, page, start, len); btrfs_page_clamp_set_writeback(fs_info, page, start, len); @@ -497,12 +495,10 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); - if (uptodate && btrfs_verify_page(page, start)) { + if (uptodate && btrfs_verify_page(page, start)) btrfs_page_set_uptodate(fs_info, page, start, len); - } else { + else btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); @@ -530,7 +526,6 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) len = end + 1 - start; btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? 
err : -EIO; mapping_set_error(page->mapping, ret); } @@ -1059,7 +1054,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { unlock_extent(tree, start, end, NULL); - btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); unlock_page(page); return ret; } @@ -1263,11 +1257,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written, wbc); - if (ret) { - btrfs_page_set_error(inode->root->fs_info, page, - page_offset(page), PAGE_SIZE); + if (ret) return ret; - } + /* * delalloc_end is already one less than the total length, so * we don't subtract one from PAGE_SIZE @@ -1420,7 +1412,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR(em)) { - btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); goto out_error; } @@ -1519,9 +1510,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl WARN_ON(!PageLocked(page)); - btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, - page_offset(page), PAGE_SIZE); - pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { @@ -1534,10 +1522,8 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); ret = set_page_extent_mapped(page); - if (ret < 0) { - SetPageError(page); + if (ret < 0) goto done; - } if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4633ccacefac..14e9aa91293e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1153,8 +1153,6 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, const u64 page_start = page_offset(locked_page); const u64 page_end = page_start + PAGE_SIZE - 1; - btrfs_page_set_error(inode->root->fs_info, locked_page, - page_start, PAGE_SIZE); set_page_writeback(locked_page); end_page_writeback(locked_page); end_extent_writepage(locked_page, ret, page_start, page_end); @@ -2942,7 +2940,6 @@ out_page: mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); clear_page_dirty_for_io(page); - SetPageError(page); } btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 74bc43040531..1b999c6e4193 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -100,9 +100,6 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector subpage_info->uptodate_offset = cur; cur += nr_bits; - subpage_info->error_offset = cur; - cur += nr_bits; - subpage_info->dirty_offset = cur; cur += nr_bits; @@ -416,35 +413,6 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } -void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageError(page); - 
spin_unlock_irqrestore(&subpage->lock, flags); -} - -void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) -{ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, - error, start, len); - unsigned long flags; - - spin_lock_irqsave(&subpage->lock, flags); - bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) - ClearPageError(page); - spin_unlock_irqrestore(&subpage->lock, flags); -} - void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { @@ -606,7 +574,6 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); -IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); @@ -674,7 +641,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, @@ -769,7 +735,6 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, subpage_info, error, &error_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, dirty, &dirty_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, writeback, &writeback_bitmap); GET_SUBPAGE_BITMAP(subpage, subpage_info, ordered, &ordered_bitmap); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5905caea8409..5cbf67ccbdeb 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -8,17 +8,17 @@ /* * Extra info for subpapge bitmap. * - * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into + * For subpage we pack all uptodate/dirty/writeback/ordered bitmaps into * one larger bitmap. * * This structure records how they are organized in the bitmap: * - * /- uptodate_offset /- error_offset /- dirty_offset + * /- uptodate_offset /- dirty_offset /- ordered_offset * | | | * v v v - * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |u|u|u|u|........|u|u|d|d|.......|d|d|o|o|.......|o|o| * |<- bitmap_nr_bits ->| - * |<--------------- total_nr_bits ---------------->| + * |<----------------- total_nr_bits ------------------>| */ struct btrfs_subpage_info { /* Number of bits for each bitmap */ @@ -32,7 +32,6 @@ struct btrfs_subpage_info { * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. 
*/ unsigned int uptodate_offset; - unsigned int error_offset; unsigned int dirty_offset; unsigned int writeback_offset; unsigned int ordered_offset; @@ -141,7 +140,6 @@ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); -DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); From a994310aa26fcb6bd7529714ea713b6792630f67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:58 +0200 Subject: [PATCH 138/194] btrfs: remove PAGE_SET_ERROR Now that the btrfs writeback code has stopped using PageError, using PAGE_SET_ERROR to just set the per-address_space error flag is confusing. Open code the mapping_set_error calls in the callers and remove the PAGE_SET_ERROR flag. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 --- fs/btrfs/extent_io.h | 1 - fs/btrfs/inode.c | 11 ++++++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b7f26a4bb663..09a9973c27cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -268,9 +268,6 @@ static int __process_pages_contig(struct address_space *mapping, ASSERT(processed_end && *processed_end == start); } - if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index) - mapping_set_error(mapping, -EIO); - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2d91ca91dca5..6723bf3483d9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -40,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_SET_ERROR), ENUM_BIT(PAGE_LOCK), }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 14e9aa91293e..a40a6002a198 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -835,6 +835,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) { struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -949,7 +950,7 @@ again: /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->vfs_inode.i_mapping, start, + mapping, start, pages, &nr_pages, &total_in, @@ -992,9 +993,9 @@ cont: unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_error_op; - page_error_op = ret < 0 ? 
PAGE_SET_ERROR : 0; + if (ret < 0) + mapping_set_error(mapping, -EIO); /* * inline extent creation worked or returned error, @@ -1011,7 +1012,6 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | - page_error_op | PAGE_END_WRITEBACK); /* @@ -1271,12 +1271,13 @@ out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK | PAGE_SET_ERROR); + PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); goto done; } From f22b5dcbd71edea087946511554956591557de9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:04:59 +0200 Subject: [PATCH 139/194] btrfs: remove non-standard extent handling in __extent_writepage_io __extent_writepage_io is never called for compressed or inline extents, or holes. Remove the not quite working code for them and replace it with asserts that these cases don't happen. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 09a9973c27cc..a2e1dbd9b923 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1361,7 +1361,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct extent_map *em; int ret = 0; int nr = 0; - bool compressed; ret = btrfs_writepage_cow_fixup(page); if (ret) { @@ -1419,10 +1418,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, ASSERT(cur < end); ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); + block_start = em->block_start; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(block_start != EXTENT_MAP_HOLE); + ASSERT(block_start != EXTENT_MAP_INLINE); + /* * Note that em_end from extent_map_end() and dirty_range_end from * find_next_dirty_byte() are all exclusive @@ -1431,22 +1434,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, free_extent_map(em); em = NULL; - /* - * compressed and inline extents are written through other - * paths in the FS - */ - if (compressed || block_start == EXTENT_MAP_HOLE || - block_start == EXTENT_MAP_INLINE) { - if (compressed) - nr++; - else - btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, true); - btrfs_page_clear_dirty(fs_info, page, cur, iosize); - cur += iosize; - continue; - } - btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, From 9ecdbee819ca90589bf2efccfb90d5dabc0de057 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:05:00 +0200 Subject: [PATCH 140/194] btrfs: move writeback_control::nr_to_write update to __extent_writepage Move the nr_to_write accounting from __extent_writepage_io to __extent_writepage, as we'll soon grow another caller of __extent_writepage_io that doesn't want this accounting.
Also drop the obsolete comment - decrementing a counter in the on-stack writeback_control data structure doesn't need the page lock. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a2e1dbd9b923..f773ee12ce1c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1370,12 +1370,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - /* - * we don't want to touch the inode after unlocking the page, - * so we update the mapping writeback index now - */ - bio_ctrl->wbc->nr_to_write--; - bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; @@ -1521,6 +1515,8 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl if (ret == 1) return 0; + bio_ctrl->wbc->nr_to_write--; + done: if (nr == 0) { /* make sure the mapping tag for page dirty gets cleared */ From eb34dceace983e304e00d4bf711cec0a603959ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:05:01 +0200 Subject: [PATCH 141/194] btrfs: only call __extent_writepage_io from extent_write_locked_range __extent_writepage does a lot of things that make no sense for extent_write_locked_range, given that extent_write_locked_range itself is called from __extent_writepage either directly or through a workqueue, and all this work has already been done in the first invocation and the pages haven't been unlocked since. Call __extent_writepage_io directly instead and open code the logic tracked in btrfs_bio_ctrl::extent_locked. Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 65 ++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f773ee12ce1c..1da247e753b0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -103,12 +103,6 @@ struct btrfs_bio_ctrl { blk_opf_t opf; btrfs_bio_end_io_t end_io_func; struct writeback_control *wbc; - - /* - * Tell writepage not to lock the state bits for this range, it still - * does the unlocking. 
- */ - bool extent_locked; }; static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) @@ -1475,7 +1469,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); const u64 page_end = page_start + PAGE_SIZE - 1; int ret; @@ -1503,13 +1496,11 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl if (ret < 0) goto done; - if (!bio_ctrl->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); - if (ret == 1) - return 0; - if (ret) - goto done; - } + ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); if (ret == 1) @@ -1525,21 +1516,7 @@ done: } if (ret) end_extent_writepage(page, ret, page_start, page_end); - if (bio_ctrl->extent_locked) { - struct writeback_control *wbc = bio_ctrl->wbc; - - /* - * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), - * the page can either be locked by lock_page() or - * process_one_page(). - * Let btrfs_page_unlock_writer() handle both cases. - */ - ASSERT(wbc); - btrfs_page_unlock_writer(fs_info, page, wbc->range_start, - wbc->range_end + 1 - wbc->range_start); - } else { - unlock_page(page); - } + unlock_page(page); ASSERT(ret <= 0); return ret; } @@ -2239,10 +2216,10 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + loff_t i_size = i_size_read(inode); u64 cur = start; - unsigned long nr_pages; - const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; struct writeback_control wbc_writepages = { .sync_mode = WB_SYNC_ALL, .range_start = start, @@ -2254,17 +2231,15 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) /* We're called from an async helper function */ .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | wbc_to_write_flags(&wbc_writepages), - .extent_locked = 1, }; ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); - nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> - PAGE_SHIFT; - wbc_writepages.nr_to_write = nr_pages * 2; wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + struct page *page; + int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); /* @@ -2275,12 +2250,25 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) ASSERT(PageLocked(page)); ASSERT(PageDirty(page)); clear_page_dirty_for_io(page); - ret = __extent_writepage(page, &bio_ctrl); - ASSERT(ret <= 0); + + ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, + i_size, &nr); + if (ret == 1) + goto next_page; + + /* Make sure the mapping tag for page dirty gets cleared. 
*/ + if (nr == 0) { + set_page_writeback(page); + end_page_writeback(page); + } + if (ret) + end_extent_writepage(page, ret, cur, cur_end); + btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); if (ret < 0) { found_error = true; first_error = ret; } +next_page: put_page(page); cur = cur_end + 1; } @@ -2301,7 +2289,6 @@ int extent_writepages(struct address_space *mapping, struct btrfs_bio_ctrl bio_ctrl = { .wbc = wbc, .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), - .extent_locked = 0, }; /* From 7027f87108ce3d23ea542e1a9a79056530db4a87 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 08:05:02 +0200 Subject: [PATCH 142/194] btrfs: don't treat zoned writeback as being from an async helper thread When extent_write_locked_range was originally added, it was only used writing back compressed pages from an async helper thread. But it is now also used for writing back pages on zoned devices, where it is called directly from the ->writepage context. In this case we want to be able to pass on the writeback_control instead of creating a new one, and more importantly want to use all the normal cgroup interaction instead of potentially deferring writeback to another helper. Fixes: 898793d992c2 ("btrfs: zoned: write out partially allocated region") Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 20 +++++++------------- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/inode.c | 20 +++++++++++++++----- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1da247e753b0..f4d3c56b2900 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2210,7 +2210,8 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end) +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc) { bool found_error = false; int first_error = 0; @@ -2220,22 +2221,16 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) const u32 sectorsize = fs_info->sectorsize; loff_t i_size = i_size_read(inode); u64 cur = start; - struct writeback_control wbc_writepages = { - .sync_mode = WB_SYNC_ALL, - .range_start = start, - .range_end = end, - .no_cgroup_owner = 1, - }; struct btrfs_bio_ctrl bio_ctrl = { - .wbc = &wbc_writepages, - /* We're called from an async helper function */ - .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | - wbc_to_write_flags(&wbc_writepages), + .wbc = wbc, + .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), }; + if (wbc->no_cgroup_owner) + bio_ctrl.opf |= REQ_BTRFS_CGROUP_PUNT; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); - wbc_attach_fdatawrite_inode(&wbc_writepages, inode); while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); struct page *page; @@ -2275,7 +2270,6 @@ next_page: submit_write_bio(&bio_ctrl, found_error ? 
ret : 0); - wbc_detach_inode(&wbc_writepages); if (found_error) return first_error; return ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6723bf3483d9..c5fae3a7d911 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,7 +178,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end, + struct writeback_control *wbc); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a40a6002a198..600086b8195d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1133,6 +1133,12 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, unsigned long nr_written = 0; int page_started = 0; int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end, + .no_cgroup_owner = 1, + }; /* * Call cow_file_range() to run the delalloc range directly, since we @@ -1162,7 +1168,10 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, } /* All pages will be unlocked, including @locked_page */ - return extent_write_locked_range(&inode->vfs_inode, start, end); + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); + wbc_detach_inode(&wbc); + return ret; } static int submit_one_async_extent(struct btrfs_inode *inode, @@ -1815,7 +1824,8 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, static noinline int run_delalloc_zoned(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { u64 done_offset = end; int ret; @@ -1847,8 +1857,8 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, account_page_redirty(locked_page); } locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset); - + extent_write_locked_range(&inode->vfs_inode, start, done_offset, + wbc); start = done_offset + 1; } @@ -2422,7 +2432,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page if (zoned) ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, wbc); else ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1, NULL); From 1a1b0e729d227f9f758f7b5f1c997e874e94156e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 1 Jun 2023 00:33:01 +0200 Subject: [PATCH 143/194] btrfs: add block-group tree to lockdep classes The block group tree was not present among the lockdep classes. We could get potentially lockdep warnings but so far none has been seen, also because block-group-tree is a relatively new feature. 
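As an illustration of the sizing change below: the per-level class names follow the "btrfs-<tree>-<level>" pattern, so the longest entry, "btrfs-block-group-00", needs 21 bytes including the terminating NUL and no longer fits the old 20-byte slots. The following stand-alone user-space sketch is not btrfs code (the kernel builds these names through the DEFINE_NAME() macro); it only demonstrates the arithmetic behind the [20] -> [24] bump.

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define MAX_LEVEL 8	/* mirrors BTRFS_MAX_LEVEL */

int main(void)
{
	char names[MAX_LEVEL][24];	/* the patch grows this from [20] to [24] */
	int level;

	for (level = 0; level < MAX_LEVEL; level++) {
		int n = snprintf(names[level], sizeof(names[level]),
				 "btrfs-%s-%02d", "block-group", level);

		assert(n > 0 && n < (int)sizeof(names[level]));	/* no truncation */
		printf("%s (%zu bytes with NUL)\n",
		       names[level], strlen(names[level]) + 1);
	}
	return 0;
}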
CC: stable@vger.kernel.org # 6.1+ Signed-off-by: David Sterba --- fs/btrfs/locking.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 3a496b0d3d2b..7979449a58d6 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ - /* Longest entry: btrfs-free-space-00 */ - char names[BTRFS_MAX_LEVEL][20]; + /* Longest entry: btrfs-block-group-00 */ + char names[BTRFS_MAX_LEVEL][24]; struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, @@ -72,6 +72,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, + { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = 0, DEFINE_NAME("tree") }, }; From c731cd0b6d255e4855a7cac9f276864032ab2387 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:54 +0200 Subject: [PATCH 144/194] btrfs: fix file_offset for REQ_BTRFS_ONE_ORDERED bios that get split If a bio gets split, it needs to have a proper file_offset for checksum validation and repair to work properly. Based on feedback from Josef, commit 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") skipped this adjustment for ONE_ORDERED bios. But if we actually ever need to split a ONE_ORDERED read bio, this will lead to a wrong file offset in the repair code. Right now the only user of the file_offset is logging of an error message so this is mostly harmless, but the wrong offset might be more problematic for additional users in the future. Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index ead288fa24db..846deed98fd0 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -81,8 +81,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); bbio->inode = orig_bbio->inode; bbio->file_offset = orig_bbio->file_offset; - if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) - orig_bbio->file_offset += map_length; + orig_bbio->file_offset += map_length; atomic_inc(&orig_bbio->pending_ios); return bbio; From a994310aa26fcb6bd7529714ea713b6792630f67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:55 +0200 Subject: [PATCH 145/194] btrfs: limit write bios to a single ordered extent Currently buffered writeback bios are allowed to span multiple ordered_extents, although that basically never actually happens since commit 4a445b7b6178 ("btrfs: don't merge pages into bio if their page offset is not contiguous"). Supporting bios that span ordered_extents complicates the file checksumming code, and prevents us from adding an ordered_extent pointer to the btrfs_bio structure. Use the existing code that limits a bio to a single ordered_extent for zoned device writes for all writes. This allows removing the REQ_BTRFS_ONE_ORDERED flag and the handling of multiple ordered_extents in btrfs_csum_one_bio.
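To make the boundary rule concrete: each write bio is only built up to the end of the ordered extent that contains its starting file offset, so a single bio can never cross into the next ordered extent. Below is a minimal user-space sketch of that clamping; the names and types are illustrative stand-ins, not the btrfs structures.

#include <stdint.h>
#include <stdio.h>

struct ordered_extent {
	uint64_t file_offset;
	uint64_t num_bytes;
};

/* Largest bio that may still be built starting at @file_offset. */
static uint64_t len_to_oe_boundary(const struct ordered_extent *oe,
				   uint64_t file_offset)
{
	return oe->file_offset + oe->num_bytes - file_offset;
}

int main(void)
{
	struct ordered_extent oe = { .file_offset = 0, .num_bytes = 1024 * 1024 };
	uint64_t offset = 960 * 1024;	/* inside the ordered extent */
	uint64_t want = 256 * 1024;	/* caller would like 256K */
	uint64_t limit = len_to_oe_boundary(&oe, offset);
	uint64_t len = want < limit ? want : limit;

	printf("bio capped to %llu bytes\n", (unsigned long long)len); /* 65536 */
	return 0;
}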
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 3 --- fs/btrfs/bio.h | 3 --- fs/btrfs/compression.c | 2 -- fs/btrfs/extent_io.c | 11 ++--------- fs/btrfs/file-item.c | 38 -------------------------------------- 5 files changed, 2 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 846deed98fd0..4a75efd588df 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -457,9 +457,6 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - /* Do not leak our private flag into the block layer. */ - bio->bi_opf &= ~REQ_BTRFS_ONE_ORDERED; - if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 8a29980159b4..52e7962103ff 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -102,9 +102,6 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) bbio->end_io(bbio); } -/* Bio only refers to one ordered extent. */ -#define REQ_BTRFS_ONE_ORDERED REQ_DRV - /* Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bb2d1a4ceca1..617b9572f13e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -293,8 +293,6 @@ void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - write_flags |= REQ_BTRFS_ONE_ORDERED; - cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, end_compressed_bio_write); cb->start = start; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f4d3c56b2900..35141a223a3e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -826,13 +826,8 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; - /* - * Limit the extent to the ordered boundary for Zone Append. - * Compressed bios aren't submitted directly, so it doesn't apply to - * them. - */ - if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && - btrfs_use_zone_append(bbio)) { + /* Limit data write bios to the ordered boundary. */ + if (bio_ctrl->wbc) { struct btrfs_ordered_extent *ordered; ordered = btrfs_lookup_ordered_extent(inode, file_offset); @@ -842,9 +837,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, ordered->disk_num_bytes - file_offset); btrfs_put_ordered_extent(ordered); } - } - if (bio_ctrl->wbc) { /* * Pick the last added device to support cgroup writeback. 
For * multi-device file systems this means blk-cgroup policies have diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 0cb4a9921d21..782bbf081c26 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -733,8 +733,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) struct bio_vec bvec; int index; unsigned int blockcount; - unsigned long total_bytes = 0; - unsigned long this_sum_bytes = 0; int i; unsigned nofs_flag; @@ -776,34 +774,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) - 1); for (i = 0; i < blockcount; i++) { - if (!(bio->bi_opf & REQ_BTRFS_ONE_ORDERED) && - !in_range(offset, ordered->file_offset, - ordered->num_bytes)) { - unsigned long bytes_left; - - sums->len = this_sum_bytes; - this_sum_bytes = 0; - btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); - - bytes_left = bio->bi_iter.bi_size - total_bytes; - - nofs_flag = memalloc_nofs_save(); - sums = kvzalloc(btrfs_ordered_sum_size(fs_info, - bytes_left), GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!sums) - return BLK_STS_RESOURCE; - - sums->len = bytes_left; - ordered = btrfs_lookup_ordered_extent(inode, - offset); - ASSERT(ordered); /* Logic error */ - sums->logical = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + total_bytes; - index = 0; - } - data = bvec_kmap_local(&bvec); crypto_shash_digest(shash, data + (i * fs_info->sectorsize), @@ -812,18 +782,10 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) kunmap_local(data); index += fs_info->csum_size; offset += fs_info->sectorsize; - this_sum_bytes += fs_info->sectorsize; - total_bytes += fs_info->sectorsize; } } - this_sum_bytes = 0; - /* - * The ->sums assignment is for zoned writes, where a bio never spans - * ordered extents and is only done unconditionally because that's cheaper - * than a branch. - */ bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); btrfs_put_ordered_extent(ordered); From 3daea5fda1cdd74829d0306ab88b4798c38254ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:56 +0200 Subject: [PATCH 146/194] btrfs: merge the two calls to btrfs_add_ordered_extent in run_delalloc_nocow Refactor run_delalloc_nocow a little bit so that there is only a single call to btrfs_add_ordered_extent instead of two. 
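The shape of the refactor, reduced to a stand-alone sketch with hypothetical names rather than the btrfs API: compute the one argument that differs between the two branches up front, then keep a single call site.

#include <stdio.h>

#define ORDERED_PREALLOC	(1 << 0)
#define ORDERED_NOCOW		(1 << 1)

static int add_ordered_extent(unsigned long flags)
{
	printf("adding ordered extent, flags=%#lx\n", flags);
	return 0;
}

int main(void)
{
	int is_prealloc = 1;

	/* One call instead of near-identical calls in both branches. */
	return add_ordered_extent(is_prealloc ? ORDERED_PREALLOC : ORDERED_NOCOW);
}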
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 600086b8195d..cc643565af4f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2140,6 +2140,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ram_bytes; u64 nocow_end; int extent_type; + bool is_prealloc; nocow = false; @@ -2278,8 +2279,8 @@ out_check: } nocow_end = cur_offset + nocow_args.num_bytes - 1; - - if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; + if (is_prealloc) { u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; @@ -2295,29 +2296,21 @@ out_check: goto error; } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, - cur_offset, nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, 0, - 1 << BTRFS_ORDERED_PREALLOC, - BTRFS_COMPRESS_NONE); - if (ret) { + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, + nocow_args.num_bytes, nocow_args.num_bytes, + nocow_args.disk_bytenr, nocow_args.num_bytes, 0, + is_prealloc + ? (1 << BTRFS_ORDERED_PREALLOC) + : (1 << BTRFS_ORDERED_NOCOW), + BTRFS_COMPRESS_NONE); + if (ret) { + if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); - goto error; } - } else { - ret = btrfs_add_ordered_extent(inode, cur_offset, - nocow_args.num_bytes, - nocow_args.num_bytes, - nocow_args.disk_bytenr, - nocow_args.num_bytes, - 0, - 1 << BTRFS_ORDERED_NOCOW, - BTRFS_COMPRESS_NONE); - if (ret) - goto error; + goto error; } if (nocow) { From 34bfaf15304b10e9c9e2fbd3012fa72710929b32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:57 +0200 Subject: [PATCH 147/194] btrfs: pass an ordered_extent to btrfs_reloc_clone_csums Both callers of btrfs_reloc_clone_csums allocate the ordered_extent that btrfs_reloc_clone_csums operates on. Switch them to use btrfs_alloc_ordered_extent instead of btrfs_add_ordered_extent and pass the ordered_extent to btrfs_reloc_clone_csums instead of doing an extra lookup. 
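The callers now rely on the pointer-or-error calling convention of btrfs_alloc_ordered_extent (the IS_ERR()/PTR_ERR() checks in the hunks above). For readers less familiar with that idiom, here is a tiny user-space re-creation of it; this is not the kernel's implementation, and the ordered_extent type is only a stand-in.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct ordered_extent { unsigned long num_bytes; };

/* Returns the new object on success or an encoded negative errno. */
static struct ordered_extent *alloc_ordered_extent(unsigned long num_bytes)
{
	struct ordered_extent *oe;

	if (!num_bytes)
		return ERR_PTR(-EINVAL);
	oe = malloc(sizeof(*oe));
	if (!oe)
		return ERR_PTR(-ENOMEM);
	oe->num_bytes = num_bytes;
	return oe;
}

int main(void)
{
	struct ordered_extent *oe = alloc_ordered_extent(4096);

	if (IS_ERR(oe)) {
		fprintf(stderr, "allocation failed: %ld\n", PTR_ERR(oe));
		return 1;
	}
	printf("allocated ordered extent covering %lu bytes\n", oe->num_bytes);
	free(oe);
	return 0;
}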
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 ++++++++++++++++++----------- fs/btrfs/relocation.c | 35 ++++++++++++++--------------------- fs/btrfs/relocation.h | 2 +- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cc643565af4f..d187ddc9673a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1494,6 +1494,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { + struct btrfs_ordered_extent *ordered; + cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, @@ -1518,16 +1520,18 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size, - ins.objectid, cur_alloc_size, 0, - 1 << BTRFS_ORDERED_REGULAR, - BTRFS_COMPRESS_NONE); - if (ret) + ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, + ram_size, ins.objectid, cur_alloc_size, + 0, 1 << BTRFS_ORDERED_REGULAR, + BTRFS_COMPRESS_NONE); + if (IS_ERR(ordered)) { + ret = PTR_ERR(ordered); goto out_drop_extent_cache; + } if (btrfs_is_data_reloc_root(root)) { - ret = btrfs_reloc_clone_csums(inode, start, - cur_alloc_size); + ret = btrfs_reloc_clone_csums(ordered); + /* * Only drop cache here, and process as normal. * @@ -1544,6 +1548,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, start + ram_size - 1, false); } + btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -2133,6 +2138,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, nocow_args.writeback_path = true; while (1) { + struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; @@ -2298,18 +2304,19 @@ out_check: free_extent_map(em); } - ret = btrfs_add_ordered_extent(inode, cur_offset, + ordered = btrfs_alloc_ordered_extent(inode, cur_offset, nocow_args.num_bytes, nocow_args.num_bytes, nocow_args.disk_bytenr, nocow_args.num_bytes, 0, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); - if (ret) { + if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } + ret = PTR_ERR(ordered); goto error; } @@ -2324,8 +2331,8 @@ out_check: * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ - ret = btrfs_reloc_clone_csums(inode, cur_offset, - nocow_args.num_bytes); + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1333c8e2ddad..25a3361caedc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4342,29 +4342,25 @@ out: * cloning checksum properly handles the nodatasum extents. * it also saves CPU time to re-calculate the checksum. 
*/ -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_root *csum_root; - struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered; - int ret; - u64 disk_bytenr; - u64 new_bytenr; + u64 disk_bytenr = ordered->file_offset + inode->index_cnt; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr); LIST_HEAD(list); + int ret; - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); - - disk_bytenr = file_pos + inode->index_cnt; - csum_root = btrfs_csum_root(fs_info, disk_bytenr); ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + disk_bytenr + ordered->num_bytes - 1, + &list, 0, false); if (ret) - goto out; + return ret; while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); + struct btrfs_ordered_sum *sums = + list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); /* @@ -4379,14 +4375,11 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->disk_bytenr + sums->logical - disk_bytenr; - sums->logical = new_bytenr; - + sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr; btrfs_add_ordered_sum(ordered, sums); } -out: - btrfs_put_ordered_extent(ordered); - return ret; + + return 0; } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 57cbac5c8ddd..77d69f6ae967 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -8,7 +8,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *r int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow); From d611935b5d434a697757463e77632ac1501cfd1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:58 +0200 Subject: [PATCH 148/194] btrfs: pass an ordered_extent to btrfs_submit_compressed_write btrfs_submit_compressed_write always operates on a single ordered_extent. Make that explicit by using btrfs_alloc_ordered_extent in the callers and passing the ordered_extent to btrfs_submit_compressed_write. This will help with storing and ordered_extent pointer in the btrfs_bio in subsequent patches. 
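The interface change itself is simple: rather than every caller unpacking the offsets and lengths into separate scalar arguments, the submission helper takes the ordered extent and reads the fields where they are needed. A stand-alone before/after sketch with stand-in types (not the btrfs prototypes):

#include <stdint.h>
#include <stdio.h>

struct ordered_extent {
	uint64_t file_offset;
	uint64_t num_bytes;
	uint64_t disk_bytenr;
	uint64_t disk_num_bytes;
};

/* Old shape: the caller pulls every field out of the ordered extent. */
static void submit_write_old(uint64_t start, uint64_t len, uint64_t disk_start,
			     uint64_t compressed_len)
{
	printf("old: %llu+%llu -> %llu (%llu compressed)\n",
	       (unsigned long long)start, (unsigned long long)len,
	       (unsigned long long)disk_start, (unsigned long long)compressed_len);
}

/* New shape: one pointer; the helper reads what it needs. */
static void submit_write_new(const struct ordered_extent *ordered)
{
	printf("new: %llu+%llu -> %llu (%llu compressed)\n",
	       (unsigned long long)ordered->file_offset,
	       (unsigned long long)ordered->num_bytes,
	       (unsigned long long)ordered->disk_bytenr,
	       (unsigned long long)ordered->disk_num_bytes);
}

int main(void)
{
	struct ordered_extent oe = {
		.file_offset = 0, .num_bytes = 131072,
		.disk_bytenr = 1048576, .disk_num_bytes = 16384,
	};

	submit_write_old(oe.file_offset, oe.num_bytes, oe.disk_bytenr,
			 oe.disk_num_bytes);
	submit_write_new(&oe);
	return 0;
}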
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 29 +++++++++++++++-------------- fs/btrfs/compression.h | 5 ++--- fs/btrfs/inode.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 617b9572f13e..aafc846bcd4f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -279,33 +279,34 @@ static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) * This also checksums the file bytes and gets things ready for * the end io hooks. */ -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, - struct page **compressed_pages, - unsigned int nr_pages, - blk_opf_t write_flags, - bool writeback) +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, + bool writeback) { + struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct compressed_bio *cb; - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); - cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, + cb = alloc_compressed_bio(inode, ordered->file_offset, + REQ_OP_WRITE | write_flags, end_compressed_bio_write); - cb->start = start; - cb->len = len; + cb->start = ordered->file_offset; + cb->len = ordered->num_bytes; cb->compressed_pages = compressed_pages; - cb->compressed_len = compressed_len; + cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; - cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; + cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; btrfs_add_compressed_bio_pages(cb); btrfs_submit_bio(&cb->bbio, 0); + btrfs_put_ordered_extent(ordered); } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b462ab106f43..03bb9d143fa7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -10,6 +10,7 @@ #include "bio.h" struct btrfs_inode; +struct btrfs_ordered_extent; /* * We want to make sure that amount of RAM required to uncompress an extent is @@ -86,9 +87,7 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); -void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned int len, u64 disk_start, - unsigned int compressed_len, +void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, struct page **compressed_pages, unsigned int nr_pages, blk_opf_t write_flags, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d187ddc9673a..8ae448a4f470 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1182,6 +1182,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_ordered_extent *ordered; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_map *em; @@ -1243,7 +1244,7 @@ static int 
submit_one_async_extent(struct btrfs_inode *inode, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, /* file_offset */ + ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ async_extent->ram_size, /* ram_bytes */ ins.objectid, /* disk_bytenr */ @@ -1251,8 +1252,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, 0, /* offset */ 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1261,11 +1263,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); - - btrfs_submit_compressed_write(inode, start, /* file_offset */ - async_extent->ram_size, /* num_bytes */ - ins.objectid, /* disk_bytenr */ - ins.offset, /* compressed_len */ + btrfs_submit_compressed_write(ordered, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, true); @@ -10274,6 +10272,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, struct extent_io_tree *io_tree = &inode->io_tree; struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; int compression; size_t orig_count; u64 start, end; @@ -10450,14 +10449,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } free_extent_map(em); - ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, ins.objectid, ins.offset, encoded->unencoded_offset, (1 << BTRFS_ORDERED_ENCODED) | (1 << BTRFS_ORDERED_COMPRESSED), compression); - if (ret) { + if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); + ret = PTR_ERR(ordered); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -10469,8 +10469,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, btrfs_delalloc_release_extents(inode, num_bytes); - btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, - ins.offset, pages, nr_pages, 0, false); + btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); ret = orig_count; goto out; From ebfe4d4eb6ef6bd0f80293936808b77c1fc931b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:53:59 +0200 Subject: [PATCH 149/194] btrfs: remove btrfs_add_ordered_extent All callers are gone now. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 25 +------------------------ fs/btrfs/ordered-data.h | 4 ---- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index bb76bf01a332..3aee77f40f01 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -279,29 +279,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( return entry; } -/* - * Add a new btrfs_ordered_extent for the range, but drop the reference instead - * of returning it to the caller. 
- */ -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type) -{ - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, - ram_bytes, disk_bytenr, - disk_num_bytes, offset, flags, - compress_type); - - if (IS_ERR(ordered)) - return PTR_ERR(ordered); - btrfs_put_ordered_extent(ordered); - - return 0; -} - /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -579,7 +556,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, freespace_inode = btrfs_is_free_space_inode(btrfs_inode); btrfs_lockdep_acquire(fs_info, btrfs_trans_pending_ordered); - /* This is paired with btrfs_add_ordered_extent. */ + /* This is paired with btrfs_alloc_ordered_extent. */ spin_lock(&btrfs_inode->lock); btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 828bb039634a..0bd0f0368132 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -178,10 +178,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, u64 disk_num_bytes, u64 offset, unsigned long flags, int compress_type); -int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, - u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, - u64 disk_num_bytes, u64 offset, unsigned long flags, - int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, From fbe960877b6f434525cc38e44334157223058e09 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:00 +0200 Subject: [PATCH 150/194] btrfs: add a is_data_bbio helper Add a helper to check that a btrfs_bio has a valid inode, and that it is a data inode, to key off all the special handling for data path checksumming. Note that this uses is_data_inode instead of REQ_META as REQ_META is only set directly before submission in submit_one_bio, and we'll also want to use this helper for error handling where REQ_META isn't set yet. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 4a75efd588df..41a152e87429 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -27,6 +27,12 @@ struct btrfs_failed_bio { atomic_t repair_count; }; +/* Is this a data path I/O that needs storage layer checksum and repair? */ +static inline bool is_data_bbio(struct btrfs_bio *bbio) +{ + return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); +} + /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. @@ -312,7 +318,7 @@ static void btrfs_end_bio_work(struct work_struct *work) struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); /* Metadata reads are checked and repaired by the submitter.
*/ - if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) + if (is_data_bbio(bbio)) btrfs_check_read_bio(bbio, bbio->bio.bi_private); else btrfs_orig_bbio_end_io(bbio); @@ -346,8 +352,7 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; - if (bio_op(bio) == REQ_OP_READ && bbio->inode && - !(bbio->bio.bi_opf & REQ_META)) + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) btrfs_check_read_bio(bbio, NULL); else btrfs_orig_bbio_end_io(bbio); @@ -639,7 +644,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Save the iter for the end_io handler and preload the checksums for * data reads. */ - if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { + if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) From 112397acc358883a90bb32b9597724dfbc3ccb41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:01 +0200 Subject: [PATCH 151/194] btrfs: open code btrfs_bio_end_io in btrfs_dio_submit_io btrfs_dio_submit_io is the only place that uses btrfs_bio_end_io to end a bio that hasn't been submitted using btrfs_submit_bio yet, and this invariant will become a problem with upcoming changes to the btrfs bio layer. Just open code the assignment of bi_status and the call to btrfs_dio_end_io. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ae448a4f470..020d871bb135 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7845,7 +7845,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + bbio->bio.bi_status = errno_to_blk_status(ret); + btrfs_dio_end_io(bbio); return; } } From ec63b84d4611b2f07e9242080f3e8de4a011820b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:02 +0200 Subject: [PATCH 152/194] btrfs: add an ordered_extent pointer to struct btrfs_bio Add a pointer to the ordered_extent to the existing union in struct btrfs_bio, so all code dealing with data write bios can just use a pointer dereference to retrieve the ordered_extent instead of doing multiple rbtree lookups per I/O. The reference to this ordered_extent is dropped at end I/O time, which implies that an extra one must be acquired when the bio is split. This also requires moving the btrfs_extract_ordered_extent call into btrfs_split_bio so that the invariant of always having a valid ordered_extent reference for the btrfs_bio is kept. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 42 ++++++++++++++++++++++++++++++++++++++---- fs/btrfs/bio.h | 9 +++------ fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 9 +-------- fs/btrfs/inode.c | 8 +++++--- 6 files changed, 49 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 41a152e87429..12b12443efaa 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -33,6 +33,11 @@ static inline bool is_data_bbio(struct btrfs_bio *bbio) return bbio->inode && is_data_inode(&bbio->inode->vfs_inode); } +static bool bbio_has_ordered_extent(struct btrfs_bio *bbio) +{ + return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE; +} + /* * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. @@ -88,11 +93,40 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, bbio->inode = orig_bbio->inode; bbio->file_offset = orig_bbio->file_offset; orig_bbio->file_offset += map_length; - + if (bbio_has_ordered_extent(bbio)) { + refcount_inc(&orig_bbio->ordered->refs); + bbio->ordered = orig_bbio->ordered; + } atomic_inc(&orig_bbio->pending_ios); return bbio; } +/* Free a bio that was never submitted to the underlying device. */ +static void btrfs_cleanup_bio(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); +} + +static void __btrfs_bio_end_io(struct btrfs_bio *bbio) +{ + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } +} + +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + __btrfs_bio_end_io(bbio); +} + static void btrfs_orig_write_end_io(struct bio *bio); static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio, @@ -121,12 +155,12 @@ static void btrfs_orig_bbio_end_io(struct btrfs_bio *bbio) if (bbio->bio.bi_status) btrfs_bbio_propagate_error(bbio, orig_bbio); - bio_put(&bbio->bio); + btrfs_cleanup_bio(bbio); bbio = orig_bbio; } if (atomic_dec_and_test(&bbio->pending_ios)) - bbio->end_io(bbio); + __btrfs_bio_end_io(bbio); } static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror) @@ -684,7 +718,7 @@ done: fail_put_bio: if (map_length < length) - bio_put(bio); + btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); btrfs_bio_end_io(orig_bbio, ret); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 52e7962103ff..ca79decee060 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -50,11 +50,13 @@ struct btrfs_bio { /* * For data writes: + * - ordered extent covering the bio * - pointer to the checksums for this bio * - original physical address from the allocator * (for zone append only) */ struct { + struct btrfs_ordered_extent *ordered; struct btrfs_ordered_sum *sums; u64 orig_physical; }; @@ -95,12 +97,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, struct btrfs_fs_info *fs_info, btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} +void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); /* 
Submit using blkcg_punt_bio_submit. */ #define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index aafc846bcd4f..ce4be0f21c5c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -303,10 +303,10 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; + cb->bbio.ordered = ordered; btrfs_add_compressed_bio_pages(cb); btrfs_submit_bio(&cb->bbio, 0); - btrfs_put_ordered_extent(ordered); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 35141a223a3e..ad536eaf02b0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -835,7 +835,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, ordered->file_offset + ordered->disk_num_bytes - file_offset); - btrfs_put_ordered_extent(ordered); + bbio->ordered = ordered; } /* diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 782bbf081c26..cbacaa80526c 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -799,19 +799,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) */ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { - struct btrfs_ordered_extent *ordered = - btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); - - if (WARN_ON_ONCE(!ordered)) - return BLK_STS_IOERR; - bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) return BLK_STS_RESOURCE; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; - btrfs_add_ordered_sum(ordered, bbio->sums); - btrfs_put_ordered_extent(ordered); + btrfs_add_ordered_sum(bbio->ordered, bbio->sums); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 020d871bb135..b8ec3d95e659 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2716,8 +2716,11 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, return -EINVAL; /* No need to split if the ordered extent covers the entire bio. */ - if (ordered->disk_num_bytes == len) + if (ordered->disk_num_bytes == len) { + refcount_inc(&ordered->refs); + bbio->ordered = ordered; return 0; + } /* * Don't split the extent_map for NOCOW extents, as we're writing into @@ -2734,8 +2737,7 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, new = btrfs_split_ordered_extent(ordered, len); if (IS_ERR(new)) return PTR_ERR(new); - btrfs_put_ordered_extent(new); - + bbio->ordered = new; return 0; } From c59360f61aebf173442c2a128910876b3e41bfd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:03 +0200 Subject: [PATCH 153/194] btrfs: use bbio->ordered in btrfs_csum_one_bio Use the ordered_extent pointer in the btrfs_bio instead of looking it up manually. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index cbacaa80526c..696bf695d8eb 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -721,13 +721,12 @@ fail: */ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) { + struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; - u64 offset = bbio->file_offset; struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered = NULL; char *data; struct bvec_iter iter; struct bio_vec bvec; @@ -753,22 +752,6 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) shash->tfm = fs_info->csum_shash; bio_for_each_segment(bvec, bio, iter) { - if (!ordered) { - ordered = btrfs_lookup_ordered_extent(inode, offset); - /* - * The bio range is not covered by any ordered extent, - * must be a code logic error. - */ - if (unlikely(!ordered)) { - WARN(1, KERN_WARNING - "no ordered extent for root %llu ino %llu offset %llu\n", - inode->root->root_key.objectid, - btrfs_ino(inode), offset); - kvfree(sums); - return BLK_STS_IOERR; - } - } - blockcount = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len + fs_info->sectorsize - 1); @@ -781,14 +764,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) sums->sums + index); kunmap_local(data); index += fs_info->csum_size; - offset += fs_info->sectorsize; } } bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); - btrfs_put_ordered_extent(ordered); return 0; } From 53df25869a5659506cb35185997176eed49e125e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:04 +0200 Subject: [PATCH 154/194] btrfs: factor out a can_finish_ordered_extent helper Factor out a helper from btrfs_mark_ordered_io_finished that does the actual per-ordered_extent work to check if we want to schedule an I/O completion. This new helper will later be used complete an ordered_extent without first doing a lookup. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 100 ++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 3aee77f40f01..2e99fa5d73d9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -303,6 +303,60 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } +static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, + u64 len, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + lockdep_assert_held(&inode->ordered_tree.lock); + + if (page) { + ASSERT(page->mapping); + ASSERT(page_offset(page) <= file_offset); + ASSERT(file_offset + len <= page_offset(page) + PAGE_SIZE); + + /* + * Ordered (Private2) bit indicates whether we still have + * pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. 
+ */ + if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + return false; + btrfs_page_clear_ordered(fs_info, page, file_offset, len); + } + + /* Now we're fine to update the accounting. */ + if (WARN_ON_ONCE(len > ordered->bytes_left)) { + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu", + inode->root->root_key.objectid, btrfs_ino(inode), + ordered->file_offset, ordered->num_bytes, + len, ordered->bytes_left); + ordered->bytes_left = 0; + } else { + ordered->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + + if (ordered->bytes_left) + return false; + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags); + cond_wake_up(&ordered->wait); + refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_mark_finished(inode, ordered); + return true; +} + /* * Mark all ordered extents io inside the specified range finished. * @@ -333,10 +387,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, else wq = fs_info->endio_write_workers; - if (page) - ASSERT(page->mapping && page_offset(page) <= file_offset && - file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); - spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; @@ -389,47 +439,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(end + 1 - cur < U32_MAX); len = end + 1 - cur; - if (page) { - /* - * Ordered (Private2) bit indicates whether we still - * have pending io unfinished for the ordered extent. - * - * If there's no such bit, we need to skip to next range. - */ - if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { - cur += len; - continue; - } - btrfs_page_clear_ordered(fs_info, page, cur, len); - } - - /* Now we're fine to update the accounting */ - if (unlikely(len > entry->bytes_left)) { - WARN_ON(1); - btrfs_crit(fs_info, -"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", - inode->root->root_key.objectid, - btrfs_ino(inode), - entry->file_offset, - entry->num_bytes, - len, entry->bytes_left); - entry->bytes_left = 0; - } else { - entry->bytes_left -= len; - } - - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - - /* - * All the IO of the ordered extent is finished, we need to queue - * the finish_func to be executed. - */ - if (entry->bytes_left == 0) { - set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - cond_wake_up(&entry->wait); - refcount_inc(&entry->refs); - trace_btrfs_ordered_extent_mark_finished(inode, entry); + if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { spin_unlock_irqrestore(&tree->lock, flags); btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); From 2d6f107ea6875792607fbc40313a990fe3ea522b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:05 +0200 Subject: [PATCH 155/194] btrfs: factor out a btrfs_queue_ordered_fn helper Factor out a helper to queue up an ordered_extent completion in a work queue. This new helper will later be used complete an ordered_extent without first doing a lookup. 
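Taken together with can_finish_ordered_extent() from the previous patch, the per-range completion step boils down to roughly the following sketch; locking, range iteration and the tracepoint are omitted, and the wrapper name is hypothetical (the next patch adds the real one as btrfs_finish_ordered_extent()):

static void finish_one_ordered_range(struct btrfs_ordered_extent *ordered,
				     struct page *page, u64 file_offset,
				     u64 len, bool uptodate)
{
	/* Accounts the finished range and reports whether the extent is done. */
	if (can_finish_ordered_extent(ordered, page, file_offset, len, uptodate))
		/* Picks the right workqueue and defers finish_ordered_fn(). */
		btrfs_queue_ordered_fn(ordered);
}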
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2e99fa5d73d9..aa4c203a1695 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -357,6 +357,17 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, return true; } +static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? + fs_info->endio_freespace_worker : fs_info->endio_write_workers; + + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); +} + /* * Mark all ordered extents io inside the specified range finished. * @@ -375,18 +386,11 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; @@ -441,8 +445,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &entry->work); + btrfs_queue_ordered_fn(entry); spin_lock_irqsave(&tree->lock, flags); } cur += len; From 122e9ede5355071359c10a8ebca138b162ef176b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:06 +0200 Subject: [PATCH 156/194] btrfs: add a btrfs_finish_ordered_extent helper Add a helper to complete an ordered_extent without first doing a lookup. The tracepoint cannot use the ordered_extent class as we also want to print the range. 
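On the caller side the effect is sketched by the hypothetical end-io fragment below: the ordered extent now travels with the bio, so the rbtree walk done by btrfs_mark_ordered_io_finished() is no longer needed for the common case.

static void write_range_done(struct btrfs_bio *bbio, struct page *page,
			     u64 file_offset, u64 len, bool uptodate)
{
	/* Old pattern: look the ordered extent(s) up again by range. */
	/* btrfs_mark_ordered_io_finished(bbio->inode, page, file_offset, len, uptodate); */

	/* New pattern: complete the one we already hold a pointer to. */
	btrfs_finish_ordered_extent(bbio->ordered, page, file_offset, len,
				    uptodate);
}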
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 19 +++++++++++++++++++ fs/btrfs/ordered-data.h | 3 +++ include/trace/events/btrfs.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index aa4c203a1695..a629532283bc 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -368,6 +368,25 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) btrfs_queue_work(wq, &ordered->work); } +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + unsigned long flags; + bool ret; + + trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); + + spin_lock_irqsave(&inode->ordered_tree.lock, flags); + ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); + spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + + if (ret) + btrfs_queue_ordered_fn(ordered); + return ret; +} + /* * Mark all ordered extents io inside the specified range finished. * diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bd0f0368132..173bd5c5df26 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -167,6 +167,9 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, + struct page *page, u64 file_offset, u64 len, + bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8ea9cea9bfeb..c6eee9b414cf 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -661,6 +661,35 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished, TP_ARGS(inode, ordered) ); +TRACE_EVENT(btrfs_finish_ordered_extent, + + TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 len, + bool uptodate), + + TP_ARGS(inode, start, len, uptodate), + + TP_STRUCT__entry_btrfs( + __field( u64, ino ) + __field( u64, start ) + __field( u64, len ) + __field( bool, uptodate ) + __field( u64, root_objectid ) + ), + + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); + __entry->start = start; + __entry->len = len; + __entry->uptodate = uptodate; + __entry->root_objectid = inode->root->root_key.objectid; + ), + + TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu len=%llu uptodate=%d", + show_root_type(__entry->root_objectid), + __entry->ino, __entry->start, + __entry->len, !!__entry->uptodate) +); + DECLARE_EVENT_CLASS(btrfs__writepage, TP_PROTO(const struct page *page, const struct inode *inode, From 4ba8223d3d0124894578554b473bd7cc680dfd5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:07 +0200 Subject: [PATCH 157/194] btrfs: open code end_extent_writepage in end_bio_extent_writepage This prepares for switching to more efficient ordered_extent processing and already removes the forth and back conversion from len to end back to len. 
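The conversion that goes away is only offset arithmetic; a small sketch using the variable names from the diff (len_again is a placeholder for the length the callee used to recompute):

	u64 start = page_offset(page) + bvec->bv_offset;
	u32 len = bvec->bv_len;
	/* The old path derived an inclusive end for end_extent_writepage()... */
	u64 end = start + len - 1;
	/* ...only for the callee to convert it straight back to a length. */
	u32 len_again = end + 1 - start;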
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ad536eaf02b0..827e8dfebf0f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -536,8 +536,6 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct bio_vec *bvec; - u64 start; - u64 end; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -546,6 +544,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + u64 start = page_offset(page) + bvec->bv_offset; + u32 len = bvec->bv_len; /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) @@ -557,12 +557,13 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) "incomplete page write with offset %u and length %u", bvec->bv_offset, bvec->bv_len); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - - end_extent_writepage(page, error, start, end); - - btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), page, start, + start + len - 1, !error); + if (error) { + btrfs_page_clear_uptodate(fs_info, page, start, len); + mapping_set_error(page->mapping, error); + } + btrfs_page_clear_writeback(fs_info, page, start, len); } bio_put(bio); From 7dd4395490369cfc6b3fe87e4ec2b5dd26259e08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:08 +0200 Subject: [PATCH 158/194] btrfs: use btrfs_finish_ordered_extent to complete compressed writes Use the btrfs_finish_ordered_extent helper to complete compressed writes using the bbio->ordered pointer instead of requiring an rbtree lookup in the otherwise equivalent btrfs_mark_ordered_io_finished called from btrfs_writepage_endio_finish_ordered. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ce4be0f21c5c..f1004259c3d3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -224,13 +224,8 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) struct compressed_bio *cb = container_of(work, struct compressed_bio, write_end_work); - /* - * Ok, we're the last bio for this extent, step one is to call back - * into the FS and do all the end_io operations. 
- */ - btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, - cb->start, cb->start + cb->len - 1, - cb->bbio.bio.bi_status == BLK_STS_OK); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) end_compressed_writeback(cb); From b41b6f6937dc89e072b334e8d814487cb4692770 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:09 +0200 Subject: [PATCH 159/194] btrfs: use btrfs_finish_ordered_extent to complete direct writes Use the btrfs_finish_ordered_extent helper to complete compressed writes using the bbio->ordered pointer instead of requiring an rbtree lookup in the otherwise equivalent btrfs_mark_ordered_io_finished called from btrfs_writepage_endio_finish_ordered. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8ec3d95e659..0219a786e794 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7775,8 +7775,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, - pos, length, false); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); @@ -7806,12 +7806,14 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, bio->bi_status); } - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_mark_ordered_io_finished(inode, NULL, dip->file_offset, - dip->bytes, !bio->bi_status); - else + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + btrfs_finish_ordered_extent(bbio->ordered, NULL, + dip->file_offset, dip->bytes, + !bio->bi_status); + } else { unlock_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); + } bbio->bio.bi_private = bbio->private; iomap_dio_bio_end_io(bio); From 0d394cca8435045f909bd2bb099575b11452e19a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 09:54:10 +0200 Subject: [PATCH 160/194] btrfs: use btrfs_finish_ordered_extent to complete buffered writes Use the btrfs_finish_ordered_extent helper to complete compressed writes using the bbio->ordered pointer instead of requiring an rbtree lookup in the otherwise equivalent btrfs_mark_ordered_io_finished called from btrfs_writepage_endio_finish_ordered. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 827e8dfebf0f..a91d5ad27984 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -557,8 +557,7 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) "incomplete page write with offset %u and length %u", bvec->bv_offset, bvec->bv_len); - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), page, start, - start + len - 1, !error); + btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); if (error) { btrfs_page_clear_uptodate(fs_info, page, start, len); mapping_set_error(page->mapping, error); From 3ed01616bad6c7e3de196676b542ae3df8058592 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 6 Jun 2023 14:36:33 +0900 Subject: [PATCH 161/194] btrfs: delete unused BGs while reclaiming BGs The reclaiming process only starts after the filesystem volumes are allocated to a certain level (75% by default). Thus, the list of reclaiming target block groups can build up so huge at the time the reclaim process kicks in. On a test run, there were over 1000 BGs in the reclaim list. As the reclaim involves rewriting the data, it takes really long time to reclaim the BGs. While the reclaim is running, btrfs_delete_unused_bgs() won't proceed because the reclaim side is holding fs_info->reclaim_bgs_lock. As a result, we will have a large number of unused BGs kept in the unused list. On my test run, I got 1057 unused BGs. Since deleting a block group is relatively easy and fast work, we can call btrfs_delete_unused_bgs() while it reclaims BGs, to avoid building up unused BGs. Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 618ba7670e66..18305f5c74c9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1824,10 +1824,24 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) next: btrfs_put_block_group(bg); + + mutex_unlock(&fs_info->reclaim_bgs_lock); + /* + * Reclaiming all the block groups in the list can take really + * long. Prioritize cleaning up unused block groups. + */ + btrfs_delete_unused_bgs(fs_info); + /* + * If we are interrupted by a balance, we can just bail out. The + * cleaner thread restart again if necessary. + */ + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) + goto end; spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); +end: btrfs_exclop_finish(fs_info); sb_end_write(fs_info->sb); } From a9f189716cf15913c453299d72f69c51a9b0f86b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 6 Jun 2023 14:36:34 +0900 Subject: [PATCH 162/194] btrfs: move out now unused BG from the reclaim list An unused block group is easy to remove to free up space and should be reclaimed fast. Such block group can often already be a target of the reclaim process. As we check list_empty(&bg->bg_list), we keep it in the reclaim list. That block group is never reclaimed until the file system is filled e.g. up to 75%. Instead, we can move unused block group to the unused list and delete it fast. 
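A sketch of the resulting handoff in btrfs_mark_bg_unused(), with comments spelling out the reference handling; the assumption that the reclaim list already holds a reference, which list_move_tail() simply carries over, is mine and not stated in the diff:

	spin_lock(&fs_info->unused_bgs_lock);
	if (list_empty(&bg->bg_list)) {
		/* Not on any list yet: take a reference and queue for deletion. */
		btrfs_get_block_group(bg);
		list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
	} else {
		/* Already queued for reclaim: keep that reference, switch lists. */
		list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
	}
	spin_unlock(&fs_info->unused_bgs_lock);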
Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 18305f5c74c9..616507596671 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1633,11 +1633,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; + trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); - trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); + } else { + /* Pull out the block group from the reclaim_bgs list. */ + list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); } From 93463ff7b54626f8276c0bd3d3f968fbf8d5d380 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 6 Jun 2023 14:36:35 +0900 Subject: [PATCH 163/194] btrfs: bail out reclaim process if filesystem is read-only When a filesystem is read-only, we cannot reclaim a block group as it cannot rewrite the data. Just bail out in that case. Note that it can drop block groups in this case. As we did sb_start_write(), read-only filesystem means we got a fatal error and forced read-only. There is no chance to reclaim them again. Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 616507596671..9d9f09f25f8f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1794,8 +1794,15 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_unlock(&bg->lock); - /* Get out fast, in case we're unmounting the filesystem */ - if (btrfs_fs_closing(fs_info)) { + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did sb_start_write(), + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { up_write(&space_info->groups_sem); goto next; } From 7e27180994383b7c741ad87749db01e4989a02ba Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 6 Jun 2023 14:36:36 +0900 Subject: [PATCH 164/194] btrfs: reinsert BGs failed to reclaim The reclaim process can temporarily fail. For example, if the space is getting tight, it fails to make the block group read-only. If there are no further writes on that block group, the block group will never get back to the reclaim list, and the BG never gets reclaimed. In a certain workload, we can leave many such block groups never reclaimed. So, let's get it back to the list and give it a chance to be reclaimed. 
Fixes: 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9d9f09f25f8f..49e92226f766 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1833,6 +1833,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } next: + if (ret) + btrfs_mark_bg_to_reclaim(bg); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); From aadb164bdd5cd7e6c139f2fd5c696ee470cd95d4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 6 Jun 2023 15:26:03 +0100 Subject: [PATCH 165/194] btrfs: update documentation for a block group's bg_list member Currently we are only documenting two uses of the bg_list member of a block group, but there are two more: 1) To track deleted block groups for discard purposes, introduced in commit e33e17ee1098 ("btrfs: add missing discards when unpinning extents with -o discard"); 2) To track block groups for automatic reclaim, introduced more recently by commit 18bb8bbf13c1 ("btrfs: zoned: automatically reclaim zones") So document those two other use cases. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index cc0e4b37db2d..f204addc3fe8 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -162,7 +162,14 @@ struct btrfs_block_group { */ struct list_head cluster_list; - /* For delayed block group creation or deletion of empty block groups */ + /* + * Used for several lists: + * + * 1) struct btrfs_fs_info::unused_bgs + * 2) struct btrfs_fs_info::reclaim_bgs + * 3) struct btrfs_transaction::deleted_bgs + * 4) struct btrfs_trans_handle::new_bgs + */ struct list_head bg_list; /* For read-only block groups */ From f02c75e630f09ff4906b0839ed0eaa37c2316132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 11:11:33 +0200 Subject: [PATCH 166/194] btrfs: set FMODE_CAN_ODIRECT instead of a dummy direct_IO method Since commit a2ad63daa88b ("VFS: add FMODE_CAN_ODIRECT file flag") file systems can just set the FMODE_CAN_ODIRECT flag at open time instead of wiring up a dummy direct_IO method to indicate support for direct I/O. Do that for btrfs so that noop_direct_IO can eventually be removed.
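The conversion is mechanical; a minimal sketch of the pattern for any filesystem (the function name is hypothetical), matching the btrfs hunks below:

static int example_file_open(struct inode *inode, struct file *filp)
{
	/* Declare O_DIRECT support up front instead of via a dummy method. */
	filp->f_mode |= FMODE_CAN_ODIRECT;
	return 0;
}

/*
 * ...and the address_space_operations no longer need:
 *	.direct_IO = noop_direct_IO,
 */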
Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f53b7b75092d..392bc7d512a0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3700,7 +3700,8 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0219a786e794..b92c814cec97 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -11011,7 +11011,6 @@ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, From d09c51521f22f9cbdfb1cf63e5c456077c622c84 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:37 +0100 Subject: [PATCH 167/194] btrfs: add missing error handling when logging operation while COWing extent buffer When COWing an extent buffer that is not the root node, we need to log in the tree mod log that we replaced a pointer in the parent node, otherwise a tree mod log user doing a search on the b+tree can return incorrect results (that miss something). We are doing the call to btrfs_tree_mod_log_insert_key() but we totally ignore its return value. So fix this by adding the missing error handling, resulting in a transaction abort and freeing the COWed extent buffer. Fixes: f230475e62f7 ("Btrfs: put all block modifications into the tree mod log") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 385524224037..7f7f13965fe9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -595,8 +595,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE); + ret = btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE); + if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, From ede600e497b1461d06d22a7d17703d9096868bc3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:38 +0100 Subject: [PATCH 168/194] btrfs: fix extent buffer leak after tree mod log failure at split_node() At split_node(), if we fail to log the tree mod log copy operation, we return without unlocking the split extent buffer we just allocated and without decrementing the reference we own on it. Fix this by unlocking it and decrementing the ref count before returning. 
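The fix follows the usual error-path discipline for a freshly allocated and locked extent buffer, sketched here with explanatory comments (same calls as in the hunk below):

	ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_tree_unlock(split);	/* drop the lock we still hold on it */
		free_extent_buffer(split);	/* drop our reference, fixing the leak */
		btrfs_abort_transaction(trans, ret);
		return ret;
	}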
Fixes: 5de865eebb83 ("Btrfs: fix tree mod logging") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7f7f13965fe9..8496535828de 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3053,6 +3053,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { + btrfs_tree_unlock(split); + free_extent_buffer(split); btrfs_abort_transaction(trans, ret); return ret; } From 8793ed87b376844af96ae0e367a9bcdb0035312f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:39 +0100 Subject: [PATCH 169/194] btrfs: avoid tree mod log ENOMEM failures when we don't need to log When logging tree mod log operations we start by checking, in a lockless manner, if we need to log - if we don't, we just return and do nothing, otherwise we will allocate one or more tree mod log operations and then check again if we need to log. This second check will take the tree mod log lock in write mode if we need to log, otherwise it will do nothing and we just free the allocated memory and return success. We can improve on this by not returning an error in case the memory allocations fail, unless the second check tells us that we actually need to log. That is, if we fail to allocate memory and the second check tells use that we don't need to log, we can just return success and avoid returning -ENOMEM to the caller. Currently tree mod log failures are dealt with either a BUG_ON() or a transaction abort, as tree mod log operations are logged in code paths that modify a b+tree. So just avoid failing with -ENOMEM if we fail to allocate a tree mod log operation unless we actually need to log the operations, that is, if tree_mod_dont_log() returns true. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-mod-log.c | 148 +++++++++++++++++++++++++++++++--------- 1 file changed, 114 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 07c086f9e35e..3df6153d5d5a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -226,21 +226,32 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - int ret; + int ret = 0; if (!tree_mod_need_log(eb->fs_info, eb)) return 0; tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) - return -ENOMEM; + ret = -ENOMEM; if (tree_mod_dont_log(eb->fs_info, eb)) { kfree(tm); + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ return 0; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. 
+ */ + goto out_unlock; } ret = tree_mod_log_insert(eb->fs_info, tm); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) kfree(tm); @@ -282,14 +293,16 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items); if (IS_ERR(tm)) { ret = PTR_ERR(tm); tm = NULL; - goto free_tms; + goto lock; } for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { @@ -297,14 +310,28 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + /* * When we override something during the move, we log these removals. * This can only happen when we move towards the beginning of the @@ -325,10 +352,12 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, return 0; free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); + if (tm_list) { + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&eb->fs_info->tree_mod_log_lock); @@ -378,14 +407,14 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, GFP_NOFS); if (!tm_list) { ret = -ENOMEM; - goto free_tms; + goto lock; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } } @@ -393,7 +422,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm->logical = new_root->start; @@ -402,14 +431,28 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, tm->generation = btrfs_header_generation(old_root); tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. 
+ */ + goto out_unlock; + } if (tm_list) ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); if (!ret) ret = tree_mod_log_insert(fs_info, tm); +out_unlock: write_unlock(&fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -501,7 +544,8 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, struct btrfs_fs_info *fs_info = dst->fs_info; int ret = 0; struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; + struct tree_mod_elem **tm_list_add = NULL; + struct tree_mod_elem **tm_list_rem = NULL; int i; bool locked = false; struct tree_mod_elem *dst_move_tm = NULL; @@ -517,8 +561,10 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } if (dst_move_nr_items) { dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items, @@ -526,7 +572,7 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (IS_ERR(dst_move_tm)) { ret = PTR_ERR(dst_move_tm); dst_move_tm = NULL; - goto free_tms; + goto lock; } } if (src_move_nr_items) { @@ -536,7 +582,7 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, if (IS_ERR(src_move_tm)) { ret = PTR_ERR(src_move_tm); src_move_tm = NULL; - goto free_tms; + goto lock; } } @@ -547,21 +593,35 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(fs_info, NULL)) +lock: + if (tree_mod_dont_log(fs_info, NULL)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } locked = true; + /* + * We previously failed to allocate memory and we need to log, so we + * have to fail. + */ + if (ret != 0) + goto free_tms; + if (dst_move_tm) { ret = tree_mod_log_insert(fs_info, dst_move_tm); if (ret) @@ -593,10 +653,12 @@ free_tms: if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node)) rb_erase(&src_move_tm->node, &fs_info->tree_mod_log); kfree(src_move_tm); - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); + if (tm_list) { + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } } if (locked) write_unlock(&fs_info->tree_mod_log_lock); @@ -617,22 +679,38 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) nritems = btrfs_header_nritems(eb); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; + if (!tm_list) { + ret = -ENOMEM; + goto lock; + } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; - goto free_tms; + goto lock; } } - if (tree_mod_dont_log(eb->fs_info, eb)) +lock: + if (tree_mod_dont_log(eb->fs_info, eb)) { + /* + * Don't error if we failed to allocate memory because we don't + * need to log. + */ + ret = 0; goto free_tms; + } else if (ret != 0) { + /* + * We previously failed to allocate memory and we need to log, + * so we have to fail. 
+ */ + goto out_unlock; + } ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); +out_unlock: write_unlock(&eb->fs_info->tree_mod_log_lock); if (ret) goto free_tms; @@ -641,9 +719,11 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) return 0; free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } return ret; } From 40b0a749388517de244643c09bdbb98f7dcb6ef1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:40 +0100 Subject: [PATCH 170/194] btrfs: do not BUG_ON() on tree mod log failure at __btrfs_cow_block() At __btrfs_cow_block(), instead of doing a BUG_ON() in case we fail to record a tree mod log root insertion operation, do a transaction abort instead. There's really no need for the BUG_ON(), we can properly release all resources in this context and turn the filesystem to RO mode and in an error state instead. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8496535828de..d6c29564ce49 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -584,9 +584,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) parent_start = buf->start; - atomic_inc(&cow->refs); ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } + atomic_inc(&cow->refs); rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, btrfs_root_id(root), buf, From 39020d8abc7ec62c4de9b260e3d10d4a1c2478ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:41 +0100 Subject: [PATCH 171/194] btrfs: do not BUG_ON() on tree mod log failure at balance_level() At balance_level(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in this context, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. 
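Why an abort is required rather than ignoring the failure: any later reader that replays the tree mod log would see a tree that was modified without a matching log entry. A hypothetical reader fragment, assuming the usual btrfs_search_old_slot() signature:

	/*
	 * Searches the tree as it looked at time_seq by replaying tree mod
	 * log entries.  If a writer changed the tree but could not log the
	 * change and carried on anyway, this could silently return wrong
	 * slots, so the writer must abort the transaction instead.
	 */
	ret = btrfs_search_old_slot(root, &key, path, time_seq);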
CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 ++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d6c29564ce49..d60b28c6bd1b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1054,7 +1054,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + goto enospc; + } rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); @@ -1142,7 +1147,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto enospc; + } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); } @@ -1188,7 +1196,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto enospc; + } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } From daefe4d435d75118af302dae650547b8b760da0b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:42 +0100 Subject: [PATCH 172/194] btrfs: rename enospc label to out at balance_level() At balance_level() we have this 'enospc' label that we jump to when we get an error at several places. However that error is certainly not -ENOSPC in all cases; it can be -EIO or -ENOMEM when reading a child extent buffer for example, or -ENOMEM when trying to record tree mod log operations. So to make this less confusing, rename the label to 'out'.
Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d60b28c6bd1b..e98f9e205e25 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1041,7 +1041,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(child)) { ret = PTR_ERR(child); btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } btrfs_tree_lock(child); @@ -1050,7 +1050,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); - goto enospc; + goto out; } ret = btrfs_tree_mod_log_insert_root(root->node, child, true); @@ -1058,7 +1058,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(child); free_extent_buffer(child); btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } rcu_assign_pointer(root->node, child); @@ -1087,7 +1087,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(left)) { ret = PTR_ERR(left); left = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); @@ -1096,7 +1096,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1105,7 +1105,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (IS_ERR(right)) { ret = PTR_ERR(right); right = NULL; - goto enospc; + goto out; } __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); @@ -1114,7 +1114,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; - goto enospc; + goto out; } } @@ -1149,7 +1149,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1168,12 +1168,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (!left) { ret = -EROFS; btrfs_handle_fs_error(fs_info, ret, NULL); - goto enospc; + goto out; } wret = balance_node_right(trans, mid, left); if (wret < 0) { ret = wret; - goto enospc; + goto out; } if (wret == 1) { wret = push_node_left(trans, left, mid, 1); @@ -1198,7 +1198,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_MOD_LOG_KEY_REPLACE); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto enospc; + goto out; } btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1225,7 +1225,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); -enospc: +out: if (right) { btrfs_tree_unlock(right); free_extent_buffer(right); From 87b8e9d06e3315c6f8d478fc8fc0dc4d25cf0e09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:43 +0100 Subject: [PATCH 173/194] btrfs: avoid unnecessarily setting the fs to RO and error state at balance_level() At balance_level(), when trying to promote a child node to a root node, if we fail to read the child we call btrfs_handle_fs_error(), which turns the fs to RO mode and sets it to error state as well, causing any ongoing transaction to abort. 
This, however, is not necessary because at that point we have not made any change yet at balance_level(), so any error reading the child node does not leave us in any inconsistent state. Therefore we can just return the error to the caller. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e98f9e205e25..4dcdcf25c3fe 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1040,7 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = btrfs_read_node_slot(mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); - btrfs_handle_fs_error(fs_info, ret, NULL); goto out; } From 725026ed593f1b6da66fe7e3777cd891dbf7343f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:44 +0100 Subject: [PATCH 174/194] btrfs: abort transaction at balance_level() when left child is missing At balance_level() we are calling btrfs_handle_fs_error() when the middle child only has 1 item and the left child is missing, however we can simply use btrfs_abort_transaction(), which achieves the same purpose: to turn the fs to error state, abort the current transaction and turn the fs to RO mode. Besides that, btrfs_abort_transaction() also prints a stack trace which makes it more useful. Also, as this is a highly unexpected case and it's about a b+tree inconsistency, change the error code from -EROFS to -EUCLEAN, tag the if branch as 'unlikely' and log an explicit error message.
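The corruption-reporting pattern used by this patch and the following one can be distilled into the sketch below; the condition and the message text are placeholders, only the shape of the construct is taken from the diffs:

	/* "corruption_detected" and the message are placeholders for the sketch. */
	if (unlikely(corruption_detected)) {
		btrfs_crit(fs_info, "describe the inconsistency, including key values");
		ret = -EUCLEAN;		/* inconsistent metadata, not merely read-only */
		btrfs_abort_transaction(trans, ret);
		goto out;
	}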
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 00eea2925d1d..b643bd8656bf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -421,9 +421,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, &refs, &flags); if (ret) return ret; - if (refs == 0) { - ret = -EROFS; - btrfs_handle_fs_error(fs_info, ret, NULL); + if (unlikely(refs == 0)) { + btrfs_crit(fs_info, + "found 0 references for tree block at bytenr %llu level %d root %llu", + buf->start, btrfs_header_level(buf), + btrfs_root_id(root)); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); return ret; } } else { From 11d6ae03557e34dd2bc9e57d1a5139ab3c7be54f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:46 +0100 Subject: [PATCH 176/194] btrfs: do not BUG_ON() on tree mod log failures at push_nodes_for_insert() At push_nodes_for_insert(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the caller. There's really no need for the BUG_ON() as we can release all resources in this context, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b643bd8656bf..dc60ece85664 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1308,7 +1308,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(left); + free_extent_buffer(left); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -1363,7 +1368,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, BTRFS_MOD_LOG_KEY_REPLACE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + btrfs_abort_transaction(trans, ret); + return ret; + } btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); From f61aa7ba08abc09eaf8eb766d9469e9393c22dbf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:47 +0100 Subject: [PATCH 177/194] btrfs: do not BUG_ON() on tree mod log failure at insert_new_root() At insert_new_root(), instead of doing a BUG_ON() in case we fail to record the tree mod log operation, just return the error to the callers after releasing the allocated tree block. At this point we haven't made any changes to the b+tree, so we haven't left it in an inconsistent state and therefore have no need to abort the transaction. All we need to do is to unlock and free the extent buffer we just allocated with the purpose of making it the new root. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc60ece85664..58325b8f544f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2964,7 +2964,12 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + btrfs_tree_unlock(c); + free_extent_buffer(c); + return ret; + } rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ From 50b5d1fc41da21a4ecf219f37fbca23c79020b08 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:48 +0100 Subject: [PATCH 178/194] btrfs: do not BUG_ON() on tree mod log failures at insert_ptr() At insert_ptr(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in the context of all callers, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. This implies making insert_ptr() return an int instead of void, and making all callers check for returned errors. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 71 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 58325b8f544f..e075e6994d79 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2990,10 +2990,10 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * slot and level indicate where you want the key to go, and * blocknr is the block the key points to. 
*/ -static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct btrfs_disk_key *key, u64 bytenr, - int slot, int level) +static int insert_ptr(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_disk_key *key, u64 bytenr, + int slot, int level) { struct extent_buffer *lower; int nritems; @@ -3009,7 +3009,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, slot, nritems - slot); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(lower, slot + 1), @@ -3019,7 +3022,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, BTRFS_MOD_LOG_KEY_ADD); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } btrfs_set_node_key(lower, key, slot); btrfs_set_node_blockptr(lower, slot, bytenr); @@ -3027,6 +3033,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); btrfs_mark_buffer_dirty(lower); + + return 0; } /* @@ -3106,8 +3114,13 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, path, &disk_key, split->start, - path->slots[level + 1] + 1, level + 1); + ret = insert_ptr(trans, path, &disk_key, split->start, + path->slots[level + 1] + 1, level + 1); + if (ret < 0) { + btrfs_tree_unlock(split); + free_extent_buffer(split); + return ret; + } if (path->slots[level] >= mid) { path->slots[level] -= mid; @@ -3584,16 +3597,17 @@ out: * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. 
*/ -static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_path *path, - struct extent_buffer *l, - struct extent_buffer *right, - int slot, int mid, int nritems) +static noinline int copy_for_split(struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct extent_buffer *l, + struct extent_buffer *right, + int slot, int mid, int nritems) { struct btrfs_fs_info *fs_info = trans->fs_info; int data_copy_size; int rt_data_off; int i; + int ret; struct btrfs_disk_key disk_key; struct btrfs_map_token token; @@ -3618,7 +3632,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1); + if (ret < 0) + return ret; btrfs_mark_buffer_dirty(right); btrfs_mark_buffer_dirty(l); @@ -3636,6 +3652,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, } BUG_ON(path->slots[0] < 0); + + return 0; } /* @@ -3834,8 +3852,13 @@ again: if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1] + 1, 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1] + 1, 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3843,8 +3866,13 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, path, &disk_key, - right->start, path->slots[1], 1); + ret = insert_ptr(trans, path, &disk_key, + right->start, path->slots[1], 1); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3860,7 +3888,12 @@ again: return ret; } - copy_for_split(trans, path, l, right, slot, mid, nritems); + ret = copy_for_split(trans, path, l, right, slot, mid, nritems); + if (ret < 0) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (split == 2) { BUG_ON(num_doubles != 0); From 751a27615ddaaf95519565d83bac65b8aafab9e8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 8 Jun 2023 11:27:49 +0100 Subject: [PATCH 179/194] btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr() At btrfs_del_ptr(), instead of doing a BUG_ON() in case we fail to record tree mod log operations, do a transaction abort and return the error to the callers. There's really no need for the BUG_ON() as we can release all resources in the context of all callers, and we have to abort because other future tree searches that use the tree mod log (btrfs_search_old_slot()) may get inconsistent results if other operations modify the tree after that failure and before the tree mod log based search. This implies making btrfs_del_ptr() return an int instead of void, and making all callers check for returned errors.
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 52 ++++++++++++++++++++++++++++++++++++------------ fs/btrfs/ctree.h | 4 ++-- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e075e6994d79..190dd90a88eb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1139,7 +1139,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(right) == 0) { btrfs_clear_buffer_dirty(trans, right); btrfs_tree_unlock(right); - btrfs_del_ptr(root, path, level + 1, pslot + 1); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1); + if (ret < 0) { + free_extent_buffer_stale(right); + right = NULL; + goto out; + } root_sub_used(root, right->len); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); @@ -1192,7 +1197,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (btrfs_header_nritems(mid) == 0) { btrfs_clear_buffer_dirty(trans, mid); btrfs_tree_unlock(mid); - btrfs_del_ptr(root, path, level + 1, pslot); + ret = btrfs_del_ptr(trans, root, path, level + 1, pslot); + if (ret < 0) { + free_extent_buffer_stale(mid); + mid = NULL; + goto out; + } root_sub_used(root, mid->len); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); @@ -4440,8 +4450,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * * This is exported for use inside btrfs-progs, don't un-export it. */ -void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot) +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) { struct extent_buffer *parent = path->nodes[level]; u32 nritems; @@ -4452,7 +4462,10 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, if (level) { ret = btrfs_tree_mod_log_insert_move(parent, slot, slot + 1, nritems - slot - 1); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(parent, slot), @@ -4462,7 +4475,10 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, BTRFS_MOD_LOG_KEY_REMOVE); - BUG_ON(ret < 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + return ret; + } } nritems--; @@ -4478,6 +4494,7 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, fixup_low_keys(path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); + return 0; } /* @@ -4490,13 +4507,17 @@ void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. 
*/ -static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *leaf) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { + int ret; + WARN_ON(btrfs_header_generation(leaf) != trans->transid); - btrfs_del_ptr(root, path, 1, path->slots[1]); + ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]); + if (ret < 0) + return ret; /* * btrfs_free_extent is expensive, we want to make sure we @@ -4509,6 +4530,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); free_extent_buffer_stale(leaf); + return 0; } /* * delete the item at the leaf level in path. If that empties @@ -4558,7 +4580,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_clear_buffer_dirty(trans, leaf); - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; } } else { int used = leaf_space_used(leaf, 0, nritems); @@ -4619,7 +4643,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - btrfs_del_leaf(trans, root, path, leaf); + ret = btrfs_del_leaf(trans, root, path, leaf); + if (ret < 0) + return ret; free_extent_buffer(leaf); ret = 0; } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5af61480dde6..f2d2b313bde5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,8 +541,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, - int slot); +int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, From 7569141e8fa855b509e674456132b83bca5f087c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 9 Jun 2023 11:49:07 +0100 Subject: [PATCH 180/194] btrfs: replace BUG_ON() at split_item() with proper error handling There's no need to BUG_ON() at split_item() if the leaf does not have enough free space for the new item, we can just return -ENOSPC since the caller can deal with errors from split_item(). Also, as this is a very unlikely condition to happen, because the caller has invoked setup_leaf_for_split() before calling split_item(), surround the condition with a WARN_ON() which makes it easier to notice this unexpected condition and tags the if branch with 'unlikely' as well. I've actually once hit this BUG_ON() with some incorrect code changes I had, which was very inconvenient as it required rebooting the VM. 
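The single-statement form works because in the kernel WARN_ON(cond) evaluates to the condition it was passed (already wrapped in unlikely()), so it can both emit a stack trace and gate the error path. A minimal sketch of the idiom used by the hunk below:

    if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
        return -ENOSPC;    /* warn loudly, then fail gracefully */

whereas the old BUG_ON() would have crashed the task and required a reboot, as described above.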
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 190dd90a88eb..a4cb4b642987 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4000,7 +4000,12 @@ static noinline int split_item(struct btrfs_path *path, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)); + /* + * Shouldn't happen because the caller must have previously called + * setup_leaf_for_split() to make room for the new item in the leaf. + */ + if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item))) + return -ENOSPC; orig_slot = path->slots[0]; orig_offset = btrfs_item_offset(leaf, path->slots[0]); From fc4026e26b3383b896af5c0923ada8ae7064ca07 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Jun 2023 11:40:17 +0100 Subject: [PATCH 181/194] btrfs: do not BUG_ON() when dropping inode items from log root When dropping inode items from a log tree at drop_inode_items(), we have this BUG_ON() on the result of btrfs_search_slot() because we don't expect an exact match, since having a key with an offset of (u64)-1 is unexpected. That is generally true, but for dir index keys for example, we can get a key with such an offset value, even though it's very unlikely and it would take ages to increase the sequence counter for a dir index up to (u64)-1. We can deal with an exact match, we just have to delete the key at that slot, so there is really no need to BUG_ON(), error out or trigger any warning. So remove the BUG_ON(). Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f91a6175fd11..365a1cc0a3c3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4056,14 +4056,14 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - BUG_ON(ret == 0); /* Logic error */ - if (ret < 0) + if (ret < 0) { break; + } else if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } - if (path->slots[0] == 0) - break; - - path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); From 6f3eb72a1f26ee84ed9de99ffa9edb1a3748c33b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Jun 2023 11:40:59 +0100 Subject: [PATCH 182/194] btrfs: send: do not BUG_ON() on unexpected symlink data extent There's really no need to BUG_ON() if we find a symlink with an extent that is not inline or is compressed. We can just make send fail with an error (-EUCLEAN) and log an informative error message, so just do that instead of BUG_ON().
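With the change, hitting such a corrupted symlink during a send produces a critical log message roughly like the following (device name, inode and root numbers are made up for illustration; the exact prefix comes from the btrfs_crit() format):

    BTRFS critical (device sdb): send: found symlink extent that is not inline, ino 257 root 5 extent type 1

and the send operation fails with -EUCLEAN instead of crashing the kernel.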
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index af2e153543a5..8bfd44750efe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1774,9 +1774,21 @@ static int read_symlink(struct btrfs_root *root, ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(path->nodes[0], ei); + if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent that is not inline, ino %llu root %llu extent type %d", + ino, btrfs_root_id(root), type); + goto out; + } compression = btrfs_file_extent_compression(path->nodes[0], ei); - BUG_ON(type != BTRFS_FILE_EXTENT_INLINE); - BUG_ON(compression); + if (unlikely(compression != BTRFS_COMPRESS_NONE)) { + ret = -EUCLEAN; + btrfs_crit(root->fs_info, +"send: found symlink extent with compression, ino %llu root %llu compression type %d", + ino, btrfs_root_id(root), compression); + goto out; + } off = btrfs_file_extent_inline_start(ei); len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); From df9f278239046719c91aeb59ec0afb1a99ee8b2b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jun 2023 16:42:16 +0100 Subject: [PATCH 183/194] btrfs: do not BUG_ON on failure to get dir index for new snapshot During the transaction commit path, at create_pending_snapshot(), there is no need to BUG_ON() in case we fail to get a dir index for the snapshot in the parent directory. This should fail very rarely because the parent inode should be loaded in memory already, with the respective delayed inode created and the parent inode's index_cnt field already initialized. However if it fails, it may be -ENOMEM like the comment at create_pending_snapshot() says or any error returned by btrfs_search_slot() through btrfs_set_inode_index_count(), which can be pretty much anything such as -EIO or -EUCLEAN for example. So the comment is not correct when it says it can only be -ENOMEM. However doing a BUG_ON() here is overkill, since we can instead abort the transaction and return the error. Note that any error returned by create_pending_snapshot() will eventually result in a transaction abort at cleanup_transaction(), called from btrfs_commit_transaction(), but we can explicitly abort the transaction at this point instead so that we get a stack trace to tell us that the call to btrfs_set_inode_index() failed. So just abort the transaction and return in case btrfs_set_inode_index() returned an error at create_pending_snapshot(). Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fe0f00e717a8..cf306351b148 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1684,7 +1684,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * insert the directory item */ ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } /* check if there is a file/dir which has the same name. 
*/ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, From 6822b3f708608d74820cd69e76de16d42899a230 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 Jun 2023 17:07:54 +0100 Subject: [PATCH 184/194] btrfs: do not BUG_ON after failure to migrate space during truncation During truncation we reserve 2 metadata units when starting a transaction (reserved space goes to fs_info->trans_block_rsv) and then attempt to migrate 1 unit (min_size bytes) from fs_info->trans_block_rsv into the local block reserve. If we ever fail we trigger a BUG_ON(), which should never happen, because we reserved 2 units. However if we happen to fail for some reason, we don't need to be so dire and hit a BUG_ON(), we can just error out the truncation and, since this is highly unexpected, surround the error check with WARN_ON(), to get an informative stack trace and tag the branch as 'unlikely'. Also make the 'min_size' variable const, since it's not supposed to ever change and any accidental change could possibly make the space migration not so unlikely to fail. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b92c814cec97..dbbb67293e34 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8344,7 +8344,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; - u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(&inode->vfs_inode, @@ -8401,7 +8401,15 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + /* + * We have reserved 2 metadata units when we started the transaction and + * min_size matches 1 unit, so this should never fail, but if it does, + * it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) { + btrfs_end_transaction(trans); + goto out; + } trans->block_rsv = rsv; @@ -8449,7 +8457,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + /* + * We have reserved 2 metadata units when we started the + * transaction and min_size matches 1 unit, so this should never + * fail, but if it does, it's not critical we just fail truncation. + */ + if (WARN_ON(ret)) + break; + trans->block_rsv = rsv; } From c2bbc0bab0bb3cdd38914fe714f9b6c3f7544e88 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Jun 2023 14:14:29 +0800 Subject: [PATCH 185/194] btrfs: scrub: remove scrub_ctx::csum_list member Since the rework of scrub introduced by commit 2af2aaf98205 ("btrfs: scrub: introduce structure for new BTRFS_STRIPE_LEN based interface") and later commits, scrub no longer keeps its data checksum inside sctx. Instead we have scrub_stripe::csums for the checksum of the stripe. Thus we can remove the unused scrub_ctx::csum_list member.
Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0cf8e2f277da..297beaefd49c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -177,7 +177,6 @@ struct scrub_ctx { struct btrfs_fs_info *fs_info; int first_free; int cur_stripe; - struct list_head csum_list; atomic_t cancel_req; int readonly; int sectors_per_bio; @@ -309,17 +308,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } -static void scrub_free_csums(struct scrub_ctx *sctx) -{ - while (!list_empty(&sctx->csum_list)) { - struct btrfs_ordered_sum *sum; - sum = list_first_entry(&sctx->csum_list, - struct btrfs_ordered_sum, list); - list_del(&sum->list); - kfree(sum); - } -} - static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) { int i; @@ -330,7 +318,6 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) release_scrub_stripe(&sctx->stripes[i]); - scrub_free_csums(sctx); kfree(sctx); } @@ -352,7 +339,6 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - INIT_LIST_HEAD(&sctx->csum_list); for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { int ret; From 81db6ae842b3c07edd67278e3693f53a28d694cf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 12 Jun 2023 15:23:29 +0800 Subject: [PATCH 186/194] btrfs: scrub: remove btrfs_fs_info::scrub_wr_completion_workers Since the scrub rework introduced by commit 2af2aaf98205 ("btrfs: scrub: introduce structure for new BTRFS_STRIPE_LEN based interface") and later commits, scrub only needs one single workqueue, fs_info::scrub_worker. That scrub_wr_completion_workers was initially used to handle the delayed work after write bios finished. But the new scrub code goes submit-and-wait for write bios, thus all the work is done inside the scrub_worker. The last user of fs_info::scrub_wr_completion_workers is removed in commit 16f93993498b ("btrfs: scrub: remove the old writeback infrastructure"), so we can safely remove the workqueue.
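The difference can be sketched like this (illustrative only, the names are not the actual scrub code; submit_bio() and wait_for_completion() are the generic kernel primitives):

    /* Old model: the write end_io handler queued a work item on the
     * dedicated scrub_wr_completion_workers workqueue. */

    /* New model: submit and wait synchronously inside the scrub worker. */
    submit_bio(bio);
    wait_for_completion(&io_done);    /* completed from the bio end_io callback */
    /* handle the write result right here, still in scrub_worker */

so there is no deferred completion work left that would need a second workqueue.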
Reviewed-by: Christoph Hellwig Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 1 - fs/btrfs/scrub.c | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 5dd24c2916a1..396e2a463480 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -642,7 +642,6 @@ struct btrfs_fs_info { */ refcount_t scrub_workers_refcnt; struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 297beaefd49c..2c7fdbb60314 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2698,17 +2698,12 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { struct workqueue_struct *scrub_workers = fs_info->scrub_workers; - struct workqueue_struct *scrub_wr_comp = - fs_info->scrub_wr_completion_workers; fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; mutex_unlock(&fs_info->scrub_lock); if (scrub_workers) destroy_workqueue(scrub_workers); - if (scrub_wr_comp) - destroy_workqueue(scrub_wr_comp); } } @@ -2719,7 +2714,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { struct workqueue_struct *scrub_workers = NULL; - struct workqueue_struct *scrub_wr_comp = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -2732,18 +2726,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, else scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) - goto fail_scrub_workers; - - scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); - if (!scrub_wr_comp) - goto fail_scrub_wr_completion_workers; + return -ENOMEM; mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL && - fs_info->scrub_wr_completion_workers == NULL); + ASSERT(fs_info->scrub_workers == NULL); fs_info->scrub_workers = scrub_workers; - fs_info->scrub_wr_completion_workers = scrub_wr_comp; refcount_set(&fs_info->scrub_workers_refcnt, 1); mutex_unlock(&fs_info->scrub_lock); return 0; @@ -2754,10 +2742,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, ret = 0; - destroy_workqueue(scrub_wr_comp); -fail_scrub_wr_completion_workers: destroy_workqueue(scrub_workers); -fail_scrub_workers: return ret; } From 160fe8f6fdb13da6111677be6263e5d65e875987 Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Mon, 5 Jun 2023 16:49:45 -0700 Subject: [PATCH 187/194] btrfs: add handling for RAID1C23/DUP to btrfs_reduce_alloc_profile Callers of `btrfs_reduce_alloc_profile` expect it to return exactly one allocation profile flag, and failing to do so may ultimately result in a WARN_ON and remount-ro when allocating new blocks, like the below transaction abort on 6.1. `btrfs_reduce_alloc_profile` has two ways of determining the profile, first it checks if a conversion balance is currently running and uses the profile we're converting to. If no balance is currently running, it returns the max-redundancy profile which at least one block in the selected block group has. This works by simply checking each known allocation profile bit in redundancy order. 
However, `btrfs_reduce_alloc_profile` has not been updated as new flags have been added - first with the `DUP` profile and later with the RAID1C34 profiles. Because of the way it checks, if we have blocks with different profiles and at least one is known, that profile will be selected. However, if none are known we may return a flag set with multiple allocation profiles set. This is currently only possible when a balance from one of the three unhandled profiles to another of the unhandled profiles is canceled after allocating at least one block using the new profile. In that case, a transaction abort like the below will occur and the filesystem will need to be mounted with -o skip_balance to get it mounted rw again (but the balance cannot be resumed without a similar abort). [770.648] ------------[ cut here ]------------ [770.648] BTRFS: Transaction aborted (error -22) [770.648] WARNING: CPU: 43 PID: 1159593 at fs/btrfs/extent-tree.c:4122 find_free_extent+0x1d94/0x1e00 [btrfs] [770.648] CPU: 43 PID: 1159593 Comm: btrfs Tainted: G W 6.1.0-0.deb11.7-powerpc64le #1 Debian 6.1.20-2~bpo11+1a~test [770.648] Hardware name: T2P9D01 REV 1.00 POWER9 0x4e1202 opal:skiboot-bc106a0 PowerNV [770.648] NIP: c00800000f6784fc LR: c00800000f6784f8 CTR: c000000000d746c0 [770.648] REGS: c000200089afe9a0 TRAP: 0700 Tainted: G W (6.1.0-0.deb11.7-powerpc64le Debian 6.1.20-2~bpo11+1a~test) [770.648] MSR: 9000000002029033 CR: 28848282 XER: 20040000 [770.648] CFAR: c000000000135110 IRQMASK: 0 GPR00: c00800000f6784f8 c000200089afec40 c00800000f7ea800 0000000000000026 GPR04: 00000001004820c2 c000200089afea00 c000200089afe9f8 0000000000000027 GPR08: c000200ffbfe7f98 c000000002127f90 ffffffffffffffd8 0000000026d6a6e8 GPR12: 0000000028848282 c000200fff7f3800 5deadbeef0000122 c00000002269d000 GPR16: c0002008c7797c40 c000200089afef17 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000001 c000200008bc5a98 0000000000000001 GPR24: 0000000000000000 c0000003c73088d0 c000200089afef17 c000000016d3a800 GPR28: c0000003c7308800 c00000002269d000 ffffffffffffffea 0000000000000001 [770.648] NIP [c00800000f6784fc] find_free_extent+0x1d94/0x1e00 [btrfs] [770.648] LR [c00800000f6784f8] find_free_extent+0x1d90/0x1e00 [btrfs] [770.648] Call Trace: [770.648] [c000200089afec40] [c00800000f6784f8] find_free_extent+0x1d90/0x1e00 [btrfs] (unreliable) [770.648] [c000200089afed30] [c00800000f681398] btrfs_reserve_extent+0x1a0/0x2f0 [btrfs] [770.648] [c000200089afeea0] [c00800000f681bf0] btrfs_alloc_tree_block+0x108/0x670 [btrfs] [770.648] [c000200089afeff0] [c00800000f66bd68] __btrfs_cow_block+0x170/0x850 [btrfs] [770.648] [c000200089aff100] [c00800000f66c58c] btrfs_cow_block+0x144/0x288 [btrfs] [770.648] [c000200089aff1b0] [c00800000f67113c] btrfs_search_slot+0x6b4/0xcb0 [btrfs] [770.648] [c000200089aff2a0] [c00800000f679f60] lookup_inline_extent_backref+0x128/0x7c0 [btrfs] [770.648] [c000200089aff3b0] [c00800000f67b338] lookup_extent_backref+0x70/0x190 [btrfs] [770.648] [c000200089aff470] [c00800000f67b54c] __btrfs_free_extent+0xf4/0x1490 [btrfs] [770.648] [c000200089aff5a0] [c00800000f67d770] __btrfs_run_delayed_refs+0x328/0x1530 [btrfs] [770.648] [c000200089aff740] [c00800000f67ea2c] btrfs_run_delayed_refs+0xb4/0x3e0 [btrfs] [770.648] [c000200089aff800] [c00800000f699aa4] btrfs_commit_transaction+0x8c/0x12b0 [btrfs] [770.648] [c000200089aff8f0] [c00800000f6dc628] reset_balance_state+0x1c0/0x290 [btrfs] [770.648] [c000200089aff9a0] [c00800000f6e2f7c] btrfs_balance+0x1164/0x1500 [btrfs] [770.648] [c000200089affb40] 
[c00800000f6f8e4c] btrfs_ioctl+0x2b54/0x3100 [btrfs] [770.648] [c000200089affc80] [c00000000053be14] sys_ioctl+0x794/0x1310 [770.648] [c000200089affd70] [c00000000002af98] system_call_exception+0x138/0x250 [770.648] [c000200089affe10] [c00000000000c654] system_call_common+0xf4/0x258 [770.648] --- interrupt: c00 at 0x7fff94126800 [770.648] NIP: 00007fff94126800 LR: 0000000107e0b594 CTR: 0000000000000000 [770.648] REGS: c000200089affe80 TRAP: 0c00 Tainted: G W (6.1.0-0.deb11.7-powerpc64le Debian 6.1.20-2~bpo11+1a~test) [770.648] MSR: 900000000000d033 CR: 24002848 XER: 00000000 [770.648] IRQMASK: 0 GPR00: 0000000000000036 00007fffc9439da0 00007fff94217100 0000000000000003 GPR04: 00000000c4009420 00007fffc9439ee8 0000000000000000 0000000000000000 GPR08: 00000000803c7416 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 00007fff9467d120 0000000107e64c9c 0000000107e64d0a GPR16: 0000000107e64d06 0000000107e64cf1 0000000107e64cc4 0000000107e64c73 GPR20: 0000000107e64c31 0000000107e64bf1 0000000107e64be7 0000000000000000 GPR24: 0000000000000000 00007fffc9439ee0 0000000000000003 0000000000000001 GPR28: 00007fffc943f713 0000000000000000 00007fffc9439ee8 0000000000000000 [770.648] NIP [00007fff94126800] 0x7fff94126800 [770.648] LR [0000000107e0b594] 0x107e0b594 [770.648] --- interrupt: c00 [770.648] Instruction dump: [770.648] 3b00ffe4 e8898828 481175f5 60000000 4bfff4fc 3be00000 4bfff570 3d220000 [770.648] 7fc4f378 e8698830 4811cd95 e8410018 <0fe00000> f9c10060 f9e10068 fa010070 [770.648] ---[ end trace 0000000000000000 ]--- [770.648] BTRFS: error (device dm-2: state A) in find_free_extent_update_loop:4122: errno=-22 unknown [770.648] BTRFS info (device dm-2: state EA): forced readonly [770.648] BTRFS: error (device dm-2: state EA) in __btrfs_free_extent:3070: errno=-22 unknown [770.648] BTRFS error (device dm-2: state EA): failed to run delayed ref for logical 17838685708288 num_bytes 24576 type 184 action 2 ref_mod 1: -22 [770.648] BTRFS: error (device dm-2: state EA) in btrfs_run_delayed_refs:2144: errno=-22 unknown [770.648] BTRFS: error (device dm-2: state EA) in reset_balance_state:3599: errno=-22 unknown Fixes: 47e6f7423b91 ("btrfs: add support for 3-copy replication (raid1c3)") Fixes: 8d6fac0087e5 ("btrfs: add support for 4-copy replication (raid1c4)") CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Matt Corallo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 49e92226f766..6753524b146c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -95,14 +95,21 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) } allowed &= flags; - if (allowed & BTRFS_BLOCK_GROUP_RAID6) + /* Select the highest-redundancy RAID level. 
*/ + if (allowed & BTRFS_BLOCK_GROUP_RAID1C4) + allowed = BTRFS_BLOCK_GROUP_RAID1C4; + else if (allowed & BTRFS_BLOCK_GROUP_RAID6) allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3) + allowed = BTRFS_BLOCK_GROUP_RAID1C3; else if (allowed & BTRFS_BLOCK_GROUP_RAID5) allowed = BTRFS_BLOCK_GROUP_RAID5; else if (allowed & BTRFS_BLOCK_GROUP_RAID10) allowed = BTRFS_BLOCK_GROUP_RAID10; else if (allowed & BTRFS_BLOCK_GROUP_RAID1) allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_DUP) + allowed = BTRFS_BLOCK_GROUP_DUP; else if (allowed & BTRFS_BLOCK_GROUP_RAID0) allowed = BTRFS_BLOCK_GROUP_RAID0; From e794203e9d2d610faf6c5926345091193b202af5 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 16 Jun 2023 18:24:43 +0100 Subject: [PATCH 188/194] btrfs: make btrfs_compressed_bioset static The 'btrfs_compressed_bioset' struct isn't exported outside of the fs/btrfs/compression.c file, so make it static to fix the following sparse warning: fs/btrfs/compression.c:40:16: warning: symbol 'btrfs_compressed_bioset' was not declared. Should it be static? Reviewed-by: Christoph Hellwig Signed-off-by: Ben Dooks Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f1004259c3d3..8818ed5c390f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -37,7 +37,7 @@ #include "file-item.h" #include "super.h" -struct bio_set btrfs_compressed_bioset; +static struct bio_set btrfs_compressed_bioset; static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; From c9e561c4753c8c561452393762b957241e74e820 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Jun 2023 08:49:45 -0400 Subject: [PATCH 189/194] btrfs: update i_version in update_dev_time When updating the ctime, we also want to update i_version. This is just something I noticed by inspection. There is probably no way to test this today unless you can somehow get to this inode via nfsd. Still, I think it's the right thing to do for consistency's sake. David Sterba's comment: I don't see anything wrong with setting the iversion bit, however I also don't see where this would be useful. Agreed with the consistency, otherwise the time is updated when device super block is wiped or a device initialized, both are big events so missing that due to lack of iversion update seems unlikely. I'll add it to the queue, thanks. Signed-off-by: Jeff Layton [ add comments ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8137c04f31c9..b8540af6e136 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1928,7 +1928,7 @@ static void update_dev_time(const char *device_path) return; now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); path_put(&path); } From 6442550027f77a76efa7873b946c34c4a733febd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 19 Jun 2023 11:15:31 +0900 Subject: [PATCH 190/194] btrfs: tracepoints: also show actual number of the outstanding extents The btrfs_inode_mod_outstanding_extents trace event only shows the modified number to the number of outstanding extents. It would be helpful if we can see the resulting extent number as well. 
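With the extra field, an entry in the trace output looks roughly like this (hypothetical values, formatted by the TP_printk() in the hunk below):

    btrfs_inode_mod_outstanding_extents: root=5(FS_TREE) ino=257 mod=1 outstanding=2

so both the delta and the resulting number of outstanding extents are visible at a glance.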
Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8abf96cfea8f..d47a927b3504 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -332,7 +332,7 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, if (btrfs_is_free_space_inode(inode)) return; trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), - mod); + mod, inode->outstanding_extents); } /* diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c6eee9b414cf..a8206f5332e9 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2011,25 +2011,27 @@ DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, ); TRACE_EVENT(btrfs_inode_mod_outstanding_extents, - TP_PROTO(const struct btrfs_root *root, u64 ino, int mod), + TP_PROTO(const struct btrfs_root *root, u64 ino, int mod, unsigned outstanding), - TP_ARGS(root, ino, mod), + TP_ARGS(root, ino, mod, outstanding), TP_STRUCT__entry_btrfs( __field( u64, root_objectid ) __field( u64, ino ) __field( int, mod ) + __field( unsigned, outstanding ) ), TP_fast_assign_btrfs(root->fs_info, __entry->root_objectid = root->root_key.objectid; __entry->ino = ino; __entry->mod = mod; + __entry->outstanding = outstanding; ), - TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d", + TP_printk_btrfs("root=%llu(%s) ino=%llu mod=%d outstanding=%u", show_root_type(__entry->root_objectid), - __entry->ino, __entry->mod) + __entry->ino, __entry->mod, __entry->outstanding) ); DECLARE_EVENT_CLASS(btrfs__block_group, From b31cb5a6eb7a48b0a7bfdf06832b1fd5088d8c79 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Jun 2023 17:21:47 +0100 Subject: [PATCH 191/194] btrfs: fix race when deleting quota root from the dirty cow roots list When disabling quotas we are deleting the quota root from the list fs_info->dirty_cowonly_roots without taking the lock that protects it, which is struct btrfs_fs_info::trans_lock. This unsynchronized list manipulation may cause chaos if there's another concurrent manipulation of this list, such as when adding a root to it with ctree.c:add_root_to_dirty_list(). This can result in all sorts of weird failures caused by a race, such as the following crash: [337571.278245] general protection fault, probably for non-canonical address 0xdead000000000108: 0000 [#1] PREEMPT SMP PTI [337571.278933] CPU: 1 PID: 115447 Comm: btrfs Tainted: G W 6.4.0-rc6-btrfs-next-134+ #1 [337571.279153] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [337571.279572] RIP: 0010:commit_cowonly_roots+0x11f/0x250 [btrfs] [337571.279928] Code: 85 38 06 00 (...) 
[337571.280363] RSP: 0018:ffff9f63446efba0 EFLAGS: 00010206 [337571.280582] RAX: ffff942d98ec2638 RBX: ffff9430b82b4c30 RCX: 0000000449e1c000 [337571.280798] RDX: dead000000000100 RSI: ffff9430021e4900 RDI: 0000000000036070 [337571.281015] RBP: ffff942d98ec2000 R08: ffff942d98ec2000 R09: 000000000000015b [337571.281254] R10: 0000000000000009 R11: 0000000000000001 R12: ffff942fe8fbf600 [337571.281476] R13: ffff942dabe23040 R14: ffff942dabe20800 R15: ffff942d92cf3b48 [337571.281723] FS: 00007f478adb7340(0000) GS:ffff94349fa40000(0000) knlGS:0000000000000000 [337571.281950] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [337571.282184] CR2: 00007f478ab9a3d5 CR3: 000000001e02c001 CR4: 0000000000370ee0 [337571.282416] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [337571.282647] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [337571.282874] Call Trace: [337571.283101] [337571.283327] ? __die_body+0x1b/0x60 [337571.283570] ? die_addr+0x39/0x60 [337571.283796] ? exc_general_protection+0x22e/0x430 [337571.284022] ? asm_exc_general_protection+0x22/0x30 [337571.284251] ? commit_cowonly_roots+0x11f/0x250 [btrfs] [337571.284531] btrfs_commit_transaction+0x42e/0xf90 [btrfs] [337571.284803] ? _raw_spin_unlock+0x15/0x30 [337571.285031] ? release_extent_buffer+0x103/0x130 [btrfs] [337571.285305] reset_balance_state+0x152/0x1b0 [btrfs] [337571.285578] btrfs_balance+0xa50/0x11e0 [btrfs] [337571.285864] ? __kmem_cache_alloc_node+0x14a/0x410 [337571.286086] btrfs_ioctl+0x249a/0x3320 [btrfs] [337571.286358] ? mod_objcg_state+0xd2/0x360 [337571.286577] ? refill_obj_stock+0xb0/0x160 [337571.286798] ? seq_release+0x25/0x30 [337571.287016] ? __rseq_handle_notify_resume+0x3ba/0x4b0 [337571.287235] ? percpu_counter_add_batch+0x2e/0xa0 [337571.287455] ? __x64_sys_ioctl+0x88/0xc0 [337571.287675] __x64_sys_ioctl+0x88/0xc0 [337571.287901] do_syscall_64+0x38/0x90 [337571.288126] entry_SYSCALL_64_after_hwframe+0x72/0xdc [337571.288352] RIP: 0033:0x7f478aaffe9b So fix this by locking struct btrfs_fs_info::trans_lock before deleting the quota root from that list. Fixes: bed92eae26cc ("Btrfs: qgroup implementation and prototypes") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f41da7ac360d..f8735b31da16 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1301,7 +1301,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) goto out; } + spin_lock(&fs_info->trans_lock); list_del(&quota_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(quota_root->node); btrfs_clear_buffer_dirty(trans, quota_root->node); From babebf023e661b90b1c78b2baa384fb03a226879 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Jun 2023 17:21:48 +0100 Subject: [PATCH 192/194] btrfs: fix race when deleting free space root from the dirty cow roots list When deleting the free space tree we are deleting the free space root from the list fs_info->dirty_cowonly_roots without taking the lock that protects it, which is struct btrfs_fs_info::trans_lock. This unsynchronized list manipulation may cause chaos if there's another concurrent manipulation of this list, such as when adding a root to it with ctree.c:add_root_to_dirty_list().
This can result in all sorts of weird failures caused by a race, such as the following crash: [337571.278245] general protection fault, probably for non-canonical address 0xdead000000000108: 0000 [#1] PREEMPT SMP PTI [337571.278933] CPU: 1 PID: 115447 Comm: btrfs Tainted: G W 6.4.0-rc6-btrfs-next-134+ #1 [337571.279153] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [337571.279572] RIP: 0010:commit_cowonly_roots+0x11f/0x250 [btrfs] [337571.279928] Code: 85 38 06 00 (...) [337571.280363] RSP: 0018:ffff9f63446efba0 EFLAGS: 00010206 [337571.280582] RAX: ffff942d98ec2638 RBX: ffff9430b82b4c30 RCX: 0000000449e1c000 [337571.280798] RDX: dead000000000100 RSI: ffff9430021e4900 RDI: 0000000000036070 [337571.281015] RBP: ffff942d98ec2000 R08: ffff942d98ec2000 R09: 000000000000015b [337571.281254] R10: 0000000000000009 R11: 0000000000000001 R12: ffff942fe8fbf600 [337571.281476] R13: ffff942dabe23040 R14: ffff942dabe20800 R15: ffff942d92cf3b48 [337571.281723] FS: 00007f478adb7340(0000) GS:ffff94349fa40000(0000) knlGS:0000000000000000 [337571.281950] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [337571.282184] CR2: 00007f478ab9a3d5 CR3: 000000001e02c001 CR4: 0000000000370ee0 [337571.282416] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [337571.282647] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [337571.282874] Call Trace: [337571.283101] [337571.283327] ? __die_body+0x1b/0x60 [337571.283570] ? die_addr+0x39/0x60 [337571.283796] ? exc_general_protection+0x22e/0x430 [337571.284022] ? asm_exc_general_protection+0x22/0x30 [337571.284251] ? commit_cowonly_roots+0x11f/0x250 [btrfs] [337571.284531] btrfs_commit_transaction+0x42e/0xf90 [btrfs] [337571.284803] ? _raw_spin_unlock+0x15/0x30 [337571.285031] ? release_extent_buffer+0x103/0x130 [btrfs] [337571.285305] reset_balance_state+0x152/0x1b0 [btrfs] [337571.285578] btrfs_balance+0xa50/0x11e0 [btrfs] [337571.285864] ? __kmem_cache_alloc_node+0x14a/0x410 [337571.286086] btrfs_ioctl+0x249a/0x3320 [btrfs] [337571.286358] ? mod_objcg_state+0xd2/0x360 [337571.286577] ? refill_obj_stock+0xb0/0x160 [337571.286798] ? seq_release+0x25/0x30 [337571.287016] ? __rseq_handle_notify_resume+0x3ba/0x4b0 [337571.287235] ? percpu_counter_add_batch+0x2e/0xa0 [337571.287455] ? __x64_sys_ioctl+0x88/0xc0 [337571.287675] __x64_sys_ioctl+0x88/0xc0 [337571.287901] do_syscall_64+0x38/0x90 [337571.288126] entry_SYSCALL_64_after_hwframe+0x72/0xdc [337571.288352] RIP: 0033:0x7f478aaffe9b So fix this by locking struct btrfs_fs_info::trans_lock before deleting the free space root from that list. 
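The race being closed is easiest to see as a timeline (hypothetical interleaving, not taken from a trace):

    Task A: delete free space tree                Task B: add_root_to_dirty_list()
      list_del(&free_space_root->dirty_list);       spin_lock(&fs_info->trans_lock);
                                                    /* link another root into
                                                     * fs_info->dirty_cowonly_roots */
                                                    spin_unlock(&fs_info->trans_lock);

Because task A touches the list without holding trans_lock, the two updates can interleave and leave dangling or poisoned pointers behind (the dead000000000100 values in the dump above match the kernel's list poison pattern), which commit_cowonly_roots() then trips over while walking the list.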
Fixes: a5ed91828518 ("Btrfs: implement the free space B-tree") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index b21da1446f2a..045ddce32eca 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1280,7 +1280,10 @@ int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; btrfs_global_root_delete(free_space_root); + + spin_lock(&fs_info->trans_lock); list_del(&free_space_root->dirty_list); + spin_unlock(&fs_info->trans_lock); btrfs_tree_lock(free_space_root->node); btrfs_clear_buffer_dirty(trans, free_space_root->node); From 08eb2ad9db0a1f65786ab9133d6fe1683c0647c8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Jun 2023 17:21:49 +0100 Subject: [PATCH 193/194] btrfs: add comment to struct btrfs_fs_info::dirty_cowonly_roots Add a comment to struct btrfs_fs_info::dirty_cowonly_roots to mention that struct btrfs_fs_info::trans_lock is the lock that protects that list. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/fs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 396e2a463480..203d2a267828 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -576,6 +576,7 @@ struct btrfs_fs_info { s32 dirty_metadata_batch; s32 delalloc_batch; + /* Protected by 'trans_lock'. */ struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; From 8a4a0b2a3eaf75ca8854f856ef29690c12b2f531 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Jun 2023 17:21:50 +0100 Subject: [PATCH 194/194] btrfs: fix race between quota disable and relocation If we disable quotas while we have a relocation of a metadata block group that has extents belonging to the quota root, we can cause the relocation to fail with -ENOENT. This is because relocation builds backref nodes for extents of the quota root and later needs to walk the backrefs and access the quota root - however if in between a task disables quotas, it results in deleting the quota root from the root tree (with btrfs_del_root(), called from btrfs_quota_disable(). This can be sporadically triggered by test case btrfs/255 from fstests: $ ./check btrfs/255 FSTYP -- btrfs PLATFORM -- Linux/x86_64 debian0 6.4.0-rc6-btrfs-next-134+ #1 SMP PREEMPT_DYNAMIC Thu Jun 15 11:59:28 WEST 2023 MKFS_OPTIONS -- /dev/sdc MOUNT_OPTIONS -- /dev/sdc /home/fdmanana/btrfs-tests/scratch_1 btrfs/255 6s ... _check_dmesg: something found in dmesg (see /home/fdmanana/git/hub/xfstests/results//btrfs/255.dmesg) - output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/255.out.bad) --- tests/btrfs/255.out 2023-03-02 21:47:53.876609426 +0000 +++ /home/fdmanana/git/hub/xfstests/results//btrfs/255.out.bad 2023-06-16 10:20:39.267563212 +0100 @@ -1,2 +1,4 @@ QA output created by 255 +ERROR: error during balancing '/home/fdmanana/btrfs-tests/scratch_1': No such file or directory +There may be more info in syslog - try dmesg | tail Silence is golden ... (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/255.out /home/fdmanana/git/hub/xfstests/results//btrfs/255.out.bad' to see the entire diff) Ran: btrfs/255 Failures: btrfs/255 Failed 1 of 1 tests To fix this make the quota disable operation take the cleaner mutex, as relocation of a block group also takes this mutex. 
This is also what we do when deleting a subvolume/snapshot, we take the cleaner mutex in the cleaner kthread (at cleaner_kthread()) and then we call btrfs_del_root() at btrfs_drop_snapshot() while under the protection of the cleaner mutex. Fixes: bed92eae26cc ("Btrfs: qgroup implementation and prototypes") CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f8735b31da16..da1f84a0eb29 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1232,12 +1232,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) int ret = 0; /* - * We need to have subvol_sem write locked, to prevent races between - * concurrent tasks trying to disable quotas, because we will unlock - * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. + * We need to have subvol_sem write locked to prevent races with + * snapshot creation. */ lockdep_assert_held_write(&fs_info->subvol_sem); + /* + * Lock the cleaner mutex to prevent races with concurrent relocation, + * because relocation may be building backrefs for blocks of the quota + * root while we are deleting the root. This is like dropping fs roots + * of deleted snapshots/subvolumes, we need the same protection. + * + * This also prevents races between concurrent tasks trying to disable + * quotas, because we will unlock and relock qgroup_ioctl_lock across + * BTRFS_FS_QUOTA_ENABLED changes. + */ + mutex_lock(&fs_info->cleaner_mutex); + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; @@ -1319,6 +1330,7 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); + mutex_unlock(&fs_info->cleaner_mutex); return ret; }
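Condensing the hunk above, the locking in btrfs_quota_disable() now nests as follows (comments added for illustration):

    /* Held in write mode by the ioctl path before we get here. */
    lockdep_assert_held_write(&fs_info->subvol_sem);
    /* New: excludes relocation, the same protection used when dropping
     * roots of deleted subvolumes/snapshots from the cleaner kthread. */
    mutex_lock(&fs_info->cleaner_mutex);
    mutex_lock(&fs_info->qgroup_ioctl_lock);

with the cleaner mutex only released after the transaction that deletes the quota root has been ended.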