From cf8cddd38bab31b284af8d51fee536be9914f6ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Oct 2016 09:27:36 +0200 Subject: [PATCH 01/45] btrfs: don't abuse REQ_OP_* flags for btrfs_map_block btrfs_map_block supports different types of mappings, which to a large extent resemble block layer operations. But they don't always match, and currently btrfs dangerously overlays its own flag over the block layer flags. This is just asking for a conflict, so introduce a different map flags enum inside of btrfs instead. Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 2 +- fs/btrfs/ctree.h | 3 -- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 6 ++-- fs/btrfs/reada.c | 4 +-- fs/btrfs/scrub.c | 17 +++++------ fs/btrfs/volumes.c | 58 ++++++++++++++++++++------------------ fs/btrfs/volumes.h | 25 ++++++++++++++-- 9 files changed, 70 insertions(+), 49 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 8e99251650b3..a6f657ffa633 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1539,7 +1539,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_device *device; length = len; - ret = btrfs_map_block(state->root->fs_info, READ, + ret = btrfs_map_block(state->root->fs_info, BTRFS_MAP_READ, bytenr, &length, &multi, mirror_num); if (ret) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b8ce2b9f7d0..e4e01a99201a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -90,9 +90,6 @@ static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ -#define REQ_GET_READ_MIRRORS (1 << 30) /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4607af38c72e..87ad2ebcac62 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2036,7 +2036,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, + ret = btrfs_map_block(root->fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8ed05d95584a..ea9ade703da2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2029,7 +2029,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, * read repair operation.
*/ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, WRITE, logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e3a5a266917..147df4cf33fc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1864,7 +1864,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, + ret = btrfs_map_block(root->fs_info, btrfs_op(bio), logical, &map_length, NULL, 0); if (ret < 0) return ret; @@ -8406,7 +8406,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int i; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + ret = btrfs_map_block(root->fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; @@ -8472,7 +8472,7 @@ next_block: btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + ret = btrfs_map_block(root->fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 75bab76739be..f7dd892669a5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -354,8 +354,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, - &bbio, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &length, &bbio, 0); if (ret || !bbio || length < blocksize) goto error; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fffb9ab8526e..589d79219c18 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1334,8 +1334,8 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0, 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); return -EIO; @@ -2191,8 +2191,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; - ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length, - &bbio, 0, 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &length, &bbio, 0, 1); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2778,7 +2778,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; - ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, + ret = btrfs_map_sblock(sctx->dev_root->fs_info, BTRFS_MAP_WRITE, sparity->logic_start, &length, &bbio, 0, 1); if (ret || !bbio || !bbio->raid_map) @@ -2988,8 +2988,9 @@ again: mapped_length = extent_len; bbio = NULL; - ret = btrfs_map_block(fs_info, READ, extent_logical, - &mapped_length, &bbio, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, + extent_logical, &mapped_length, &bbio, + 0); if (!ret) { if (!bbio || mapped_length < extent_len) ret = -EIO; @@ -4076,7 +4077,7 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, int ret; mapped_length = extent_len; - ret = btrfs_map_block(fs_info, READ, extent_logical, + 
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71a60cc01451..23df14c27cab 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5329,7 +5329,8 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) @@ -5414,7 +5415,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, raid56_full_stripe_start *= full_stripe_len; } - if (op == REQ_OP_DISCARD) { + if (op == BTRFS_MAP_DISCARD) { /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -5427,7 +5428,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (op == REQ_OP_WRITE)) { + (op == BTRFS_MAP_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); } else { @@ -5452,8 +5453,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != REQ_OP_WRITE && op != REQ_OP_DISCARD && - op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { + op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && + op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5474,7 +5475,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, int found = 0; u64 physical_of_found = 0; - ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &tmp_length, &tmp_bbio, 0, 0); if (ret) { WARN_ON(tmp_bbio != NULL); @@ -5484,7 +5485,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, tmp_num_stripes = tmp_bbio->num_stripes; if (mirror_num > tmp_num_stripes) { /* - * REQ_GET_READ_MIRRORS does not contain this + * BTRFS_MAP_GET_READ_MIRRORS does not contain this * mirror, that means that the requested area * is not left of the left cursor */ @@ -5540,17 +5541,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, (offset + *length); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == REQ_OP_DISCARD) + if (op == BTRFS_MAP_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD && - op != REQ_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && + op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || - op == REQ_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || + op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5563,8 +5564,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } } else if (map->type & 
BTRFS_BLOCK_GROUP_DUP) { - if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || - op == REQ_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || + op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5578,9 +5579,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == REQ_OP_DISCARD) + else if (op == BTRFS_MAP_DISCARD) num_stripes = min_t(u64, map->sub_stripes * (stripe_nr_end - stripe_nr_orig), map->num_stripes); @@ -5598,7 +5599,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && - (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS || + (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div_u64(raid56_full_stripe_start, @@ -5626,8 +5627,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD && - op != REQ_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && + op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5650,9 +5651,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) num_alloc_stripes <<= 1; - if (op == REQ_GET_READ_MIRRORS) + if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; tgtdev_indexes = num_stripes; } @@ -5668,7 +5669,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && - ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) || + ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5693,7 +5694,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, RAID6_Q_STRIPE; } - if (op == REQ_OP_DISCARD) { + if (op == BTRFS_MAP_DISCARD) { u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; @@ -5773,7 +5774,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } } - if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) @@ -5781,7 +5782,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, tgtdev_indexes = 0; if (dev_replace_is_ongoing && - (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) && + (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; @@ -5816,7 +5817,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } } num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) && + } else if (dev_replace_is_ongoing && + op == BTRFS_MAP_GET_READ_MIRRORS && 
dev_replace->tgtdev != NULL) { u64 srcdev_devid = dev_replace->srcdev->devid; int index_srcdev = 0; @@ -5888,7 +5890,7 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { @@ -5897,7 +5899,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, } /* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 09ed29c67848..9029a3134922 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -371,14 +371,35 @@ struct btrfs_balance_control { struct btrfs_balance_progress stat; }; +enum btrfs_map_op { + BTRFS_MAP_READ, + BTRFS_MAP_WRITE, + BTRFS_MAP_DISCARD, + BTRFS_MAP_GET_READ_MIRRORS, +}; + +static inline enum btrfs_map_op btrfs_op(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + return BTRFS_MAP_DISCARD; + case REQ_OP_WRITE: + return BTRFS_MAP_WRITE; + default: + WARN_ON_ONCE(1); + case REQ_OP_READ: + return BTRFS_MAP_READ; + } +} + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); From dc1a90c6aad8f0a4bd6e5e02c5244c6a760cd776 Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Wed, 26 Oct 2016 15:23:01 +0800 Subject: [PATCH 02/45] btrfs: cleanup: use already calculated value in btrfs_should_throttle_delayed_refs() Signed-off-by: Wang Xiaoguang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87ad2ebcac62..62b6e2023e12 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2826,7 +2826,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, smp_mb(); avg_runtime = fs_info->avg_delayed_ref_runtime; val = num_entries * avg_runtime; - if (num_entries * avg_runtime >= NSEC_PER_SEC) + if (val >= NSEC_PER_SEC) return 1; if (val >= NSEC_PER_SEC / 2) return 2; From 939659dfd3ed4ff36dde532782207cfb0e5fbcf6 Mon Sep 17 00:00:00 2001 From: Wang Xiaoguang Date: Mon, 7 Nov 2016 15:59:16 +0800 Subject: [PATCH 03/45] btrfs: add necessary comments about tickets_id The name tickets_id may result in some misunderstanding: it just indicates which ticket will be handled next, and it is not stored per ticket.
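A rough sketch of how the field is used to detect progress, condensed from the asynchronous reclaim worker (not a verbatim quote of that code):

	last_tickets_id = space_info->tickets_id;
	flush_space(fs_info, space_info, to_reclaim, flush_state);
	if (last_tickets_id == space_info->tickets_id)
		/* no ticket was handled since the last pass, so the
		 * flushing made no progress; move to the next state */
		flush_state++;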
Fixes: ce12965 ("btrfs: introduce tickets_id to determine whether asynchronous metadata reclaim work makes progress") Signed-off-by: Wang Xiaoguang Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e4e01a99201a..b26b8b363f7f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -426,6 +426,10 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + /* + * tickets_id just indicates which ticket will be handled next; note + * it's not stored per ticket. + */ u64 tickets_id; struct rw_semaphore groups_sem; From 8e2bd3b7fac91b79a6115fd1511ca20b2a09696d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 9 Nov 2016 15:26:50 -0800 Subject: [PATCH 04/45] Btrfs: deal with existing encompassing extent map in btrfs_get_extent() My QEMU VM was seeing inexplicable I/O errors that I tracked down to errors coming from the qcow2 virtual drive in the host system. The qcow2 file is a nocow file on my Btrfs drive, which QEMU opens with O_DIRECT. Every once in a while, pread() or pwrite() would return EEXIST, which makes no sense. This turned out to be a bug in btrfs_get_extent(). Commit 8dff9c853410 ("Btrfs: deal with duplciates during extent_map insertion in btrfs_get_extent") fixed a case in btrfs_get_extent() where two threads race on adding the same extent map to an inode's extent map tree. However, if the added em is merged with an adjacent em in the extent tree, then we'll end up with an existing extent that is not identical to but instead encompasses the extent we tried to add. When we call merge_extent_mapping() to find the nonoverlapping part of the new em, the arithmetic overflows because there is no such thing. We then end up trying to add a bogus em to the em_tree, which results in an EEXIST that can bubble all the way up to userspace. Fix it by extending the identical extent map special case. Signed-off-by: Omar Sandoval Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 147df4cf33fc..5707d823cb23 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7049,11 +7049,11 @@ insert: * extent causing the -EEXIST. */ if (existing->start == em->start && - extent_map_end(existing) == extent_map_end(em) && + extent_map_end(existing) >= extent_map_end(em) && em->block_start == existing->block_start) { /* - * these two extents are the same, it happens - * with inlines especially + * The existing extent map already encompasses the + * entire extent map we tried to add. */ free_extent_map(em); em = existing; From ebce0e01b930bfde74391f998d77720b2478a603 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Mon, 14 Nov 2016 18:44:34 +0100 Subject: [PATCH 05/45] btrfs: make block group flags in balance printks human-readable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They're not even documented anywhere, leaving users with no recourse but to RTFS. It's no big burden to output the bitfield as words. Also, display unknown flags as hex.
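As an illustration (the block group number here is made up), a balance message that used to read "relocating block group 3976200192 flags 17" now reads "relocating block group 3976200192 flags data|raid1", since 17 is BTRFS_BLOCK_GROUP_DATA (0x1) combined with BTRFS_BLOCK_GROUP_RAID1 (0x10).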
Signed-off-by: Adam Borowski Tested-by: Holger Hoffstätte Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c4af0cdb783d..d8d450ae9e90 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4332,6 +4332,45 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) return rc; } +/* + * Print the block group being relocated + */ +static void describe_relocation(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + char buf[128]; /* prefixed by a '|' that'll be dropped */ + u64 flags = block_group->flags; + + /* Shouldn't happen */ + if (!flags) { + strcpy(buf, "|NONE"); + } else { + char *bp = buf; + +#define DESCRIBE_FLAG(f, d) \ + if (flags & BTRFS_BLOCK_GROUP_##f) { \ + bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \ + flags &= ~BTRFS_BLOCK_GROUP_##f; \ + } + DESCRIBE_FLAG(DATA, "data"); + DESCRIBE_FLAG(SYSTEM, "system"); + DESCRIBE_FLAG(METADATA, "metadata"); + DESCRIBE_FLAG(RAID0, "raid0"); + DESCRIBE_FLAG(RAID1, "raid1"); + DESCRIBE_FLAG(DUP, "dup"); + DESCRIBE_FLAG(RAID10, "raid10"); + DESCRIBE_FLAG(RAID5, "raid5"); + DESCRIBE_FLAG(RAID6, "raid6"); + if (flags) + snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags); +#undef DESCRIBE_FLAG + } + + btrfs_info(fs_info, + "relocating block group %llu flags %s", + block_group->key.objectid, buf + 1); +} + /* * function to relocate all extents in a block group. */ @@ -4388,9 +4427,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - btrfs_info(extent_root->fs_info, - "relocating block group %llu flags %llu", - rc->block_group->key.objectid, rc->block_group->flags); + describe_relocation(extent_root->fs_info, rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); From 745699ef6292bc7c85c08c9435c3f47ed413d3e9 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Fri, 23 Sep 2016 12:38:50 +0800 Subject: [PATCH 06/45] btrfs: remove useless comments Fixes: ("btrfs: update btrfs_space_info's bytes_may_use timely") Signed-off-by: Wang Xiaoguang Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 62b6e2023e12..add799b90ce5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6499,16 +6499,9 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space. Metadata - * reservations should be called with RESERVE_ALLOC so we do the proper - * ENOSPC accounting. For data we handle the reservation through clearing the - * delalloc bits in the io_tree. We have to do this since we could end up - * allocating less disk space for the amount of data we have reserved in the - * case of compression. - * - * If this is a reservation and the block group has become read only we cannot - * make the reservation and return -EAGAIN, otherwise this function always - * succeeds. + * This is called by the allocator when it reserves space.
If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. */ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, u64 ram_bytes, u64 num_bytes, int delalloc) From 926b92335a607e787d8d111d872f82de6d5988d5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Oct 2016 14:23:05 +0200 Subject: [PATCH 07/45] btrfs: remove unused headers, statfs.h Signed-off-by: David Sterba --- fs/btrfs/file.c | 1 - fs/btrfs/inode.c | 1 - fs/btrfs/ioctl.c | 1 - 3 files changed, 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3a14c87d9c92..5b1f90af3db6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5707d823cb23..81aba7d20061 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7acbd2cf6192..8bb278e12db6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include From 5d9dbe617a9e4e85c5fc9790c354cec903b88b57 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Oct 2016 14:23:06 +0200 Subject: [PATCH 08/45] btrfs: remove stale comment from btrfs_statfs Signed-off-by: David Sterba --- fs/btrfs/super.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..adec3a0b01d5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2086,10 +2086,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - /* - * holding chunk_mutex to avoid allocating new chunks, holding - * device_list_mutex to avoid the device being removed - */ rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { From ef2fff64fd541af1e23eeae48d6ffdfcd92ae2a3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 26 Oct 2016 16:23:50 +0200 Subject: [PATCH 09/45] btrfs: rename helper macros for qgroup and aux data casts The helpers are not meant to be generic, the name is misleading. Convert them to static inlines for type checking. 
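A minimal sketch of what the stronger typing buys, with unode standing in for a node obtained from a ulist iteration:

	struct btrfs_qgroup *qg;

	/* old macro: any u64 silently casts to a qgroup pointer */
	qg = u64_to_ptr(unode->aux);

	/* new helper: the argument must be a struct ulist_node *, so
	 * passing the raw u64 aux by mistake no longer compiles */
	qg = unode_aux_to_qgroup(unode);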
Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 11f4fffe503e..50b32cb25bdb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -131,8 +131,15 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -#define ptr_to_u64(x) ((u64)(uintptr_t)x) -#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x) +static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) +{ + return (u64)(uintptr_t)qg; +} + +static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) +{ + return (struct btrfs_qgroup *)(uintptr_t)n->aux; +} static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, @@ -1066,7 +1073,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, /* Get all of the parent groups that contain this qgroup */ list_for_each_entry(glist, &qgroup->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -1074,7 +1081,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, /* Iterate all of the parents and adjust their reference counts */ ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(tmp, &uiter))) { - qgroup = u64_to_ptr(unode->aux); + qgroup = unode_aux_to_qgroup(unode); qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; WARN_ON(sign < 0 && qgroup->excl < num_bytes); @@ -1087,7 +1094,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, /* Add any parents of the parents */ list_for_each_entry(glist, &qgroup->groups, next_group) { ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) goto out; } @@ -1535,30 +1542,30 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, continue; ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); if (ret < 0) return ret; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); + ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(tmp_unode->aux); + qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) return ret; ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) return ret; @@ -1619,7 +1626,7 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, while ((unode = ulist_next(qgroups, &uiter))) { bool dirty = false; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -2125,7 +2132,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); if ((qg->lim_flags & 
BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -2157,7 +2164,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup *qg; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); qg->reserved += num_bytes; } @@ -2202,7 +2209,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); qg->reserved -= num_bytes; From 04998b3324fc8aa8f0af9b820e865f8c9665120c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 13:32:43 +0100 Subject: [PATCH 10/45] btrfs: reada, cleanup remove unneeded variable in __readahead_hook We can't touch the eb directly in case the function is called with a non-zero error, so we can read the eb level when needed. Signed-off-by: David Sterba --- fs/btrfs/reada.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index f7dd892669a5..84a5beb48d46 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -109,16 +109,12 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, struct reada_extent *re, struct extent_buffer *eb, u64 start, int err) { - int level = 0; int nritems; int i; u64 bytenr; u64 generation; struct list_head list; - if (eb) - level = btrfs_header_level(eb); - spin_lock(&re->lock); /* * just take the full list from the extent. afterwards we @@ -143,7 +139,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, * trigger more readahead depending from the content, e.g. * fetch the checksums for the extents in the leaf. */ - if (!level) + if (!btrfs_header_level(eb)) goto cleanup; nritems = btrfs_header_nritems(eb); From bcdc51b2043a363b67d97bc99799e505d31391a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 13:39:05 +0100 Subject: [PATCH 11/45] btrfs: reada, remove unused parameter from __readahead_hook 'start' is not used since "btrfs: reada: Pass reada_extent into __readahead_hook directly" (6e39dbe8b9e55280c). 
Signed-off-by: David Sterba --- fs/btrfs/reada.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 84a5beb48d46..9c7a0424af1b 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -107,7 +107,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, /* in case of err, eb might be NULL */ static void __readahead_hook(struct btrfs_fs_info *fs_info, struct reada_extent *re, struct extent_buffer *eb, - u64 start, int err) + int err) { int nritems; int i; @@ -231,7 +231,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, goto start_machine; } - __readahead_hook(fs_info, re, eb, start, err); + __readahead_hook(fs_info, re, eb, err); reada_extent_put(fs_info, re); /* our ref */ start_machine: @@ -713,9 +713,9 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, ret = reada_tree_block_flagged(fs_info->extent_root, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info, re, NULL, logical, ret); + __readahead_hook(fs_info, re, NULL, ret); else if (eb) - __readahead_hook(fs_info, re, eb, eb->start, ret); + __readahead_hook(fs_info, re, eb, ret); if (eb) free_extent_buffer(eb); From fc2e901f26859a87b7cd5c49015552805b7a00e0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 13:50:03 +0100 Subject: [PATCH 12/45] btrfs: reada, sink start parameter to btree_readahead_hook Originally, the eb and start were passed separately in case eb is NULL. Since the readahead has been refactored in 4.6, this is not true anymore and we can get rid of the parameter. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/reada.c | 8 ++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b26b8b363f7f..9768ce804265 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3661,7 +3661,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, u64 start, int err); + struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a57f99d96aa..9c4ef833ba0b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -747,7 +747,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, eb->start, ret); + btree_readahead_hook(fs_info, eb, ret); if (ret) { /* @@ -772,7 +772,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO); + btree_readahead_hook(eb->fs_info, eb, -EIO); return -EIO; /* we fixed nothing */ } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9c7a0424af1b..e910bd9b1588 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -209,12 +209,8 @@ cleanup: return; } -/* - * start is passed separately in case eb in NULL, which may be the case with - * failed I/O - */ int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, u64 start, int err) + struct extent_buffer *eb, int err) { int ret = 0; struct reada_extent *re; @@ -222,7 +218,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, /* find extent 
*/ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - start >> PAGE_SHIFT); + eb->start >> PAGE_SHIFT); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); From 8694bb61360554e751f43688a9ff1793609884c4 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 17:11:27 +0100 Subject: [PATCH 13/45] btrfs: reada, remove pointless BUG_ON in reada_find_extent The lock is held; we make the same lookup that previously failed with EEXIST, and we don't insert NULL pointers. Signed-off-by: David Sterba --- fs/btrfs/reada.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e910bd9b1588..380ab6629e90 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -393,7 +393,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); From b917bb387812f9abb81fc842e4c3b3ec727e10cf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 17:18:35 +0100 Subject: [PATCH 14/45] btrfs: reada, remove pointless BUG_ON check for fs_info We dereference fs_info several times; besides that, post-mount functions should never see a NULL fs_info. Signed-off-by: David Sterba --- fs/btrfs/reada.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 380ab6629e90..f0beb63a6d82 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -439,7 +439,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); } - BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); From 62d1f9fe97dd25ca5e850bd7e140d4c9d4b9c7c7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 23:21:05 +0100 Subject: [PATCH 15/45] btrfs: remove trivial helper btrfs_find_tree_block Over time, the function has shrunk to the point that it just calls find_extent_buffer, passing its parameters along.
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/disk-io.c | 8 +------- fs/btrfs/disk-io.h | 2 -- fs/btrfs/extent-tree.c | 2 +- 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6ba165d3f81..173768767d1b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1670,7 +1670,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root->fs_info, blocknr); + cur = find_extent_buffer(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -2255,7 +2255,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb = btrfs_find_tree_block(root->fs_info, search); + eb = find_extent_buffer(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2314,7 +2314,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root->fs_info, block1); + eb = find_extent_buffer(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2327,7 +2327,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root->fs_info, block2); + eb = find_extent_buffer(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2445,7 +2445,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root->fs_info, blocknr); + tmp = find_extent_buffer(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9c4ef833ba0b..686d05acfdb7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1191,12 +1191,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr) -{ - return find_extent_buffer(fs_info, bytenr); -} - struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { @@ -4452,7 +4446,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { - eb = btrfs_find_tree_block(root->fs_info, start); + eb = find_extent_buffer(root->fs_info, start); start += root->nodesize; if (!eb) continue; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 1a3237e5700f..124e30c76626 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -63,8 +63,6 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 
add799b90ce5..39a834d21749 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8866,7 +8866,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root->fs_info, bytenr); + next = find_extent_buffer(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); if (IS_ERR(next)) From 2230adffe4eae30ffce605daa81a116cb71b5960 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Nov 2016 00:03:12 +0100 Subject: [PATCH 16/45] btrfs: delete unused member from superblock '__bdev' has never been used since 0b86a832a1f38abec695864ec2eaedc9d2383f1b (2008). Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9768ce804265..2cf4bc84388e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -799,7 +799,6 @@ struct btrfs_fs_info { spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; - struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; From f157bf765b3773efb5e981dea286cd311fca3b59 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Nov 2016 17:43:38 +0100 Subject: [PATCH 17/45] btrfs: introduce helpers for updating eb uuids The fsid and chunk tree uuid are always located in the first page, so we don't need to use write_extent_buffer. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 +++++++++++++++++++++ fs/btrfs/extent_io.h | 3 +++ 2 files changed, 24 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea9ade703da2..2ae731a0058a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5465,6 +5465,27 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, return ret; } +void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, + const void *srcv) +{ + char *kaddr; + + WARN_ON(!PageUptodate(eb->pages[0])); + kaddr = page_address(eb->pages[0]); + memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, + BTRFS_FSID_SIZE); +} + +void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) +{ + char *kaddr; + + WARN_ON(!PageUptodate(eb->pages[0])); + kaddr = page_address(eb->pages[0]); + memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, + BTRFS_FSID_SIZE); +} + void write_extent_buffer(struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ab31d145227e..065c77d43921 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -405,6 +405,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); +void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src); +void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, + const void *src); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, From d24ee97b96db46123f766041d2ec0ca81491bd31 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Nov 2016 17:44:25 +0100 Subject: [PATCH 18/45] btrfs: use new helpers to set uuids in eb Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 29
+++++++++-------------------- fs/btrfs/disk-io.c | 10 +++------- fs/btrfs/ioctl.c | 8 +++----- fs/btrfs/volumes.c | 4 ++-- 4 files changed, 17 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 173768767d1b..93bc38b98b3f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -271,8 +271,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); + write_extent_buffer_fsid(cow, root->fs_info->fsid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -1141,8 +1140,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); + write_extent_buffer_fsid(cow, root->fs_info->fsid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -3358,11 +3356,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - - write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); + write_extent_buffer_fsid(c, root->fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(c, root->fs_info->chunk_tree_uuid); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -3495,11 +3490,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_generation(split, trans->transid); btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); - write_extent_buffer(split, root->fs_info->fsid, - btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(split), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(split, root->fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(split, + root->fs_info->chunk_tree_uuid); ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); @@ -4283,12 +4276,8 @@ again: btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, fs_info->fsid, - btrfs_header_fsid(), BTRFS_FSID_SIZE); - - write_extent_buffer(right, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(right, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid); if (split == 0) { if (mid <= slot) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 686d05acfdb7..21f8e597fe97 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1419,11 +1419,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - write_extent_buffer(leaf, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(leaf, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); root->commit_root = 
btrfs_root_node(root); @@ -1506,8 +1503,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); root->node = leaf; - write_extent_buffer(root->node, root->fs_info->fsid, - btrfs_header_fsid(), BTRFS_FSID_SIZE); + write_extent_buffer_fsid(root->node, root->fs_info->fsid); btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); return root; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8bb278e12db6..24f04d7cb872 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -507,11 +507,9 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(leaf, root->fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(leaf, + root->fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item->inode; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 23df14c27cab..7eebf556feb7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1595,8 +1595,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); + write_extent_buffer_chunk_tree_uuid(leaf, + root->fs_info->chunk_tree_uuid); btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); From fba1acf9ff77656e3b9f5c0f7b6a52c93e4932ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 17:56:24 +0100 Subject: [PATCH 19/45] btrfs: use specialized page copying helpers in btrfs_clone_extent_buffer The copy_page is usually optimized and can be faster than memcpy. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ae731a0058a..bf719e3bcaf2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4720,9 +4720,9 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) WARN_ON(PageDirty(p)); SetPageUptodate(p); new->pages[i] = p; + copy_page(page_address(p), page_address(src->pages[i])); } - copy_extent_buffer(new, src, 0, 0, src->len); set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); From b159fa2808b1b53d784807a48ad95fa809be10b0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 18:09:03 +0100 Subject: [PATCH 20/45] btrfs: remove constant parameter to memset_extent_buffer and rename it The only memset we do is to 0, so sink the parameter to the function and simplify all calls. Rename the function to reflect the behaviour. 
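Schematically, every call site changes like this:

	/* before: the constant 0 is repeated at each call */
	memset_extent_buffer(eb, 0, offset, len);

	/* after: the zeroing is implied by the name */
	memzero_extent_buffer(eb, offset, len);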
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++--- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/extent_io.c | 10 +++++----- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/file-item.c | 2 +- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 2 +- fs/btrfs/volumes.c | 2 +- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 93bc38b98b3f..be362b776138 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3348,7 +3348,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root_add_used(root, root->nodesize); - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(c, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); @@ -3484,7 +3484,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, root->nodesize); - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(split, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); @@ -4270,7 +4270,7 @@ again: root_add_used(root, root->nodesize); - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(right, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 21f8e597fe97..5d1da78f044b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1412,7 +1412,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); @@ -1496,7 +1496,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, return ERR_CAST(leaf); } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 39a834d21749..78fcc67e7b8c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1114,7 +1114,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, BTRFS_BLOCK_FLAG_FULL_BACKREF); bi = (struct btrfs_tree_block_info *)(item + 1); /* FIXME: get first key of the block */ - memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi)); btrfs_set_tree_block_level(leaf, bi, (int)owner); } else { btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf719e3bcaf2..9f8a1a331c61 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3743,7 +3743,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (btrfs_header_level(eb) > 0) { end = 
btrfs_node_key_ptr_offset(nritems); - memset_extent_buffer(eb, 0, end, eb->len - end); + memzero_extent_buffer(eb, end, eb->len - end); } else { /* * leaf: @@ -3752,7 +3752,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, start = btrfs_item_nr_offset(nritems); end = btrfs_leaf_data(eb) + leaf_data_end(fs_info->tree_root, eb); - memset_extent_buffer(eb, 0, start, end - start); + memzero_extent_buffer(eb, start, end - start); } for (i = 0; i < num_pages; i++) { @@ -5517,8 +5517,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, } } -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len) +void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long len) { size_t cur; size_t offset; @@ -5538,7 +5538,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memset(kaddr + offset, c, cur); + memset(kaddr + offset, 0, cur); len -= cur; offset = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 065c77d43921..12fe17523df2 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -417,8 +417,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len); +void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long len); int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long pos); void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d0d571c47d33..43418c08b110 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -689,7 +689,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, item_offset = btrfs_item_ptr_offset(leaf, path->slots[0]); - memset_extent_buffer(leaf, 0, item_offset + offset, + memzero_extent_buffer(leaf, item_offset + offset, shift_len); key.offset = bytenr; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4b48f377d3a..a754865b3cb1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -153,7 +153,7 @@ static int __create_free_space_inode(struct btrfs_root *root, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); btrfs_item_key(leaf, &disk_key, path->slots[0]); - memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + memzero_extent_buffer(leaf, (unsigned long)inode_item, sizeof(*inode_item)); btrfs_set_inode_generation(leaf, inode_item, trans->transid); btrfs_set_inode_size(leaf, inode_item, 0); @@ -181,7 +181,7 @@ static int __create_free_space_inode(struct btrfs_root *root, leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); - memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81aba7d20061..06dc95caa6f1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6276,7 +6276,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle 
*trans, inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, + memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 24f04d7cb872..a26202ebed33 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -501,7 +501,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d8d450ae9e90..26f6c5ac879e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4255,7 +4255,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index caad80bb9bd0..2c7a0a922510 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -306,7 +306,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, int ret; memset(bitmap, 0, len); - memset_extent_buffer(eb, 0, 0, len); + memzero_extent_buffer(eb, 0, len); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { test_msg("Bitmap was not zeroed\n"); return -EINVAL; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7eebf556feb7..1886b94f13ac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3062,7 +3062,7 @@ static int insert_balance_item(struct btrfs_root *root, leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); btrfs_set_balance_data(leaf, item, &disk_bargs); From 58e8012cc12b3cdebea118981c4fd7136d52f2c7 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Nov 2016 18:30:31 +0100 Subject: [PATCH 21/45] btrfs: add optimized version of eb to eb copy copy_extent_buffer is suitable for copying between buffers from an arbitrary offset and deals with page boundaries. This is not necessary when doing a full extent_buffer-to-extent_buffer copy. We can utilize the copy_page helper as well.
Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- fs/btrfs/extent_io.c | 14 ++++++++++++++ fs/btrfs/extent_io.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index be362b776138..25286a5912fc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -260,7 +260,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); @@ -1129,7 +1129,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, /* cow is set to blocking by btrfs_init_new_buffer */ - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9f8a1a331c61..d24af9dc76c7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5546,6 +5546,20 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, } } +void copy_extent_buffer_full(struct extent_buffer *dst, + struct extent_buffer *src) +{ + int i; + unsigned num_pages; + + ASSERT(dst->len == src->len); + + num_pages = num_extent_pages(dst->start, dst->len); + for (i = 0; i < num_pages; i++) + copy_page(page_address(dst->pages[i]), + page_address(src->pages[i])); +} + void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 12fe17523df2..ae64c1917d0a 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -410,6 +410,8 @@ void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, const void *src); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); +void copy_extent_buffer_full(struct extent_buffer *dst, + struct extent_buffer *src); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len); From 7b9ea6279b337455268fa41c1ddec22f1cb44e8f Mon Sep 17 00:00:00 2001 From: Shailendra Verma Date: Thu, 10 Nov 2016 15:17:41 +0530 Subject: [PATCH 22/45] btrfs: return early from failed memory allocations in ioctl handlers There is no need to call kfree() if memdup_user() fails, as no memory was allocated and the error in the error-valued pointer should be returned. 
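For illustration, the resulting shape of each call site is the following (a hedged sketch, not code from this patch; example_args and do_work() are hypothetical stand-ins):

	/*
	 * Sketch only: memdup_user() never returns NULL.  On failure it
	 * returns an ERR_PTR() value and has allocated nothing, so an
	 * early return is both safe and simpler than a goto/kfree()
	 * cleanup path.
	 */
	static long example_ioctl(void __user *arg)
	{
		struct example_args *args;
		long ret;

		args = memdup_user(arg, sizeof(*args));
		if (IS_ERR(args))
			return PTR_ERR(args);	/* nothing to kfree() here */

		ret = do_work(args);
		kfree(args);			/* the copy exists only on success */
		return ret;
	}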
Signed-off-by: Shailendra Verma [ edit subject ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a26202ebed33..4a20f3e68cb4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4569,11 +4569,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, return -EPERM; loi = memdup_user(arg, sizeof(*loi)); - if (IS_ERR(loi)) { - ret = PTR_ERR(loi); - loi = NULL; - goto out; - } + if (IS_ERR(loi)) + return PTR_ERR(loi); path = btrfs_alloc_path(); if (!path) { @@ -5200,11 +5197,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, int ret = 0; args32 = memdup_user(arg, sizeof(*args32)); - if (IS_ERR(args32)) { - ret = PTR_ERR(args32); - args32 = NULL; - goto out; - } + if (IS_ERR(args32)) + return PTR_ERR(args32); args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { @@ -5252,11 +5246,8 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } + if (IS_ERR(sa)) + return PTR_ERR(sa); ret = _btrfs_ioctl_set_received_subvol(file, sa); From a23eaa875f0f1d89eb866b8c9860e78273ff5daf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 4 Nov 2016 12:20:54 -0700 Subject: [PATCH 23/45] Btrfs: adjust len of writes if following a preallocated extent If we have |0--hole--4095||4096--preallocate--12287| instead of using preallocated space, an 8K direct write will just create a new 8K extent and it'll end up with |0--new extent--8191||8192--preallocate--12287| This is because we find a hole em and then go to create a new 8K extent directly without adjusting @len. Signed-off-by: Liu Bo Reviewed-by: Chris Mason Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06dc95caa6f1..66c1d65bd476 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7782,10 +7782,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, } /* - * this will cow the extent, reset the len in case we changed - * it above + * this will cow the extent, if em is within [start, len], then + * probably we've found a preallocated/existing extent, let's + * give it a chance to use preallocated space. */ - len = bh_result->b_size; + len = min_t(u64, bh_result->b_size, em->len - (start - em->start)); + len = ALIGN(len, root->sectorsize); free_extent_map(em); em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { From 0b5e3dafb60229dd7225e81023af5d2ddfb6a4b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Domagoj=20Tr=C5=A1an?= Date: Thu, 27 Oct 2016 08:52:33 +0100 Subject: [PATCH 24/45] btrfs: change btrfs_csum_final result param type to u8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The csum member of struct btrfs_super_block is an array of u8. It makes sense that the function btrfs_csum_final should also be declared to accept u8 *.
I changed the declaration of the function void btrfs_csum_final(u32 crc, char *result); to void btrfs_csum_final(u32 crc, u8 *result); Signed-off-by: Domagoj Tršan [ changed cast to u8 at several call sites ] Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 2 +- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/inode.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d4d8b7e36b2f..49108036dd4c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -120,7 +120,7 @@ static int check_compressed_csum(struct inode *inode, kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); - btrfs_csum_final(csum, (char *)&csum); + btrfs_csum_final(csum, (u8 *)&csum); kunmap_atomic(kaddr); if (csum != *cb_sum) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5d1da78f044b..8677d29efade 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -271,7 +271,7 @@ u32 btrfs_csum_data(char *data, u32 seed, size_t len) return btrfs_crc32c(seed, data, len); } -void btrfs_csum_final(u32 crc, char *result) +void btrfs_csum_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 124e30c76626..729540701458 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -119,7 +119,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); -void btrfs_csum_final(u32 crc, char *result); +void btrfs_csum_final(u32 crc, u8 *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a754865b3cb1..e690d386ee5e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -476,7 +476,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); + btrfs_csum_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -504,7 +504,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); + btrfs_csum_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->root->fs_info, "csum mismatch on free space cache"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 66c1d65bd476..e6300d00c063 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3102,7 +3102,7 @@ static int __readpage_endio_check(struct inode *inode, kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (char *)&csum); + btrfs_csum_final(csum, (u8 *)&csum); if (csum != csum_expected) goto zeroit; From 4d5106a126f33395126e042ae42582832bfc39f7 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Nov 2016 11:26:06 +0100 Subject: [PATCH 25/45] btrfs: remove redundant check of btrfs_iget return value 'btrfs_iget()' cannot return NULL, so this test can be removed.
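For illustration only, the contract being relied on (a sketch under the 4.9-era btrfs_iget() signature, not code from the patch):

	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
	if (IS_ERR(inode))	/* the only possible failure mode */
		return inode;	/* the pointer already encodes the errno */
	/* a NULL check here can never trigger and only obscures the contract */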
Signed-off-by: Christophe JAILLET Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e690d386ee5e..c698dccb3757 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -75,8 +75,6 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); - if (!inode) - return ERR_PTR(-ENOENT); if (IS_ERR(inode)) return inode; if (is_bad_inode(inode)) { From ed0df618b1b06d7431ee4d985317fc5419a5d559 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 1 Nov 2016 14:21:23 +0100 Subject: [PATCH 26/45] btrfs: store and load values of stripes_min/stripes_max in balance status item The balance status item contains currently known filter values, but the stripes filter was unintentionally not among them. This means that an interrupted and automatically restarted balance does not apply the stripes filter. Fixes: dee32d0ac3719ef8d640efaf0884111df444730f CC: stable@vger.kernel.org # 4.4+ Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cf4bc84388e..1b25a460ecea 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2210,6 +2210,8 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static inline void @@ -2228,6 +2230,8 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); } /* struct btrfs_super_block */ From d1111a75479d52046d8a71eb3b071581ee55489a Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 1 Nov 2016 20:25:27 -0700 Subject: [PATCH 27/45] btrfs: Call kunmap if zlib_inflateInit2 fails If zlib_inflateInit2 fails, the input page is never unmapped. Add a call to kunmap when it fails. Signed-off-by: Nick Terrell Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 441b81a3e545..0ed90ccd81eb 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -250,6 +250,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { From c2951f32d36c28d96acf95f0d83116facbec48a2 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 21 Nov 2016 15:59:04 +0100 Subject: [PATCH 28/45] btrfs: remove old tree_root dirent processing in btrfs_real_readdir() Commit 3de4586c527 (Btrfs: Allow subvolumes and snapshots anywhere in the directory tree) introduced the current system of placing snapshots in the directory tree. It also introduced the behavior of creating the snapshot and then creating the directory entries for it.
We've kept this code around for compatibility reasons, but it turns out that no file systems with the old tree_root based snapshots can be mounted on newer (>= 2009) kernels anyway. About a month after the above commit, commit 2a7108ad89e (Btrfs: rev the disk format for the inode compat and csum selection changes) landed, changing the superblock magic number. As a result, we know that we'll never encounter tree_root-based dirents or have to deal with skipping our own snapshot dirents. Since that also means that we're now only iterating over DIR_INDEX items, which only contain one directory entry per leaf item, we don't need to loop over the leaf item contents anymore either. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 125 ++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 84 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6300d00c063..df84d76f124a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5804,20 +5804,13 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) int slot; unsigned char d_type; int over = 0; - u32 di_cur; - u32 di_total; - u32 di_len; - int key_type = BTRFS_DIR_INDEX_KEY; char tmp_name[32]; char *name_ptr; int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ bool emitted; bool put = false; - - /* FIXME, use a real flag for deciding about the key type */ - if (root->fs_info->tree_root == root) - key_type = BTRFS_DIR_ITEM_KEY; + struct btrfs_key location; if (!dir_emit_dots(file, ctx)) return 0; @@ -5828,14 +5821,11 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) path->reada = READA_FORWARD; - if (key_type == BTRFS_DIR_INDEX_KEY) { - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); - put = btrfs_readdir_get_delayed_items(inode, &ins_list, - &del_list); - } + INIT_LIST_HEAD(&ins_list); + INIT_LIST_HEAD(&del_list); + put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); - key.type = key_type; + key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; key.objectid = btrfs_ino(inode); @@ -5861,85 +5851,54 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (found_key.objectid != key.objectid) break; - if (found_key.type != key_type) + if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) goto next; - if (key_type == BTRFS_DIR_INDEX_KEY && - btrfs_should_delete_dir_index(&del_list, - found_key.offset)) + if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; ctx->pos = found_key.offset; is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); - di_cur = 0; - di_total = btrfs_item_size(leaf, item); + if (verify_dir_item(root, leaf, di)) + goto next; - while (di_cur < di_total) { - struct btrfs_key location; - - if (verify_dir_item(root, leaf, di)) - break; - - name_len = btrfs_dir_name_len(leaf, di); - if (name_len <= sizeof(tmp_name)) { - name_ptr = tmp_name; - } else { - name_ptr = kmalloc(name_len, GFP_KERNEL); - if (!name_ptr) { - ret = -ENOMEM; - goto err; - } + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_KERNEL); + if (!name_ptr) { + ret = -ENOMEM; + goto err; } - read_extent_buffer(leaf, name_ptr, - (unsigned long)(di + 1), name_len); - - d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; - btrfs_dir_item_key_to_cpu(leaf, di, &location); - - - /* is this a reference 
to our own snapshot? If so - * skip it. - * - * In contrast to old kernels, we insert the snapshot's - * dir item and dir index after it has been created, so - * we won't find a reference to our own snapshot. We - * still keep the following code for backward - * compatibility. - */ - if (location.type == BTRFS_ROOT_ITEM_KEY && - location.objectid == root->root_key.objectid) { - over = 0; - goto skip; - } - over = !dir_emit(ctx, name_ptr, name_len, - location.objectid, d_type); - -skip: - if (name_ptr != tmp_name) - kfree(name_ptr); - - if (over) - goto nopos; - emitted = true; - di_len = btrfs_dir_name_len(leaf, di) + - btrfs_dir_data_len(leaf, di) + sizeof(*di); - di_cur += di_len; - di = (struct btrfs_dir_item *)((char *)di + di_len); } + read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), + name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + over = !dir_emit(ctx, name_ptr, name_len, location.objectid, + d_type); + + if (name_ptr != tmp_name) + kfree(name_ptr); + + emitted = true; + if (over) + goto nopos; next: path->slots[0]++; } - if (key_type == BTRFS_DIR_INDEX_KEY) { - if (is_curr) - ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); - if (ret) - goto nopos; - } + if (is_curr) + ctx->pos++; + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); + if (ret) + goto nopos; /* * If we haven't emitted any dir entry, we must not touch ctx->pos as @@ -5970,12 +5929,10 @@ next: * last entry requires it because doing so has broken 32bit apps * in the past. */ - if (key_type == BTRFS_DIR_INDEX_KEY) { - if (ctx->pos >= INT_MAX) - ctx->pos = LLONG_MAX; - else - ctx->pos = INT_MAX; - } + if (ctx->pos >= INT_MAX) + ctx->pos = LLONG_MAX; + else + ctx->pos = INT_MAX; nopos: ret = 0; err: From d2fbb2b589ece9060635b43c2b2333d0b0a0fbf2 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sat, 5 Nov 2016 13:26:35 -0400 Subject: [PATCH 29/45] btrfs: increment ctx->pos for every emitted or skipped dirent in readdir If we process the last item in the leaf and hit an I/O error while reading the next leaf, we return -EIO without having adjusted the position. Since we have emitted dirents, getdents() will return the byte count to the user instead of the error. Subsequent callers will emit the last successful dirent again, and return -EIO again, with the same result. Callers loop forever. Instead, if we always increment ctx->pos after emitting or skipping the dirent, we'll be sure that we won't hit the same one again. When we go to process the next leaf, we won't have emitted any dirents and the -EIO will be returned to the user properly. We also don't need to track if we've emitted a dirent already or if we've changed the position yet. 
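As a hedged sketch of the invariant this establishes (next_entry() and the entry_*() accessors are hypothetical; dir_emit() and ctx->pos are the real VFS interfaces):

	while (next_entry(iter)) {
		ctx->pos = entry_offset(iter);
		if (!dir_emit(ctx, entry_name(iter), entry_name_len(iter),
			      entry_ino(iter), entry_type(iter)))
			return 0;	/* buffer full: pos still names this entry */
		ctx->pos++;		/* emitted: any restart resumes after it */
	}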
Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +-- fs/btrfs/delayed-inode.h | 2 +- fs/btrfs/inode.c | 22 ++-------------------- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0fcf5f25d524..d90d4446f9fe 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1686,7 +1686,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list, bool *emitted) + struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1730,7 +1730,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; - *emitted = true; } return 0; } diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 2495b3d4075f..2c1cbe245104 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -146,7 +146,7 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list, bool *emitted); + struct list_head *ins_list); /* for init */ int __init btrfs_delayed_inode_init(void); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index df84d76f124a..0b836737d382 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5807,8 +5807,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) char tmp_name[32]; char *name_ptr; int name_len; - int is_curr = 0; /* ctx->pos points to the current index? */ - bool emitted; bool put = false; struct btrfs_key location; @@ -5833,7 +5831,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (ret < 0) goto err; - emitted = false; while (1) { leaf = path->nodes[0]; slot = path->slots[0]; @@ -5859,7 +5856,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) goto next; ctx->pos = found_key.offset; - is_curr = 1; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(root, leaf, di)) @@ -5887,31 +5883,17 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (name_ptr != tmp_name) kfree(name_ptr); - emitted = true; if (over) goto nopos; + ctx->pos++; next: path->slots[0]++; } - if (is_curr) - ctx->pos++; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted); + ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; - /* - * If we haven't emitted any dir entry, we must not touch ctx->pos as - * it was was set to the termination value in previous call. We assume - * that "." and ".." were emitted if we reach this point and set the - * termination value as well for an empty directory. - */ - if (ctx->pos > 2 && !emitted) - goto nopos; - - /* Reached end of directory/root. Bump pos past the last item. */ - ctx->pos++; - /* * Stop new entries from being returned after we return the last * entry. From 62fe51c1d0100ff07a761cd077872e01f2a2b8ca Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Nov 2016 09:13:39 -0500 Subject: [PATCH 30/45] Btrfs: fix file extent corruption In order to do hole punching, we have a block reserve to hold the reservation we need to drop the extents in our range. Since we could end up dropping a lot of extents we set rsv->failfast so we can just loop around again and drop the remainder of the range.
Unfortunately we unconditionally fill the hole extents in and start from the last extent we encountered, which we may or may not have dropped. So this can result in overlapping file extent entries, which can be tripped over in a variety of ways, either by hitting BUG_ON(!ret) in fill_holes() after the search, or in btrfs_set_item_key_safe() in btrfs_drop_extent() at a later time by an unrelated task. Fix this by only setting drop_end to the last extent we did actually drop. This way our holes are filled in properly for the range that we did drop, and the rest of the range that remains to be dropped is actually dropped. Thanks, Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 5b1f90af3db6..f5288fa0aad0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -705,6 +705,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; + u64 last_end = start; int del_nr = 0; int del_slot = 0; int extent_type; @@ -796,8 +797,10 @@ next_slot: * extent item in the call to setup_items_for_insert() later * in this function. */ - if (extent_end == key.offset && extent_end >= search_start) + if (extent_end == key.offset && extent_end >= search_start) { + last_end = extent_end; goto delete_extent_item; + } if (extent_end <= search_start) { path->slots[0]++; @@ -859,6 +862,12 @@ next_slot: } key.offset = start; } + /* + * From here on out we will have actually dropped something, so + * last_end can be updated. + */ + last_end = extent_end; + /* * | ---- range to drop ----- | * | -------- extent -------- | @@ -1009,7 +1018,7 @@ delete_extent_item: if (!replace_extent || !(*key_inserted)) btrfs_release_path(path); if (drop_end) - *drop_end = found ? min(end, extent_end) : end; + *drop_end = found ? min(end, last_end) : end; return ret; } @@ -2524,7 +2533,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + if (cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { From f94480bd7be6bb1b0823d1036f3ee4ebe7450172 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2016 14:06:22 -0500 Subject: [PATCH 31/45] Btrfs: abort transaction if fill_holes() fails At this point we will have dropped extent entries from the file, so if we fail to insert the new hole entries then we are leaving the fs in a corrupt state (albeit an easily fixed one). Abort the transaction if this happens so we can avoid corrupting the fs. Thanks, Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/file.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f5288fa0aad0..3c1f4be36f16 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2232,9 +2232,15 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) + if (ret <= 0) { + /* + * We should have dropped this offset, so if we find it then + * something has gone horribly wrong. 
+ */ + if (ret == 0) + ret = -EINVAL; return ret; - BUG_ON(!ret); + } leaf = path->nodes[0]; if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { @@ -2537,6 +2543,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. + */ + btrfs_abort_transaction(trans, ret); err = ret; break; } @@ -2601,6 +2614,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { + /* Same comment as above. */ + btrfs_abort_transaction(trans, ret); err = ret; goto out_trans; } From 0c476a5d7f63bdae0b6188a191a6e9eb8f1024d7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 18 Nov 2016 21:52:40 -0500 Subject: [PATCH 32/45] btrfs: Ensure proper sector alignment for btrfs_free_reserved_data_space This fixes the WARN_ON on BTRFS_I(inode)->reserved_extents in btrfs_destroy_inode and the WARN_ON on nonzero delalloc bytes on umount with qgroups enabled. I was able to reproduce this by setting up a small (~500kb) quota limit and writing a file one byte at a time until I hit the limit. The warnings would all hit on umount. The root cause is that we would reserve a block-sized range in both the reservation and the quota in btrfs_check_data_free_space, but if we encountered a problem (like e.g. EDQUOT), we would only release the single byte in the qgroup reservation. That caused an iotree state split, which increased the number of outstanding extents, in turn disallowing releasing the metadata reservation. Signed-off-by: Jeff Mahoney Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 78fcc67e7b8c..c17b0d1b081f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4322,6 +4322,13 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + btrfs_free_reserved_data_space_noquota(inode, start, len); btrfs_qgroup_free_data(inode, start, len); } From 974b1adc3b103fae1dbc1fe6a8aceeca2878f20e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:46 +0100 Subject: [PATCH 33/45] btrfs: use bio iterators for the decompression handlers Pass the full bio to the decompression routines and use bio iterators to iterate over the data in the bio. 
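For orientation, a minimal sketch of the iterator style this and the following patches adopt (process_segment() is a hypothetical placeholder; bio_iter_iovec() and bio_advance() are the real block-layer helpers used below):

	/* consume a bio segment by segment via its iterator */
	static void consume_bio(struct bio *bio)
	{
		while (bio->bi_iter.bi_size) {
			struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);

			process_segment(bvec.bv_page, bvec.bv_offset, bvec.bv_len);
			bio_advance(bio, bvec.bv_len);	/* moves bi_iter forward */
		}
	}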
Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 125 ++++++++++++++--------------------------- fs/btrfs/compression.h | 12 +--- fs/btrfs/lzo.c | 17 ++---- fs/btrfs/zlib.c | 15 ++--- 4 files changed, 56 insertions(+), 113 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 49108036dd4c..b060465c4fad 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -81,9 +81,9 @@ struct compressed_bio { u32 sums; }; -static int btrfs_decompress_biovec(int type, struct page **pages_in, - u64 disk_start, struct bio_vec *bvec, - int vcnt, size_t srclen); +static int btrfs_decompress_bio(int type, struct page **pages_in, + u64 disk_start, struct bio *orig_bio, + size_t srclen); static inline int compressed_bio_size(struct btrfs_root *root, unsigned long disk_size) @@ -175,11 +175,10 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_decompress_biovec(cb->compress_type, + ret = btrfs_decompress_bio(cb->compress_type, cb->compressed_pages, cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, + cb->orig_bio, cb->compressed_len); csum_failed: if (ret) @@ -959,9 +958,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * * disk_start is the starting logical offset of this array in the file * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec + * orig_bio contains the pages from the file that we want to decompress into * * srclen is the number of bytes in pages_in * @@ -970,18 +967,18 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -static int btrfs_decompress_biovec(int type, struct page **pages_in, - u64 disk_start, struct bio_vec *bvec, - int vcnt, size_t srclen) +static int btrfs_decompress_bio(int type, struct page **pages_in, + u64 disk_start, struct bio *orig_bio, + size_t srclen) { struct list_head *workspace; int ret; workspace = find_workspace(type); - ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, - disk_start, - bvec, vcnt, srclen); + ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, + disk_start, orig_bio, + srclen); free_workspace(type, workspace); return ret; } @@ -1021,9 +1018,7 @@ void btrfs_exit_compress(void) */ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset) + struct bio *bio) { unsigned long buf_offset; unsigned long current_buf_start; @@ -1031,13 +1026,13 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long working_bytes = total_out - buf_start; unsigned long bytes; char *kaddr; - struct page *page_out = bvec[*pg_index].bv_page; + struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); /* * start byte is the first byte of the page we're currently * copying into relative to the start of the compressed data. 
*/ - start_byte = page_offset(page_out) - disk_start; + start_byte = page_offset(bvec.bv_page) - disk_start; /* we haven't yet hit data corresponding to this page */ if (total_out <= start_byte) @@ -1057,80 +1052,46 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { - bytes = min(PAGE_SIZE - *pg_offset, - PAGE_SIZE - buf_offset); + bytes = min_t(unsigned long, bvec.bv_len, + PAGE_SIZE - buf_offset); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out); - memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - kunmap_atomic(kaddr); - flush_dcache_page(page_out); - *pg_offset += bytes; + kaddr = kmap_atomic(bvec.bv_page); + memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr); + flush_dcache_page(bvec.bv_page); + buf_offset += bytes; working_bytes -= bytes; current_buf_start += bytes; /* check if we need to pick another page */ - if (*pg_offset == PAGE_SIZE) { - (*pg_index)++; - if (*pg_index >= vcnt) - return 0; + bio_advance(bio, bytes); + if (!bio->bi_iter.bi_size) + return 0; + bvec = bio_iter_iovec(bio, bio->bi_iter); - page_out = bvec[*pg_index].bv_page; - *pg_offset = 0; - start_byte = page_offset(page_out) - disk_start; + start_byte = page_offset(bvec.bv_page) - disk_start; - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - return 1; + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; - } + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; } } return 1; } - -/* - * When uncompressing data, we need to make sure and zero any parts of - * the biovec that were not filled in by the decompression code. pg_index - * and pg_offset indicate the last page and the last offset of that page - * that have been filled in. This will zero everything remaining in the - * biovec. 
- */ -void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, - unsigned long pg_index, - unsigned long pg_offset) -{ - while (pg_index < vcnt) { - struct page *page = bvec[pg_index].bv_page; - unsigned long off = bvec[pg_index].bv_offset; - unsigned long len = bvec[pg_index].bv_len; - - if (pg_offset < off) - pg_offset = off; - if (pg_offset < off + len) { - unsigned long bytes = off + len - pg_offset; - char *kaddr; - - kaddr = kmap_atomic(page); - memset(kaddr + pg_offset, 0, bytes); - kunmap_atomic(kaddr); - } - pg_index++; - pg_offset = 0; - } -} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index f49d8b8c0f00..09879579fbc8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,9 +34,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset); + struct bio *bio); int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, @@ -45,9 +43,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, - unsigned long pg_index, - unsigned long pg_offset); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -72,11 +67,10 @@ struct btrfs_compress_op { unsigned long *total_out, unsigned long max_out); - int (*decompress_biovec)(struct list_head *workspace, + int (*decompress_bio)(struct list_head *workspace, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen); int (*decompress)(struct list_head *workspace, diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 48655da0f4ca..45d26980caf9 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -254,25 +254,21 @@ out: return ret; } -static int lzo_decompress_biovec(struct list_head *ws, +static int lzo_decompress_bio(struct list_head *ws, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; - unsigned long page_out_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; unsigned long working_bytes; - unsigned long pg_offset; - size_t in_len; size_t out_len; unsigned long in_offset; @@ -292,7 +288,6 @@ static int lzo_decompress_biovec(struct list_head *ws, in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; - pg_offset = 0; while (tot_in < tot_len) { in_len = read_compress_length(data_in + in_offset); @@ -365,16 +360,14 @@ cont: tot_out += out_len; ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - tot_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); + tot_out, disk_start, orig_bio); if (ret2 == 0) break; } done: kunmap(pages_in[page_in_index]); if (!ret) - btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + zero_fill_bio(orig_bio); return ret; } @@ -438,6 +431,6 @@ const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, 
.compress_pages = lzo_compress_pages, - .decompress_biovec = lzo_decompress_biovec, + .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, }; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 0ed90ccd81eb..da497f184ff4 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -210,10 +210,9 @@ out: return ret; } -static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, +static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -222,10 +221,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; - unsigned long page_out_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; @@ -235,7 +232,6 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; workspace->strm.avail_out = PAGE_SIZE; - pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -267,8 +263,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, total_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); + orig_bio); if (ret2 == 0) { ret = 0; goto done; @@ -301,7 +296,7 @@ done: if (data_in) kunmap(pages_in[page_in_index]); if (!ret) - btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + zero_fill_bio(orig_bio); return ret; } @@ -408,6 +403,6 @@ const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, - .decompress_biovec = zlib_decompress_biovec, + .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, }; From 80ace3e40390ad0a85606b3a1450eb723070c9a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:47 +0100 Subject: [PATCH 34/45] btrfs: don't access the bio directly in the raid5/6 code Just use bio_for_each_segment_all to iterate over all segments. 
Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d016d4a79864..eece126d6973 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1144,10 +1144,10 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; + struct bio_vec *bvec; u64 start; unsigned long stripe_offset; unsigned long page_index; - struct page *p; int i; spin_lock_irq(&rbio->bio_list_lock); @@ -1156,10 +1156,8 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; - for (i = 0; i < bio->bi_vcnt; i++) { - p = bio->bi_io_vec[i].bv_page; - rbio->bio_pages[page_index + i] = p; - } + bio_for_each_segment_all(bvec, bio, i) + rbio->bio_pages[page_index + i] = bvec->bv_page; } spin_unlock_irq(&rbio->bio_list_lock); } @@ -1433,13 +1431,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct bio *bio) { + struct bio_vec *bvec; int i; - struct page *p; - for (i = 0; i < bio->bi_vcnt; i++) { - p = bio->bi_io_vec[i].bv_page; - SetPageUptodate(p); - } + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); } /* From 6a2de22f6babafd609b9356cdb0979eb5bb10564 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:48 +0100 Subject: [PATCH 35/45] btrfs: don't access the bio directly in the direct I/O code Just use bio_for_each_segment_all to iterate over all segments. Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0b836737d382..c96d94ef846d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8334,7 +8334,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec = orig_bio->bi_io_vec; + struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; @@ -8343,7 +8343,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int async_submit = 0; int nr_sectors; int ret; - int i; + int i, j; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, btrfs_op(orig_bio), @@ -8373,7 +8373,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, btrfs_io_bio(bio)->logical = file_offset; atomic_inc(&dip->pending_bios); - while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + bio_for_each_segment_all(bvec, orig_bio, j) { nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); i = 0; next_block: @@ -8427,7 +8427,6 @@ next_block: i++; goto next_block; } - bvec++; } } From 6cd7ce4935485c203e0bda815dbabb50e30f31e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:49 +0100 Subject: [PATCH 36/45] btrfs: don't access the bio directly in btrfs_csum_one_bio Use bio_for_each_segment_all to iterate over the segments instead. This requires a bit of reshuffling so that we only look up the ordered item once inside the bio_for_each_segment_all loop.
Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 43418c08b110..fad3804fc335 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -447,13 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; int index; int nr_sectors; - int i; + int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; @@ -470,17 +469,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (contig) offset = file_start; else - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = 0; /* shut up gcc */ - ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, j) { if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!ordered) { + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); /* Logic error */ + } + data = kmap_atomic(bvec->bv_page); nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, @@ -529,9 +531,6 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, } kunmap_atomic(data); - - bio_index++; - bvec++; } this_sum_bytes = 0; btrfs_add_ordered_sum(inode, ordered, sums); From 81381053d094a3098d27eba7bb9b9aaf0e197a4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:50 +0100 Subject: [PATCH 37/45] btrfs: use bi_size Instead of using bi_vcnt to calculate it. 
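A brief note on why (my reading, not spelled out in the commit message): bi_vcnt counts the bvecs attached to the bio and only converts to a byte count if every segment covers a full page, whereas bi_iter.bi_size is the authoritative byte count, e.g.:

	/* sketch only: the two ways of sizing a bio */
	unsigned long approx = bio->bi_vcnt * PAGE_SIZE;	/* wrong for partial-page segments */
	unsigned long exact = bio->bi_iter.bi_size;		/* bytes the bio actually covers */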
Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/compression.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b060465c4fad..1a618cb5370b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -562,7 +562,6 @@ next: * * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages - * bio->bi_vcnt is a count of pages * * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls @@ -574,7 +573,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map_tree *em_tree; struct compressed_bio *cb; struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE; unsigned long compressed_len; unsigned long nr_pages; unsigned long pg_index; @@ -619,7 +617,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, free_extent_map(em); em = NULL; - cb->len = uncompressed_len; + cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; @@ -647,8 +645,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ - uncompressed_len = bio->bi_vcnt * PAGE_SIZE; - cb->len = uncompressed_len; + cb->len = bio->bi_iter.bi_size; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); if (!comp_bio) From 2a4d0c9068a6cbd94086953e45625505891490b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:51 +0100 Subject: [PATCH 38/45] btrfs: calculate end of bio offset properly Use the bvec offset and len members to prepare for multipage bvecs. Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1a618cb5370b..ae4c000cbffc 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -445,6 +445,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, return 0; } +static u64 bio_end_offset(struct bio *bio) +{ + struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + return page_offset(last->bv_page) + last->bv_len + last->bv_offset; +} + static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) @@ -463,8 +470,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 end; int misses = 0; - page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; - last_offset = (page_offset(page) + PAGE_SIZE); + last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; From 4989d277eb4b36cc1aacf72725b53977c6b5260d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:52 +0100 Subject: [PATCH 39/45] btrfs: refactor __btrfs_lookup_bio_sums to use bio_for_each_segment_all Rework the loop a little bit to use the generic bio_for_each_segment_all helper for iterating over the bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index fad3804fc335..5e74178ba9d9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -163,7 +163,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -176,8 +176,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 page_bytes_left; u32 diff; int nblocks; - int bio_index = 0; - int count; + int count = 0, i; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); path = btrfs_alloc_path(); @@ -223,8 +222,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; - page_bytes_left = bvec->bv_len; - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { + page_bytes_left = bvec->bv_len; + if (count) + goto next; + if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, @@ -285,29 +287,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; - +next: while (count--) { disk_bytenr += root->sectorsize; offset += root->sectorsize; page_bytes_left -= root->sectorsize; - if (!page_bytes_left) { - bio_index++; - /* - * make sure we're still inside the - * bio before we update page_bytes_left - */ - if (bio_index >= bio->bi_vcnt) { - WARN_ON_ONCE(count); - goto done; - } - bvec++; - page_bytes_left = bvec->bv_len; - } - + if (!page_bytes_left) + break; /* move to next bio */ } } -done: + WARN_ON_ONCE(count); btrfs_free_path(path); return 0; } From 1621f8f3f9cdfab43822aa54a84c2a0a5111b936 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 25 Nov 2016 09:07:53 +0100 Subject: [PATCH 40/45] btrfs: use bio_for_each_segment_all in __btrfsic_submit_bio And remove the bogus check for a NULL return value from kmap, which can't happen. While we're at it: I don't think that kmapping up to 256 pages will work without deadlocks on highmem machines; a better idea would be to use vm_map_ram to map all of them into a single virtual address range. Incidentally that would also simplify the code a lot.
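To make the vm_map_ram suggestion concrete, a hedged sketch (not code from this series; the 4.x-era prototype takes a pgprot_t, and NUMA_NO_NODE leaves node placement to the allocator):

	#include <linux/vmalloc.h>

	/* map all pages of a bio into one contiguous kernel virtual range */
	static void *map_pages_contig(struct page **pages, unsigned int count)
	{
		return vm_map_ram(pages, count, NUMA_NO_NODE, PAGE_KERNEL);
	}

	/* undo with the same page count that was mapped */
	static void unmap_pages_contig(void *addr, unsigned int count)
	{
		vm_unmap_ram(addr, count);
	}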
Signed-off-by: Christoph Hellwig Reviewed-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/check-integrity.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index a6f657ffa633..86f681fd200d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2819,10 +2819,11 @@ static void __btrfsic_submit_bio(struct bio *bio) * btrfsic_mount(), this might return NULL */ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) { + (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i; u64 dev_bytenr; u64 cur_bytenr; + struct bio_vec *bvec; int bio_is_patched; char **mapped_datav; @@ -2840,32 +2841,23 @@ static void __btrfsic_submit_bio(struct bio *bio) if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - for (i = 0; i < bio->bi_vcnt; i++) { - BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); - if (!mapped_datav[i]) { - while (i > 0) { - i--; - kunmap(bio->bi_io_vec[i].bv_page); - } - kfree(mapped_datav); - goto leave; - } + + bio_for_each_segment_all(bvec, bio, i) { + BUG_ON(bvec->bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec->bv_page); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bio->bi_io_vec[i].bv_len, - bio->bi_io_vec[i].bv_offset); - cur_bytenr += bio->bi_io_vec[i].bv_len; + i, cur_bytenr, bvec->bv_len, bvec->bv_offset); + cur_bytenr += bvec->bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, NULL, bio->bi_opf); - while (i > 0) { - i--; - kunmap(bio->bi_io_vec[i].bv_page); - } + bio_for_each_segment_all(bvec, bio, i) + kunmap(bvec->bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & From 1d2beaa95b307db5aacd527065d16ed48854d04e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Oct 2016 09:31:26 +0800 Subject: [PATCH 41/45] btrfs: qgroup: Add comments explaining how btrfs qgroup works Add an explanation of how btrfs qgroups work. Qgroup is split into 3 main phases: 1) Reserve To ensure qgroup doesn't exceed its limit 2) Trace To inform qgroup which extents to trace 3) Account Calculate qgroup number changes for each traced extent. This should save quite some time for new developers. Signed-off-by: Qu Wenruo Reviewed-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/qgroup.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 1bc64c864b62..a72bf2192757 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,34 @@ #include "ulist.h" #include "delayed-ref.h" +/* + * Btrfs qgroup overview + * + * Btrfs qgroup splits into 3 main part: + * 1) Reserve + * Reserve metadata/data space for incoming operations + * Affect how qgroup limit works + * + * 2) Trace + * Tell btrfs qgroup to trace dirty extents. + * + * Dirty extents including: + * - Newly allocated extents + * - Extents going to be deleted (in this trans) + * - Extents whose owner is going to be modified + * + * This is the main part affects whether qgroup numbers will stay + * consistent. + * Btrfs qgroup can trace clean extents and won't cause any problem, + * but it will consume extra CPU time, it should be avoided if possible. 
+ *
+ * 3) Account
+ *    Btrfs qgroup will update its numbers, based on the dirty extents
+ *    traced in the previous step.
+ *
+ *    Normally done at qgroup rescan and transaction commit time.
+ */
+
 /*
  * Record a dirty extent, and inform qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.

From 50b3e040b7c092c3c157f3febaaac77038e9f6fd Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 18 Oct 2016 09:31:27 +0800
Subject: [PATCH 42/45] btrfs: qgroup: Rename functions to make them follow
 reserve,trace,account steps

Rename btrfs_qgroup_insert_dirty_extent(_nolock) to
btrfs_qgroup_trace_extent(_nolock), according to the new
reserve/trace/account naming scheme.

Signed-off-by: Qu Wenruo
Reviewed-and-Tested-by: Goldwyn Rodrigues
Signed-off-by: David Sterba
---
 fs/btrfs/delayed-ref.c       |  2 +-
 fs/btrfs/extent-tree.c       |  6 +++---
 fs/btrfs/qgroup.c            |  8 ++++----
 fs/btrfs/qgroup.h            | 13 +++++++------
 fs/btrfs/relocation.c        |  2 +-
 fs/btrfs/tree-log.c          |  2 +-
 include/trace/events/btrfs.h |  2 +-
 7 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8d93854a4b4f..a1cd0da72c94 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	qrecord->num_bytes = num_bytes;
 	qrecord->old_roots = NULL;
 
-	if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
+	if(btrfs_qgroup_trace_extent_nolock(fs_info,
 			delayed_refs, qrecord))
 		kfree(qrecord);
 }

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c17b0d1b081f..1ad5643a507b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8571,8 +8571,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 
 		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
 
-		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
-				bytenr, num_bytes, GFP_NOFS);
+		ret = btrfs_qgroup_trace_extent(trans, root->fs_info,
+				bytenr, num_bytes, GFP_NOFS);
 		if (ret)
 			return ret;
 	}
@@ -8721,7 +8721,7 @@ walk_down:
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
-			ret = btrfs_qgroup_insert_dirty_extent(trans,
+			ret = btrfs_qgroup_trace_extent(trans,
 					root->fs_info, child_bytenr,
 					root->nodesize, GFP_NOFS);
 			if (ret)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 50b32cb25bdb..87ab7387680f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1457,7 +1457,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 				struct btrfs_delayed_ref_root *delayed_refs,
 				struct btrfs_qgroup_extent_record *record)
 {
@@ -1467,7 +1467,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
 	u64 bytenr = record->bytenr;
 
 	assert_spin_locked(&delayed_refs->lock);
-	trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
+	trace_btrfs_qgroup_trace_extent(fs_info, record);
 
 	while (*p) {
 		parent_node = *p;
@@ -1486,7 +1486,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
 		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
 		gfp_t gfp_flag)
 {
@@ -1509,7 +1509,7 @@ int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
 	record->old_roots = NULL;
 
 	spin_lock(&delayed_refs->lock);
-	ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
+	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs,
						      record);
 	spin_unlock(&delayed_refs->lock);
 	if (ret > 0)

diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index a72bf2192757..9303e09c71dc 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -93,8 +93,8 @@ struct btrfs_delayed_extent_op;
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 /*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent; its info is recorded in @record,
+ * so qgroup can account it at commit trans time.
 *
 * No lock version, caller must acquire delayed ref lock and allocate memory.
 *
@@ -102,14 +102,15 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 * Return >0 for existing record, caller can free @record safely.
 * Error is not possible
 */
-int btrfs_qgroup_insert_dirty_extent_nolock(
+int btrfs_qgroup_trace_extent_nolock(
		struct btrfs_fs_info *fs_info,
		struct btrfs_delayed_ref_root *delayed_refs,
		struct btrfs_qgroup_extent_record *record);
 
 /*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes, so qgroup can account it at commit trans time.
 *
 * Better encapsulated version.
 *
@@ -117,7 +118,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(
 * Return <0 for error, like memory allocation failure or invalid parameter
 * (NULL trans)
 */
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
		gfp_t gfp_flag);

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 26f6c5ac879e..c430f2f5be24 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4012,7 +4012,7 @@ static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
 		if (btrfs_file_extent_type(path->nodes[0], fi) !=
				BTRFS_FILE_EXTENT_REG)
			goto next;
-		ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+		ret = btrfs_qgroup_trace_extent(trans, fs_info,
			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
			GFP_NOFS);

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3d33c4e41e5f..e0478f51cf16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -689,7 +689,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
		 * as the owner of the file extent changed from log tree
		 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
		 */
-		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+		ret = btrfs_qgroup_trace_extent(trans, root->fs_info,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index e030d6f6c19a..e61bbc3b82d5 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1406,7 +1406,7 @@ DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
 
	TP_ARGS(fs_info, rec)
 );
 
-DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
+DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
 
	TP_PROTO(struct btrfs_fs_info *fs_info,
		 struct btrfs_qgroup_extent_record *rec),
From 33d1f05ccb698aa92db3e64a639ce523cf18a408 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 18 Oct 2016 09:31:28 +0800
Subject: [PATCH 43/45] btrfs: Export and move leaf/subtree qgroup helpers to
 qgroup.c

Move account_shared_subtree() to qgroup.c and rename it to
btrfs_qgroup_trace_subtree().

Do the same thing for account_leaf_items() and rename it to
btrfs_qgroup_trace_leaf_items().

Since all these functions are only for qgroup, moving them to qgroup.c
and exporting them is more appropriate.

Signed-off-by: Qu Wenruo
Reviewed-and-Tested-by: Goldwyn Rodrigues
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 220 +----------------------------------------
 fs/btrfs/qgroup.c      | 213 +++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h      |  23 +++++
 3 files changed, 239 insertions(+), 217 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1ad5643a507b..af0bcbd8302f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8538,220 +8538,6 @@ reada:
 	wc->reada_slot = slot;
 }
 
-static int account_leaf_items(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct extent_buffer *eb)
-{
-	int nr = btrfs_header_nritems(eb);
-	int i, extent_type, ret;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	u64 bytenr, num_bytes;
-
-	/* We can be called directly from walk_up_proc() */
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
-		return 0;
-
-	for (i = 0; i < nr; i++) {
-		btrfs_item_key_to_cpu(eb, &key, i);
-
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-
-		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
-		/* filter out non qgroup-accountable extents */
-		extent_type = btrfs_file_extent_type(eb, fi);
-
-		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-			continue;
-
-		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (!bytenr)
-			continue;
-
-		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
-		ret = btrfs_qgroup_trace_extent(trans, root->fs_info,
-				bytenr, num_bytes, GFP_NOFS);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-/*
- * Walk up the tree from the bottom, freeing leaves and any interior
- * nodes which have had all slots visited. If a node (leaf or
- * interior) is freed, the node above it will have it's slot
- * incremented. The root node will never be freed.
- *
- * At the end of this function, we should have a path which has all
- * slots incremented to the next position for a search. If we need to
- * read a new node it will be NULL and the node above it will have the
- * correct slot selected for a later read.
- *
- * If we increment the root nodes slot counter past the number of
- * elements, 1 is returned to signal completion of the search.
- */
-static int adjust_slots_upwards(struct btrfs_root *root,
-				struct btrfs_path *path, int root_level)
-{
-	int level = 0;
-	int nr, slot;
-	struct extent_buffer *eb;
-
-	if (root_level == 0)
-		return 1;
-
-	while (level <= root_level) {
-		eb = path->nodes[level];
-		nr = btrfs_header_nritems(eb);
-		path->slots[level]++;
-		slot = path->slots[level];
-		if (slot >= nr || level == 0) {
-			/*
-			 * Don't free the root - we will detect this
-			 * condition after our loop and return a
-			 * positive value for caller to stop walking the tree.
-			 */
-			if (level != root_level) {
-				btrfs_tree_unlock_rw(eb, path->locks[level]);
-				path->locks[level] = 0;
-
-				free_extent_buffer(eb);
-				path->nodes[level] = NULL;
-				path->slots[level] = 0;
-			}
-		} else {
-			/*
-			 * We have a valid slot to walk back down
-			 * from. Stop here so caller can process these
-			 * new nodes.
-			 */
-			break;
-		}
-
-		level++;
-	}
-
-	eb = path->nodes[root_level];
-	if (path->slots[root_level] >= btrfs_header_nritems(eb))
-		return 1;
-
-	return 0;
-}
-
-/*
- * root_eb is the subtree root and is locked before this function is called.
- */
-static int account_shared_subtree(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  struct extent_buffer *root_eb,
-				  u64 root_gen,
-				  int root_level)
-{
-	int ret = 0;
-	int level;
-	struct extent_buffer *eb = root_eb;
-	struct btrfs_path *path = NULL;
-
-	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
-	BUG_ON(root_eb == NULL);
-
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
-		return 0;
-
-	if (!extent_buffer_uptodate(root_eb)) {
-		ret = btrfs_read_buffer(root_eb, root_gen);
-		if (ret)
-			goto out;
-	}
-
-	if (root_level == 0) {
-		ret = account_leaf_items(trans, root, root_eb);
-		goto out;
-	}
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/*
-	 * Walk down the tree.  Missing extent blocks are filled in as
-	 * we go. Metadata is accounted every time we read a new
-	 * extent block.
-	 *
-	 * When we reach a leaf, we account for file extent items in it,
-	 * walk back up the tree (adjusting slot pointers as we go)
-	 * and restart the search process.
-	 */
-	extent_buffer_get(root_eb); /* For path */
-	path->nodes[root_level] = root_eb;
-	path->slots[root_level] = 0;
-	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
-walk_down:
-	level = root_level;
-	while (level >= 0) {
-		if (path->nodes[level] == NULL) {
-			int parent_slot;
-			u64 child_gen;
-			u64 child_bytenr;
-
-			/* We need to get child blockptr/gen from
-			 * parent before we can read it. */
-			eb = path->nodes[level + 1];
-			parent_slot = path->slots[level + 1];
-			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
-			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
-
-			eb = read_tree_block(root, child_bytenr, child_gen);
-			if (IS_ERR(eb)) {
-				ret = PTR_ERR(eb);
-				goto out;
-			} else if (!extent_buffer_uptodate(eb)) {
-				free_extent_buffer(eb);
-				ret = -EIO;
-				goto out;
-			}
-
-			path->nodes[level] = eb;
-			path->slots[level] = 0;
-
-			btrfs_tree_read_lock(eb);
-			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
-			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
-			ret = btrfs_qgroup_trace_extent(trans,
-					root->fs_info, child_bytenr,
-					root->nodesize, GFP_NOFS);
-			if (ret)
-				goto out;
-		}
-
-		if (level == 0) {
-			ret = account_leaf_items(trans, root, path->nodes[level]);
-			if (ret)
-				goto out;
-
-			/* Nonzero return here means we completed our search */
-			ret = adjust_slots_upwards(root, path, root_level);
-			if (ret)
-				break;
-
-			/* Restart search with new slots */
-			goto walk_down;
-		}
-
-		level--;
-	}
-
-	ret = 0;
-out:
-	btrfs_free_path(path);
-
-	return ret;
-}
-
 /*
  * helper to process tree block while walking down the tree.
  *
@@ -8980,8 +8766,8 @@ skip:
 			}
 
 			if (need_account) {
-				ret = account_shared_subtree(trans, root, next,
-							     generation, level - 1);
+				ret = btrfs_qgroup_trace_subtree(trans, root, next,
+								 generation, level - 1);
 				if (ret) {
 					btrfs_err_rl(root->fs_info,
 						"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
@@ -9078,7 +8864,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 			else
 				ret = btrfs_dec_ref(trans, root, eb, 0);
 			BUG_ON(ret); /* -ENOMEM */
-			ret = account_leaf_items(trans, root, eb);
+			ret = btrfs_qgroup_trace_leaf_items(trans, root, eb);
 			if (ret) {
 				btrfs_err_rl(root->fs_info,
 					"error %d accounting leaf items. Quota is out of sync, rescan required.",
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 87ab7387680f..605a3227980a 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1517,6 +1517,219 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *eb)
+{
+	int nr = btrfs_header_nritems(eb);
+	int i, extent_type, ret;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	u64 bytenr, num_bytes;
+
+	/* We can be called directly from walk_up_proc() */
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
+		return 0;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		/* filter out non qgroup-accountable extents */
+		extent_type = btrfs_file_extent_type(eb, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (!bytenr)
+			continue;
+
+		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = btrfs_qgroup_trace_extent(trans, root->fs_info,
+				bytenr, num_bytes, GFP_NOFS);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have it's slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+				struct btrfs_path *path, int root_level)
+{
+	int level = 0;
+	int nr, slot;
+	struct extent_buffer *eb;
+
+	if (root_level == 0)
+		return 1;
+
+	while (level <= root_level) {
+		eb = path->nodes[level];
+		nr = btrfs_header_nritems(eb);
+		path->slots[level]++;
+		slot = path->slots[level];
+		if (slot >= nr || level == 0) {
+			/*
+			 * Don't free the root - we will detect this
+			 * condition after our loop and return a
+			 * positive value for caller to stop walking the tree.
+			 */
+			if (level != root_level) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+
+				free_extent_buffer(eb);
+				path->nodes[level] = NULL;
+				path->slots[level] = 0;
+			}
+		} else {
+			/*
+			 * We have a valid slot to walk back down
+			 * from. Stop here so caller can process these
+			 * new nodes.
+			 */
+			break;
+		}
+
+		level++;
+	}
+
+	eb = path->nodes[root_level];
+	if (path->slots[root_level] >= btrfs_header_nritems(eb))
+		return 1;
+
+	return 0;
+}
+
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *root_eb,
+			       u64 root_gen, int root_level)
+{
+	int ret = 0;
+	int level;
+	struct extent_buffer *eb = root_eb;
+	struct btrfs_path *path = NULL;
+
+	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+	BUG_ON(root_eb == NULL);
+
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags))
+		return 0;
+
+	if (!extent_buffer_uptodate(root_eb)) {
+		ret = btrfs_read_buffer(root_eb, root_gen);
+		if (ret)
+			goto out;
+	}
+
+	if (root_level == 0) {
+		ret = btrfs_qgroup_trace_leaf_items(trans, root, root_eb);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Walk down the tree.  Missing extent blocks are filled in as
+	 * we go. Metadata is accounted every time we read a new
+	 * extent block.
+	 *
+	 * When we reach a leaf, we account for file extent items in it,
+	 * walk back up the tree (adjusting slot pointers as we go)
+	 * and restart the search process.
+	 */
+	extent_buffer_get(root_eb); /* For path */
+	path->nodes[root_level] = root_eb;
+	path->slots[root_level] = 0;
+	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+	level = root_level;
+	while (level >= 0) {
+		if (path->nodes[level] == NULL) {
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			/*
+			 * We need to get child blockptr/gen from parent before
+			 * we can read it.
+			 */
+			eb = path->nodes[level + 1];
+			parent_slot = path->slots[level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+			eb = read_tree_block(root, child_bytenr, child_gen);
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+				goto out;
+			} else if (!extent_buffer_uptodate(eb)) {
+				free_extent_buffer(eb);
+				ret = -EIO;
+				goto out;
+			}
+
+			path->nodes[level] = eb;
+			path->slots[level] = 0;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+			ret = btrfs_qgroup_trace_extent(trans,
+					root->fs_info, child_bytenr,
+					root->nodesize, GFP_NOFS);
+			if (ret)
+				goto out;
+		}
+
+		if (level == 0) {
+			ret = btrfs_qgroup_trace_leaf_items(trans, root,
+					path->nodes[level]);
+			if (ret)
+				goto out;
+
+			/* Nonzero return here means we completed our search */
+			ret = adjust_slots_upwards(root, path, root_level);
+			if (ret)
+				break;
+
+			/* Restart search with new slots */
+			goto walk_down;
+		}
+
+		level--;
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 #define UPDATE_NEW	0
 #define UPDATE_OLD	1
 /*

diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 9303e09c71dc..99c879dbedc1 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -122,6 +122,29 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
 		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
 		gfp_t gfp_flag);
 
+/*
+ * Inform qgroup to trace all the file extent items in a leaf.
+ *
+ * Return 0 for success
+ * Return <0 for error (ENOMEM)
+ */
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *eb);
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation (tree block swap) and subvolume deletion.
+ *
+ * Return 0 for success
+ * Return <0 for error (ENOMEM or a tree search error)
+ */
+int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *root_eb,
+			       u64 root_gen, int root_level);
 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info,

From 824d8dff8846533c9f1f9b1eabb0c03959e989ca Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 18 Oct 2016 09:31:29 +0800
Subject: [PATCH 44/45] btrfs: qgroup: Fix qgroup data leaking by using subtree
 tracing

Commit 62b99540a1d91e464 (btrfs: relocation: Fix leaking qgroups numbers
on data extents) only partly fixes the problem.

The previous fix traces all new data extents at transaction commit time
when balance finishes. However, balance is not done in one large
transaction; every path replacement can happen in its own transaction.
This makes the fix useless if a transaction commits during relocation.

For example:

relocate_block_group()
|-merge_reloc_roots()
|  |- merge_reloc_root()
|     |- btrfs_start_transaction()         <- Trans X
|     |- replace_path()                    <- Causes leak
|     |- btrfs_end_transaction_throttle()  <- Trans X commits here,
|     |                                       leak not fixed
|     |
|     |- btrfs_start_transaction()         <- Trans Y
|     |- replace_path()                    <- Causes leak
|     |- btrfs_end_transaction_throttle()  <- Trans Y ends,
|                                             but is not committed
|-btrfs_join_transaction()                 <- Still trans Y
|-qgroup_fix()                             <- Only fixes the data leak
|                                             in trans Y
|-btrfs_commit_transaction()               <- Trans Y commits

In that case, the qgroup fixup can only fix the data leak in trans Y;
the data leak in trans X is beyond repair.

So the correct fix must happen in the same transaction as
replace_path().

This patch does that by tracing both subtrees of the tree block swap,
ensuring the leak and its fix happen in the same transaction, so
nothing leaks again.

Reported-by: Goldwyn Rodrigues
Signed-off-by: Qu Wenruo
Reviewed-and-Tested-by: Goldwyn Rodrigues
Signed-off-by: David Sterba
---
 fs/btrfs/relocation.c | 119 ++++++++----------------------------------
 1 file changed, 23 insertions(+), 96 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index c430f2f5be24..3dc7232aa038 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1900,6 +1900,29 @@ again:
 		path->lowest_level = 0;
 		BUG_ON(ret);
 
+		/*
+		 * Inform qgroup to trace both subtrees.
+		 *
+		 * We must trace both trees.
+		 * 1) Tree reloc subtree
+		 *    If not traced, we will leak data numbers
+		 * 2) Fs subtree
+		 *    If not traced, we will double count old data and tree
+		 *    block numbers, if the current trans doesn't free the
+		 *    data reloc tree inode.
+		 */
+		ret = btrfs_qgroup_trace_subtree(trans, src, parent,
+				btrfs_header_generation(parent),
+				btrfs_header_level(parent));
+		if (ret < 0)
+			break;
+		ret = btrfs_qgroup_trace_subtree(trans, dest,
+				path->nodes[level],
+				btrfs_header_generation(path->nodes[level]),
+				btrfs_header_level(path->nodes[level]));
+		if (ret < 0)
+			break;
+
 		/*
 		 * swap blocks in fs tree and reloc tree.
 		 */
@@ -3949,90 +3972,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	return 0;
 }
 
-/*
- * Qgroup fixer for data chunk relocation.
- * The data relocation is done in the following steps
- * 1) Copy data extents into data reloc tree
- * 2) Create tree reloc tree(special snapshot) for related subvolumes
- * 3) Modify file extents in tree reloc tree
- * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
- *
- * The problem is, data and tree reloc tree are not accounted to qgroup,
- * and 4) will only info qgroup to track tree blocks change, not file extents
- * in the tree blocks.
- *
- * The good news is, related data extents are all in data reloc tree, so we
- * only need to info qgroup to track all file extents in data reloc tree
- * before commit trans.
- */
-static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
-					     struct reloc_control *rc)
-{
-	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-	struct inode *inode = rc->data_inode;
-	struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	int ret = 0;
-
-	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
-		return 0;
-
-	/*
-	 * Only for stage where we update data pointers the qgroup fix is
-	 * valid.
-	 * For MOVING_DATA stage, we will miss the timing of swapping tree
-	 * blocks, and won't fix it.
-	 */
-	if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
-		return 0;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	key.objectid = btrfs_ino(inode);
-	key.type = BTRFS_EXTENT_DATA_KEY;
-	key.offset = 0;
-
-	ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
-	while (1) {
-		struct btrfs_file_extent_item *fi;
-
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid > btrfs_ino(inode))
-			break;
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			goto next;
-		fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				    struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(path->nodes[0], fi) !=
-				BTRFS_FILE_EXTENT_REG)
-			goto next;
-		ret = btrfs_qgroup_trace_extent(trans, fs_info,
-			btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
-			btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
-			GFP_NOFS);
-		if (ret < 0)
-			break;
-next:
-		ret = btrfs_next_item(data_reloc_root, path);
-		if (ret < 0)
-			break;
-		if (ret > 0) {
-			ret = 0;
-			break;
-		}
-	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
 	struct rb_root blocks = RB_ROOT;
@@ -4223,13 +4162,6 @@ restart:
 		err = PTR_ERR(trans);
 		goto out_free;
 	}
-	ret = qgroup_fix_relocated_data_extents(trans, rc);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
-		if (!err)
-			err = ret;
-		goto out_free;
-	}
 	btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
@@ -4635,11 +4567,6 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		err = PTR_ERR(trans);
 		goto out_free;
 	}
-	err = qgroup_fix_relocated_data_extents(trans, rc);
-	if (err < 0) {
-		btrfs_abort_transaction(trans, err);
-		goto out_free;
-	}
 	err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	kfree(rc);

From 1d57ee941692d0cc928526e21a1557b2ae3e11db Mon Sep 17 00:00:00 2001
From: Wang Xiaoguang
Date: Wed, 26 Oct 2016 18:07:33 +0800
Subject: [PATCH 45/45] btrfs: improve delayed refs iterations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This issue was found when I tried to
delete a heavily reflinked file. While deleting such a file, other
transaction operations do not get a chance to make progress; for
example, start_transaction() will block in wait_current_trans(root) for
a long time, and sometimes it even triggers soft lockups. The time
taken to delete such a heavily reflinked file is also very large, often
hundreds of seconds.

Using perf top, it reports:

   PerfTop:    7416 irqs/sec  kernel:99.8%  exact:  0.0% [4000Hz cpu-clock],  (all, 4 CPUs)
---------------------------------------------------------------------------------------
    84.37%  [btrfs]             [k] __btrfs_run_delayed_refs.constprop.80
    11.02%  [kernel]            [k] delay_tsc
     0.79%  [kernel]            [k] _raw_spin_unlock_irq
     0.78%  [kernel]            [k] _raw_spin_unlock_irqrestore
     0.45%  [kernel]            [k] do_raw_spin_lock
     0.18%  [kernel]            [k] __slab_alloc

It seems __btrfs_run_delayed_refs() takes most of the CPU time. After
some debugging, I found select_delayed_ref() to be the cause: in our
case the delayed head is full of BTRFS_DROP_DELAYED_REF nodes, but
select_delayed_ref() first iterates the whole node list trying to find
BTRFS_ADD_DELAYED_REF nodes, which is obviously a disaster in this case
and wastes much time.

To fix this issue, introduce a new ref_add_list in struct
btrfs_delayed_ref_head; then, in select_delayed_ref(), if this list is
not empty, we can directly use its nodes.

With this patch, it took only about 10~15 seconds to delete the same
file. Using perf top again, it now reports:

   PerfTop:    2734 irqs/sec  kernel:99.5%  exact:  0.0% [4000Hz cpu-clock],  (all, 4 CPUs)
----------------------------------------------------------------------------------------
    20.74%  [kernel]  [k] _raw_spin_unlock_irqrestore
    16.33%  [kernel]  [k] __slab_alloc
     5.41%  [kernel]  [k] lock_acquired
     4.42%  [kernel]  [k] lock_acquire
     4.05%  [kernel]  [k] lock_release
     3.37%  [kernel]  [k] _raw_spin_unlock_irq

This patch also helps for normal files: at least we no longer need to
iterate the whole list to find BTRFS_ADD_DELAYED_REF nodes.
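To make the complexity argument concrete, the sketch below models the
before/after selection logic in plain C. The types and names are illustrative
stand-ins, not the kernel structures; it only shows why linking the ADD refs
on a second list turns an O(n) scan into an O(1) pick:

	/* Illustrative stand-ins for the kernel structures (not the real ones). */
	enum { DROP_REF, ADD_REF };

	struct node {
		int action;
		struct node *next;	/* models head->ref_list linkage */
	};

	struct head {
		struct node *ref_list;		/* all refs, DROPs typically first */
		struct node *ref_add_list;	/* only ADD refs, possibly NULL */
	};

	/* Old behaviour: walk every queued ref until an ADD is found - O(n). */
	static struct node *select_ref_old(struct head *h)
	{
		for (struct node *n = h->ref_list; n; n = n->next)
			if (n->action == ADD_REF)
				return n;
		return h->ref_list;	/* no ADD: fall back to the first ref */
	}

	/* New behaviour: ADD refs are also linked on ref_add_list - O(1). */
	static struct node *select_ref_new(struct head *h)
	{
		if (h->ref_add_list)
			return h->ref_add_list;
		return h->ref_list;
	}

With many thousands of DROP refs queued for a single extent, the heavily
reflinked case above, the old loop runs once per processed ref, giving
quadratic behaviour overall, which is consistent with the first perf profile.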
Signed-off-by: Wang Xiaoguang
Reviewed-by: Liu Bo
Tested-by: Holger Hoffstätte
Signed-off-by: David Sterba
---
 fs/btrfs/delayed-ref.c | 18 ++++++++++++++++++
 fs/btrfs/delayed-ref.h |  8 ++++++++
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 15 +++++++++------
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a1cd0da72c94..ef724a5fc30e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -189,6 +189,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 	} else {
 		assert_spin_locked(&head->lock);
 		list_del(&ref->list);
+		if (!list_empty(&ref->add_list))
+			list_del(&ref->add_list);
 	}
 	ref->in_tree = 0;
 	btrfs_put_delayed_ref(ref);
@@ -431,6 +433,15 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
 			exist->action = ref->action;
 			mod = -exist->ref_mod;
 			exist->ref_mod = ref->ref_mod;
+			if (ref->action == BTRFS_ADD_DELAYED_REF)
+				list_add_tail(&exist->add_list,
+					      &href->ref_add_list);
+			else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+				ASSERT(!list_empty(&exist->add_list));
+				list_del(&exist->add_list);
+			} else {
+				ASSERT(0);
+			}
 		} else
 			mod = -ref->ref_mod;
 	}
@@ -444,6 +455,8 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
 
 add_tail:
 	list_add_tail(&ref->list, &href->ref_list);
+	if (ref->action == BTRFS_ADD_DELAYED_REF)
+		list_add_tail(&ref->add_list, &href->ref_add_list);
 	atomic_inc(&root->num_entries);
 	trans->delayed_ref_updates++;
 	spin_unlock(&href->lock);
@@ -590,6 +603,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
 	INIT_LIST_HEAD(&head_ref->ref_list);
+	INIT_LIST_HEAD(&head_ref->ref_add_list);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
 	head_ref->qgroup_reserved = 0;
@@ -671,6 +685,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 	ref->seq = seq;
+	INIT_LIST_HEAD(&ref->list);
+	INIT_LIST_HEAD(&ref->add_list);
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
 	full_ref->parent = parent;
@@ -726,6 +742,8 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->is_head = 0;
 	ref->in_tree = 1;
 	ref->seq = seq;
+	INIT_LIST_HEAD(&ref->list);
+	INIT_LIST_HEAD(&ref->add_list);
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
 	full_ref->parent = parent;

diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 43f3629760e9..dba97842b47a 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -42,6 +42,12 @@ struct btrfs_delayed_ref_node {
 	/*data/tree ref use list, stored in ref_head->ref_list. */
 	struct list_head list;
+	/*
+	 * If action is BTRFS_ADD_DELAYED_REF, also link this node to
+	 * ref_head->ref_add_list, then we do not need to iterate the
+	 * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+	 */
+	struct list_head add_list;
 
 	/* the starting bytenr of the extent */
 	u64 bytenr;
@@ -99,6 +105,8 @@ struct btrfs_delayed_ref_head {
 	spinlock_t lock;
 	struct list_head ref_list;
+	/* accumulate BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
+	struct list_head ref_add_list;
 
 	struct rb_node href_node;

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8677d29efade..811662cce977 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4344,6 +4344,8 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				 list) {
 			ref->in_tree = 0;
 			list_del(&ref->list);
+			if (!list_empty(&ref->add_list))
+				list_del(&ref->add_list);
 			atomic_dec(&delayed_refs->num_entries);
 			btrfs_put_delayed_ref(ref);
 		}

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index af0bcbd8302f..0d80136206b1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2454,13 +2454,14 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
 	 * the extent item from the extent tree, when there still are references
 	 * to add, which would fail because they would not find the extent item.
 	 */
-	list_for_each_entry(ref, &head->ref_list, list) {
-		if (ref->action == BTRFS_ADD_DELAYED_REF)
-			return ref;
-	}
+	if (!list_empty(&head->ref_add_list))
+		return list_first_entry(&head->ref_add_list,
+				struct btrfs_delayed_ref_node, add_list);
 
-	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
-			  list);
+	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+			       list);
+	ASSERT(list_empty(&ref->add_list));
+	return ref;
 }
 
 /*
@@ -2620,6 +2621,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			actual_count++;
 			ref->in_tree = 0;
 			list_del(&ref->list);
+			if (!list_empty(&ref->add_list))
+				list_del(&ref->add_list);
 		}
 		atomic_dec(&delayed_refs->num_entries);
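One detail worth noting in the hunks above: the patch uses list_empty() on a
node's own add_list as an "is this ref linked?" test, which only works because
every ref now initializes add_list with INIT_LIST_HEAD up front. The
standalone model below demonstrates that pattern; it is a simplified userspace
rendition of the kernel's <linux/list.h> (the kernel's list_del poisons the
pointers instead of re-initializing, which is fine there because the ref is
freed right after being unlinked):

	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
	static int list_empty(const struct list_head *h) { return h->next == h; }

	static void list_add_tail(struct list_head *n, struct list_head *head)
	{
		n->prev = head->prev;
		n->next = head;
		head->prev->next = n;
		head->prev = n;
	}

	static void list_del_init(struct list_head *n)
	{
		n->prev->next = n->next;
		n->next->prev = n->prev;
		INIT_LIST_HEAD(n);	/* keeps list_empty() usable afterwards */
	}

	int main(void)
	{
		struct list_head ref_add_list, add_list_link;

		INIT_LIST_HEAD(&ref_add_list);
		INIT_LIST_HEAD(&add_list_link);	/* as add_delayed_tree_ref() now does */

		printf("linked? %d\n", !list_empty(&add_list_link));	/* 0 */
		list_add_tail(&add_list_link, &ref_add_list);
		printf("linked? %d\n", !list_empty(&add_list_link));	/* 1 */

		/* mirrors drop_delayed_ref(): only unlink if actually linked */
		if (!list_empty(&add_list_link))
			list_del_init(&add_list_link);

		return 0;
	}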