for-5.12-rc3-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmBTeBsACgkQxWXV+ddt
 WDtwcBAAoto5Pbc3Lvt0aha3qn9q/Ms9lNU3YIwTjqXV3lIRKksWCS7kQmWlFmLz
 dILhdRBg1iWVh8qbeqpL5su7yNJduypsY/ImJroukb/BzwQViFRDGy5qIc56qLH2
 OVTx4LQ0zdqVdD86Qj0mt9ilSjgXYN+J53IUjsSSyJIpgt3vVcfjCYSkFO8zBiMH
 eliRtYShzJHkjEwVWLZRzk76oTnFQEC28IdYJ4y95mYl2wCABfTU2ylSeVDTtc6O
 x+fNMHHRmde2nbsHc+0eMm7rYLXuzvyx/tY17u6A6iwEQLGjE4rXOVZ7kA93WgAd
 YTXhM/B+YFfirNh029Av/MJP+2t9YBEODAHl1tnOdM0mfvXkpimaW0jvUEhi5f6I
 ZGu5FytscsgjyUK827WL7bZKO8WMzTLQvB3ryZ9UcrHm3QbZ7xGdoBE2L86p4Euw
 LiXUALdOWeYjFKSW9WWKrtQBtdjlLQYqJt+hL0ifaGlnfoi2G+DQeKtL9ZAKH5Cu
 gcjDUewnJtYPLyDOCRjQPFcts/MD5o81qMLeEwshmZT/bNMD9JOGEppCxBWGWSCx
 dYGq04Wib/dN710i5jB1XbJboBmT2SZDyBeiKTpCXs5mECBU00uWkkO98oId1YS3
 wHu9qyGUOi2g88V27jH593/JstUYn6zyxJYIZX84mzcxOqZlKuo=
 =auMP
 -----END PGP SIGNATURE-----

Merge tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "There are still regressions being found and fixed in the zoned mode
  and subpage code, the rest are fixes for bugs reported by users.

  Regressions:

   - subpage block support:
      - readahead works on the proper block size
      - fix last page zeroing

   - zoned mode:
      - linked list corruption for tree log

  Fixes:

   - qgroup leak after falloc failure

   - tree mod log and backref resolving:
      - extent buffer cloning race when resolving backrefs
      - pin deleted leaves with active tree mod log users

   - drop debugging flag from slab cache"

* tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: always pin deleted leaves when there are active tree mod log users
  btrfs: fix race when cloning extent buffer during rewind of an old root
  btrfs: fix slab cache flags for free space tree bitmap
  btrfs: subpage: make readahead work properly
  btrfs: subpage: fix wild pointer access during metadata read failure
  btrfs: zoned: fix linked list corruption after log root tree allocation failure
  btrfs: fix qgroup data rsv leak caused by falloc failure
  btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
  btrfs: fix wrong offset to zero out range beyond i_size
This commit is contained in:
Linus Torvalds 2021-03-18 13:38:42 -07:00
commit 81aa0968b7
6 changed files with 103 additions and 35 deletions

View File

@ -1365,7 +1365,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
"failed to read tree block %llu from get_old_root",
logical);
} else {
btrfs_tree_read_lock(old);
eb = btrfs_clone_extent_buffer(old);
btrfs_tree_read_unlock(old);
free_extent_buffer(old);
}
} else if (old_root) {

View File

@ -3323,6 +3323,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (last_ref && btrfs_header_generation(buf) == trans->transid) {
struct btrfs_block_group *cache;
bool must_pin = false;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, buf->start);
@ -3340,7 +3341,27 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
if (btrfs_is_zoned(fs_info)) {
/*
* If this is a leaf and there are tree mod log users, we may
* have recorded mod log operations that point to this leaf.
* So we must make sure no one reuses this leaf's extent before
* mod log operations are applied to a node, otherwise after
* rewinding a node using the mod log operations we get an
* inconsistent btree, as the leaf's extent may now be used as
* a node or leaf for another different btree.
* We are safe from races here because at this point no other
* node or root points to this extent buffer, so if after this
* check a new tree mod log user joins, it will not be able to
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
if (btrfs_header_level(buf) == 0) {
read_lock(&fs_info->tree_mod_log_lock);
must_pin = !list_empty(&fs_info->tree_mod_seq_list);
read_unlock(&fs_info->tree_mod_log_lock);
}
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);
pin_down_extent(trans, cache, buf->start, buf->len, 1);
btrfs_put_block_group(cache);

View File

@ -2885,6 +2885,35 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
btrfs_subpage_end_reader(fs_info, page, start, len);
}
/*
* Find extent buffer for a givne bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
*/
static struct extent_buffer *find_extent_buffer_readpage(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
struct extent_buffer *eb;
/*
* For regular sectorsize, we can use page->private to grab extent
* buffer
*/
if (fs_info->sectorsize == PAGE_SIZE) {
ASSERT(PagePrivate(page) && page->private);
return (struct extent_buffer *)page->private;
}
/* For subpage case, we need to lookup buffer radix tree */
rcu_read_lock();
eb = radix_tree_lookup(&fs_info->buffer_radix,
bytenr >> fs_info->sectorsize_bits);
rcu_read_unlock();
ASSERT(eb);
return eb;
}
/*
* after a readpage IO is done, we need to:
* clear the uptodate bits on error
@ -2996,7 +3025,7 @@ static void end_bio_extent_readpage(struct bio *bio)
} else {
struct extent_buffer *eb;
eb = (struct extent_buffer *)page->private;
eb = find_extent_buffer_readpage(fs_info, page, start);
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = mirror;
atomic_dec(&eb->io_pages);
@ -3020,7 +3049,7 @@ readpage_ok:
*/
if (page->index == end_index && i_size <= end) {
u32 zero_start = max(offset_in_page(i_size),
offset_in_page(end));
offset_in_page(start));
zero_user_segment(page, zero_start,
offset_in_page(end) + 1);

View File

@ -9008,7 +9008,7 @@ int __init btrfs_init_cachep(void)
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
SLAB_RED_ZONE, NULL);
SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep)
goto fail;
@ -9877,6 +9877,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int qgroup_released;
int ret;
memset(&stack_fi, 0, sizeof(stack_fi));
@ -9889,16 +9890,16 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
/* Encryption and other encoding is reserved and all 0 */
ret = btrfs_qgroup_release_data(inode, file_offset, len);
if (ret < 0)
return ERR_PTR(ret);
qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
if (qgroup_released < 0)
return ERR_PTR(qgroup_released);
if (trans) {
ret = insert_reserved_file_extent(trans, inode,
file_offset, &stack_fi,
true, ret);
true, qgroup_released);
if (ret)
return ERR_PTR(ret);
goto free_qgroup;
return trans;
}
@ -9909,21 +9910,35 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
extent_info.qgroup_reserved = ret;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
if (!path) {
ret = -ENOMEM;
goto free_qgroup;
}
ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
if (ret)
return ERR_PTR(ret);
goto free_qgroup;
return trans;
free_qgroup:
/*
* We have released qgroup data range at the beginning of the function,
* and normally qgroup_released bytes will be freed when committing
* transaction.
* But if we error out early, we have to free what we have released
* or we leak qgroup data reservation.
*/
btrfs_qgroup_free_refroot(inode->root->fs_info,
inode->root->root_key.objectid, qgroup_released,
BTRFS_QGROUP_RSV_DATA);
return ERR_PTR(ret);
}
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,

View File

@ -209,7 +209,7 @@ int btree_readahead_hook(struct extent_buffer *eb, int err)
/* find extent */
spin_lock(&fs_info->reada_lock);
re = radix_tree_lookup(&fs_info->reada_tree,
eb->start >> PAGE_SHIFT);
eb->start >> fs_info->sectorsize_bits);
if (re)
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@ -240,7 +240,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
zone = NULL;
spin_lock(&fs_info->reada_lock);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
logical >> PAGE_SHIFT, 1);
logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end) {
kref_get(&zone->refcnt);
spin_unlock(&fs_info->reada_lock);
@ -283,13 +283,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical,
spin_lock(&fs_info->reada_lock);
ret = radix_tree_insert(&dev->reada_zones,
(unsigned long)(zone->end >> PAGE_SHIFT),
zone);
(unsigned long)(zone->end >> fs_info->sectorsize_bits),
zone);
if (ret == -EEXIST) {
kfree(zone);
ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
logical >> PAGE_SHIFT, 1);
logical >> fs_info->sectorsize_bits, 1);
if (ret == 1 && logical >= zone->start && logical <= zone->end)
kref_get(&zone->refcnt);
else
@ -315,7 +315,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info,
u64 length;
int real_stripes;
int nzones = 0;
unsigned long index = logical >> PAGE_SHIFT;
unsigned long index = logical >> fs_info->sectorsize_bits;
int dev_replace_is_ongoing;
int have_zone = 0;
@ -497,7 +497,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
struct reada_extent *re)
{
int i;
unsigned long index = re->logical >> PAGE_SHIFT;
unsigned long index = re->logical >> fs_info->sectorsize_bits;
spin_lock(&fs_info->reada_lock);
if (--re->refcnt) {
@ -538,11 +538,12 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
static void reada_zone_release(struct kref *kref)
{
struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
struct btrfs_fs_info *fs_info = zone->device->fs_info;
lockdep_assert_held(&zone->device->fs_info->reada_lock);
lockdep_assert_held(&fs_info->reada_lock);
radix_tree_delete(&zone->device->reada_zones,
zone->end >> PAGE_SHIFT);
zone->end >> fs_info->sectorsize_bits);
kfree(zone);
}
@ -593,7 +594,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
{
int i;
unsigned long index = zone->end >> PAGE_SHIFT;
unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits;
for (i = 0; i < zone->ndevs; ++i) {
struct reada_zone *peer;
@ -628,7 +629,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
(void **)&zone, index, 1);
if (ret == 0)
break;
index = (zone->end >> PAGE_SHIFT) + 1;
index = (zone->end >> dev->fs_info->sectorsize_bits) + 1;
if (zone->locked) {
if (zone->elems > top_locked_elems) {
top_locked_elems = zone->elems;
@ -709,7 +710,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
* plugging to speed things up
*/
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_SHIFT, 1);
dev->reada_next >> fs_info->sectorsize_bits, 1);
if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
ret = reada_pick_zone(dev);
if (!ret) {
@ -718,7 +719,7 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
}
re = NULL;
ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
dev->reada_next >> PAGE_SHIFT, 1);
dev->reada_next >> fs_info->sectorsize_bits, 1);
}
if (ret == 0) {
spin_unlock(&fs_info->reada_lock);
@ -885,7 +886,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
pr_cont(" curr off %llu",
device->reada_next - zone->start);
pr_cont("\n");
index = (zone->end >> PAGE_SHIFT) + 1;
index = (zone->end >> fs_info->sectorsize_bits) + 1;
}
cnt = 0;
index = 0;
@ -910,7 +911,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
pr_cont("\n");
index = (re->logical >> PAGE_SHIFT) + 1;
index = (re->logical >> fs_info->sectorsize_bits) + 1;
if (++cnt > 15)
break;
}
@ -926,7 +927,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
if (ret == 0)
break;
if (!re->scheduled) {
index = (re->logical >> PAGE_SHIFT) + 1;
index = (re->logical >> fs_info->sectorsize_bits) + 1;
continue;
}
pr_debug("re: logical %llu size %u list empty %d scheduled %d",
@ -942,7 +943,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
}
pr_cont("\n");
index = (re->logical >> PAGE_SHIFT) + 1;
index = (re->logical >> fs_info->sectorsize_bits) + 1;
}
spin_unlock(&fs_info->reada_lock);
}

View File

@ -3169,10 +3169,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
if (btrfs_is_zoned(fs_info)) {
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
@ -3183,6 +3179,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
}
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit