btrfs: avoid blocking on space reservation when doing nowait dio writes
When doing a NOWAIT direct IO write, if we can NOCOW then it means we can proceed with the non-blocking, NOWAIT path. However reserving the metadata space and qgroup meta space can often result in blocking - flushing delalloc, waiting for ordered extents to complete, triggering transaction commits, etc, going against the semantics of a NOWAIT write. So make the NOWAIT write path try to reserve all the metadata it needs without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT then return -EAGAIN to make the caller fall back to a blocking direct IO write. This is part of a patchset comprised of the following patches: btrfs: avoid blocking on page locks with nowait dio on compressed range btrfs: avoid blocking nowait dio when locking file range btrfs: avoid double nocow check when doing nowait dio writes btrfs: stop allocating a path when checking if cross reference exists btrfs: free path at can_nocow_extent() before checking for checksum items btrfs: release path earlier at can_nocow_extent() btrfs: avoid blocking when allocating context for nowait dio read/write btrfs: avoid blocking on space reservation when doing nowait dio writes The following test was run before and after applying this patchset: $ cat io-uring-nodatacow-test.sh #!/bin/bash DEV=/dev/sdc MNT=/mnt/sdc MOUNT_OPTIONS="-o ssd -o nodatacow" MKFS_OPTIONS="-R free-space-tree -O no-holes" NUM_JOBS=4 FILE_SIZE=8G RUN_TIME=300 cat <<EOF > /tmp/fio-job.ini [io_uring_rw] rw=randrw fsync=0 fallocate=posix group_reporting=1 direct=1 ioengine=io_uring iodepth=64 bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5 filesize=$FILE_SIZE runtime=$RUN_TIME time_based filename=foobar directory=$MNT numjobs=$NUM_JOBS thread EOF echo performance | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor umount $MNT &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null mount $MOUNT_OPTIONS $DEV $MNT fio /tmp/fio-job.ini umount $MNT The test was run on a 12-core box with 64G of RAM, using
a non-debug kernel config (Debian's default config) and a spinning disk. Result before the patchset: READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec Result after the patchset: READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec That's about +7.2% throughput for reads and +6.9% for writes. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
4f208dcc6b
commit
d4135134ab
@ -2893,7 +2893,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
|
||||
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
|
||||
|
||||
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
|
||||
u64 disk_num_bytes);
|
||||
u64 disk_num_bytes, bool noflush);
|
||||
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
|
||||
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
|
||||
u64 start, u64 end);
|
||||
|
@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
|
||||
}
|
||||
|
||||
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
|
||||
u64 disk_num_bytes)
|
||||
u64 disk_num_bytes, bool noflush)
|
||||
{
|
||||
struct btrfs_root *root = inode->root;
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
|
||||
* If we have a transaction open (can happen if we call truncate_block
|
||||
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
|
||||
*/
|
||||
if (btrfs_is_free_space_inode(inode)) {
|
||||
if (noflush || btrfs_is_free_space_inode(inode)) {
|
||||
flush = BTRFS_RESERVE_NO_FLUSH;
|
||||
} else {
|
||||
if (current->journal_info)
|
||||
@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
|
||||
*/
|
||||
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
|
||||
&meta_reserve, &qgroup_reserve);
|
||||
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
|
||||
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
|
||||
noflush);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
|
||||
@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
|
||||
ret = btrfs_check_data_free_space(inode, reserved, start, len);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, len, len);
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
|
||||
if (ret < 0) {
|
||||
btrfs_free_reserved_data_space(inode, *reserved, start, len);
|
||||
extent_changeset_free(*reserved);
|
||||
|
@ -1684,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
|
||||
WARN_ON(reserve_bytes == 0);
|
||||
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
|
||||
reserve_bytes,
|
||||
reserve_bytes);
|
||||
reserve_bytes, false);
|
||||
if (ret) {
|
||||
if (!only_release_metadata)
|
||||
btrfs_free_reserved_data_space(BTRFS_I(inode),
|
||||
|
@ -4705,7 +4705,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
|
||||
if (ret < 0) {
|
||||
if (!only_release_metadata)
|
||||
btrfs_free_reserved_data_space(inode, data_reserved,
|
||||
@ -7415,6 +7415,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
|
||||
u64 start, u64 len,
|
||||
unsigned int iomap_flags)
|
||||
{
|
||||
const bool nowait = (iomap_flags & IOMAP_NOWAIT);
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
struct extent_map *em = *map;
|
||||
int type;
|
||||
@ -7454,12 +7455,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
|
||||
struct extent_map *em2;
|
||||
|
||||
/* We can NOCOW, so only need to reserve metadata space. */
|
||||
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
|
||||
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
|
||||
nowait);
|
||||
if (ret < 0) {
|
||||
/* Our caller expects us to free the input extent map. */
|
||||
free_extent_map(em);
|
||||
*map = NULL;
|
||||
btrfs_dec_nocow_writers(fs_info, block_start);
|
||||
if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
|
||||
ret = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
space_reserved = true;
|
||||
@ -7483,7 +7487,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
|
||||
free_extent_map(em);
|
||||
*map = NULL;
|
||||
|
||||
if (iomap_flags & IOMAP_NOWAIT)
|
||||
if (nowait)
|
||||
return -EAGAIN;
|
||||
|
||||
/* We have to COW, so need to reserve metadata and data space. */
|
||||
@ -10801,7 +10805,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
|
||||
ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
|
||||
if (ret)
|
||||
goto out_free_data_space;
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
|
||||
ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
|
||||
false);
|
||||
if (ret)
|
||||
goto out_qgroup_free_data;
|
||||
|
||||
|
@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
||||
}
|
||||
|
||||
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
||||
enum btrfs_qgroup_rsv_type type, bool enforce)
|
||||
enum btrfs_qgroup_rsv_type type, bool enforce,
|
||||
bool noflush)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
|
||||
if (ret <= 0 && ret != -EDQUOT)
|
||||
if ((ret <= 0 && ret != -EDQUOT) || noflush)
|
||||
return ret;
|
||||
|
||||
ret = try_flush_qgroup(root);
|
||||
|
@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
|
||||
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
||||
enum btrfs_qgroup_rsv_type type, bool enforce);
|
||||
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
|
||||
enum btrfs_qgroup_rsv_type type, bool enforce);
|
||||
enum btrfs_qgroup_rsv_type type, bool enforce,
|
||||
bool noflush);
|
||||
/* Reserve metadata space for pertrans and prealloc type */
|
||||
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
|
||||
int num_bytes, bool enforce)
|
||||
{
|
||||
return __btrfs_qgroup_reserve_meta(root, num_bytes,
|
||||
BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
|
||||
BTRFS_QGROUP_RSV_META_PERTRANS,
|
||||
enforce, false);
|
||||
}
|
||||
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
|
||||
int num_bytes, bool enforce)
|
||||
int num_bytes, bool enforce,
|
||||
bool noflush)
|
||||
{
|
||||
return __btrfs_qgroup_reserve_meta(root, num_bytes,
|
||||
BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
|
||||
BTRFS_QGROUP_RSV_META_PREALLOC,
|
||||
enforce, noflush);
|
||||
}
|
||||
|
||||
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
|
||||
|
@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
|
||||
|
||||
/* Reserve metadata for this range */
|
||||
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
|
||||
clamped_len, clamped_len);
|
||||
clamped_len, clamped_len,
|
||||
false);
|
||||
if (ret)
|
||||
goto release_page;
|
||||
|
||||
|
@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
|
||||
/* One for parent inode, two for dir entries */
|
||||
qgroup_num_bytes = 3 * fs_info->nodesize;
|
||||
ret = btrfs_qgroup_reserve_meta_prealloc(root,
|
||||
qgroup_num_bytes, true);
|
||||
qgroup_num_bytes, true,
|
||||
false);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user