btrfs: dio iomap DSYNC workaround
iomap dio will run generic_write_sync() for us if the iocb is DSYNC. This is problematic for us for 2 reasons:

1. we hold the inode_lock() during this operation, and we take it in generic_write_sync()
2. we hold a read lock on the dio_sem but take the write lock in fsync

Since we don't want to rip out this code right now, and reworking the locking is a bit much to do at this point, work around this problem with this masterpiece of a patch.

First, we clear DSYNC on the iocb so that the iomap stuff doesn't know that it needs to handle the sync. We save this fact in current->journal_info, because we need to do special things once we're in iomap_begin, and we have no way to pass private information into iomap_dio_rw().

Next we specify a separate iomap_dio_ops for sync, which implements an ->end_io() callback that gets called when the dio completes. This is important for AIO, because we really do need to run generic_write_sync() if we complete asynchronously. However, if we're still in the submitting context when we enter ->end_io(), we clear the flag so that the submitter knows they're the one that needs to run generic_write_sync().

This is meant to be temporary. We need to work out how to eliminate the inode_lock() and the dio_sem in our fsync and use another mechanism to protect these operations.

Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
committed by
David Sterba
parent
f85781fb50
commit
0eb79294db
@ -2023,7 +2023,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
|
|||||||
atomic_inc(&BTRFS_I(inode)->sync_writers);
|
atomic_inc(&BTRFS_I(inode)->sync_writers);
|
||||||
|
|
||||||
if (iocb->ki_flags & IOCB_DIRECT) {
|
if (iocb->ki_flags & IOCB_DIRECT) {
|
||||||
|
/*
|
||||||
|
* 1. We must always clear IOCB_DSYNC in order to not deadlock
|
||||||
|
* in iomap, as it calls generic_write_sync() in this case.
|
||||||
|
* 2. If we are async, we can call iomap_dio_complete() either
|
||||||
|
* in
|
||||||
|
*
|
||||||
|
* 2.1. A worker thread from the last bio completed. In this
|
||||||
|
* case we need to mark the btrfs_dio_data that it is
|
||||||
|
* async in order to call generic_write_sync() properly.
|
||||||
|
* This is handled by setting BTRFS_DIO_SYNC_STUB in the
|
||||||
|
* current->journal_info.
|
||||||
|
* 2.2 The submitter context, because all IO completed
|
||||||
|
* before we exited iomap_dio_rw(). In this case we can
|
||||||
|
* just re-set the IOCB_DSYNC on the iocb and we'll do
|
||||||
|
* the sync below. If our ->end_io() gets called and
|
||||||
|
* current->journal_info is set, then we know we're in
|
||||||
|
* our current context and we will clear
|
||||||
|
* current->journal_info to indicate that we need to
|
||||||
|
* sync below.
|
||||||
|
*/
|
||||||
|
if (sync) {
|
||||||
|
ASSERT(current->journal_info == NULL);
|
||||||
|
iocb->ki_flags &= ~IOCB_DSYNC;
|
||||||
|
current->journal_info = BTRFS_DIO_SYNC_STUB;
|
||||||
|
}
|
||||||
num_written = __btrfs_direct_write(iocb, from);
|
num_written = __btrfs_direct_write(iocb, from);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As stated above, we cleared journal_info, so we need to do
|
||||||
|
* the sync ourselves.
|
||||||
|
*/
|
||||||
|
if (sync && current->journal_info == NULL)
|
||||||
|
iocb->ki_flags |= IOCB_DSYNC;
|
||||||
|
current->journal_info = NULL;
|
||||||
} else {
|
} else {
|
||||||
num_written = btrfs_buffered_write(iocb, from);
|
num_written = btrfs_buffered_write(iocb, from);
|
||||||
if (num_written > 0)
|
if (num_written > 0)
|
||||||
|
@ -62,6 +62,7 @@ struct btrfs_dio_data {
|
|||||||
loff_t length;
|
loff_t length;
|
||||||
ssize_t submitted;
|
ssize_t submitted;
|
||||||
struct extent_changeset *data_reserved;
|
struct extent_changeset *data_reserved;
|
||||||
|
bool sync;
|
||||||
};
|
};
|
||||||
|
|
||||||
static const struct inode_operations btrfs_dir_inode_operations;
|
static const struct inode_operations btrfs_dir_inode_operations;
|
||||||
@ -7337,6 +7338,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
|
|||||||
int ret = 0;
|
int ret = 0;
|
||||||
u64 len = length;
|
u64 len = length;
|
||||||
bool unlock_extents = false;
|
bool unlock_extents = false;
|
||||||
|
bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We used current->journal_info here to see if we were sync, but
|
||||||
|
* there are a lot of tests in the enospc machinery to not do flushing if
|
||||||
|
* we have a journal_info set, so we need to clear this out and re-set
|
||||||
|
* it in iomap_end.
|
||||||
|
*/
|
||||||
|
ASSERT(current->journal_info == NULL ||
|
||||||
|
current->journal_info == BTRFS_DIO_SYNC_STUB);
|
||||||
|
current->journal_info = NULL;
|
||||||
|
|
||||||
if (!write)
|
if (!write)
|
||||||
len = min_t(u64, len, fs_info->sectorsize);
|
len = min_t(u64, len, fs_info->sectorsize);
|
||||||
@ -7362,6 +7374,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
|
|||||||
if (!dio_data)
|
if (!dio_data)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
|
dio_data->sync = sync;
|
||||||
dio_data->length = length;
|
dio_data->length = length;
|
||||||
if (write) {
|
if (write) {
|
||||||
dio_data->reserve = round_up(length, fs_info->sectorsize);
|
dio_data->reserve = round_up(length, fs_info->sectorsize);
|
||||||
@ -7509,6 +7522,14 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
|
|||||||
extent_changeset_free(dio_data->data_reserved);
|
extent_changeset_free(dio_data->data_reserved);
|
||||||
}
|
}
|
||||||
out:
|
out:
|
||||||
|
/*
|
||||||
|
* We're all done, we can re-set the current->journal_info now safely
|
||||||
|
* for our endio.
|
||||||
|
*/
|
||||||
|
if (dio_data->sync) {
|
||||||
|
ASSERT(current->journal_info == NULL);
|
||||||
|
current->journal_info = BTRFS_DIO_SYNC_STUB;
|
||||||
|
}
|
||||||
kfree(dio_data);
|
kfree(dio_data);
|
||||||
iomap->private = NULL;
|
iomap->private = NULL;
|
||||||
|
|
||||||
@ -7917,6 +7938,30 @@ out:
|
|||||||
return retval;
|
return retval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * ->end_io() callback used for direct IO writes whose IOCB_DSYNC flag was
 * stripped before submission.
 *
 * @iocb:  the kiocb of the write that just completed
 * @size:  number of bytes reported as completed
 * @error: completion status handed in by the caller
 * @flags: dio flags handed in by the caller (not used here)
 *
 * Returns @error when the sync is left to the submitter or when the IO
 * failed, otherwise the result of generic_write_sync() (or 0 when @size
 * is 0).
 */
static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
					   int error, unsigned flags)
{
	const bool in_submitter =
		(current->journal_info == BTRFS_DIO_SYNC_STUB);

	/*
	 * The stub still being set means we are running in the submitter's
	 * own context, where generic_write_sync() cannot be run safely.
	 * Drop the stub so the submitter sees NULL and knows it has to
	 * follow up with the sync itself.
	 */
	if (in_submitter) {
		current->journal_info = NULL;
		return error;
	}

	/* Failed IO, or nothing written: there is nothing to sync. */
	if (error || !size)
		return error;

	/* Completed outside the submitter: restore DSYNC and sync here. */
	iocb->ki_flags |= IOCB_DSYNC;
	return generic_write_sync(iocb, size);
}
|
||||||
|
|
||||||
static const struct iomap_ops btrfs_dio_iomap_ops = {
|
static const struct iomap_ops btrfs_dio_iomap_ops = {
|
||||||
.iomap_begin = btrfs_dio_iomap_begin,
|
.iomap_begin = btrfs_dio_iomap_begin,
|
||||||
.iomap_end = btrfs_dio_iomap_end,
|
.iomap_end = btrfs_dio_iomap_end,
|
||||||
@ -7926,6 +7971,11 @@ static const struct iomap_dio_ops btrfs_dio_ops = {
|
|||||||
.submit_io = btrfs_submit_direct,
|
.submit_io = btrfs_submit_direct,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static const struct iomap_dio_ops btrfs_sync_dops = {
|
||||||
|
.submit_io = btrfs_submit_direct,
|
||||||
|
.end_io = btrfs_maybe_fsync_end_io,
|
||||||
|
};
|
||||||
|
|
||||||
ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
||||||
{
|
{
|
||||||
struct file *file = iocb->ki_filp;
|
struct file *file = iocb->ki_filp;
|
||||||
@ -7954,8 +8004,16 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
|||||||
down_read(&BTRFS_I(inode)->dio_sem);
|
down_read(&BTRFS_I(inode)->dio_sem);
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
/*
|
||||||
is_sync_kiocb(iocb));
|
* We are actually a sync iocb, so we need our fancy endio to know
|
||||||
|
* if we need to sync.
|
||||||
|
*/
|
||||||
|
if (current->journal_info)
|
||||||
|
ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
|
||||||
|
&btrfs_sync_dops, is_sync_kiocb(iocb));
|
||||||
|
else
|
||||||
|
ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
|
||||||
|
&btrfs_dio_ops, is_sync_kiocb(iocb));
|
||||||
|
|
||||||
if (ret == -ENOTBLK)
|
if (ret == -ENOTBLK)
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
@ -112,6 +112,7 @@ struct btrfs_transaction {
|
|||||||
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
|
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
|
||||||
|
|
||||||
#define BTRFS_SEND_TRANS_STUB ((void *)1)
|
#define BTRFS_SEND_TRANS_STUB ((void *)1)
|
||||||
|
/*
 * Marker value stored in current->journal_info to record that the direct IO
 * write being submitted had IOCB_DSYNC set (and cleared) by the write path.
 */
#define BTRFS_DIO_SYNC_STUB ((void *)2)
|
||||||
|
|
||||||
struct btrfs_trans_handle {
|
struct btrfs_trans_handle {
|
||||||
u64 transid;
|
u64 transid;
|
||||||
|
Reference in New Issue
Block a user