for-6.10-rc2-tag
-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmZggXMACgkQxWXV+ddt WDupkA/9Foo2OsWR6wIQyBqzmHnhgzBwJ67q0F6MO2/iFfMRW/YIJH3Fk+0+PP40 BDK4xiz1DIl/qJvoSv4bpPNvy/lAovtVB/AV8rH+JaJNHP/fTjkqA3Ad6ZtZN45J KoHE4SoX4NT1v+zwJ2irrH1W2mPh8tNTYvZINPcLC/nX2UzYoNjiIFLRCMSe003M ybNjvv6VUHPk+9JAWsVt5pjDLu5E1EmXakXv5mvGaIVr0ljNUPCwhFip20YMpVfo 17t6MezmeqwGbrJgMpJyPOSsghaA68lzuzVVyAFFoxqlGLZ5rgtXTmK4O4NsyZfr EMkwNR1IDt7fVXUkHy4X/8f9V8Wwmmwp8bSY4rTTgA4hg3w0w4FCX+uNOWHagkaS 8vWWTJBSvJKJwLUfWhKVHIaiUEkFEhmnUQPjqlfSxc+mQgxJcK1djgdVkVxSudrp l0xdDG0WTWiO0zniIXbIlZ7tCeUgL1kcovZmDIA6em+HSipryvSFdYT+h7VKgzzv XTJvdXKMSiqMvXoT2BRYkmWVeuUBhJ1EptkGidZBgTZ7EFfuGnhBCRgq9YSaWnak 2SBvgjxKQzyxVpqWllOsksRg2/fSl9vdlGK3KjyGW1pAwrZD/zbmG/ZqH2MVOfjt LdswuwKd25pYpamYZqrCyJtIZlTSUrWpasaX1P28gs0uRCuFaiY= =q3Ic -----END PGP SIGNATURE----- Merge tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fix from David Sterba: "A fix for fast fsync that needs to handle errors during writes after some COW failure so it does not lead to an inconsistent state" * tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: ensure fast fsync waits for ordered extents after a write failure
This commit is contained in:
commit
19ca0d8a43
@@ -89,6 +89,16 @@ enum {
|
||||
BTRFS_INODE_FREE_SPACE_INODE,
|
||||
/* Set when there are no capabilities in xattrs for the inode. */
|
||||
BTRFS_INODE_NO_CAP_XATTR,
|
||||
/*
|
||||
* Set if an error happened when doing a COW write before submitting a
|
||||
* bio or during writeback. Used for both buffered writes and direct IO
|
||||
* writes. This is to signal a fast fsync that it has to wait for
|
||||
* ordered extents to complete and therefore not log extent maps that
|
||||
* point to unwritten extents (when an ordered extent completes and it
|
||||
* has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
|
||||
* range).
|
||||
*/
|
||||
BTRFS_INODE_COW_WRITE_ERROR,
|
||||
};
|
||||
|
||||
/* in memory btrfs inode */
|
||||
|
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
*/
|
||||
if (full_sync || btrfs_is_zoned(fs_info)) {
|
||||
ret = btrfs_wait_ordered_range(inode, start, len);
|
||||
clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
|
||||
} else {
|
||||
/*
|
||||
* Get our ordered extents as soon as possible to avoid doing
|
||||
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
|
||||
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
|
||||
&ctx.ordered_extents);
|
||||
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
|
||||
if (ret)
|
||||
goto out_release_extents;
|
||||
|
||||
/*
|
||||
* Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
|
||||
* starting and waiting for writeback, because for buffered IO
|
||||
* it may have been set during the end IO callback
|
||||
* (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
|
||||
* case an error happened and we need to wait for ordered
|
||||
* extents to complete so that any extent maps that point to
|
||||
* unwritten locations are dropped and we don't log them.
|
||||
*/
|
||||
if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
|
||||
&BTRFS_I(inode)->runtime_flags))
|
||||
ret = btrfs_wait_ordered_range(inode, start, len);
|
||||
}
|
||||
|
||||
if (ret)
|
||||
|
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
|
||||
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
|
||||
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
|
||||
|
||||
/*
|
||||
* If this is a COW write it means we created new extent maps for the
|
||||
* range and they point to unwritten locations if we got an error either
|
||||
* before submitting a bio or during IO.
|
||||
*
|
||||
* We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
|
||||
* are queuing its completion below. During completion, at
|
||||
* btrfs_finish_one_ordered(), we will drop the extent maps for the
|
||||
* unwritten extents.
|
||||
*
|
||||
* However because completion runs in a work queue we can end up having
|
||||
* a fast fsync running before that. In the case of direct IO, once we
|
||||
* unlock the inode the fsync might start, and we queue the completion
|
||||
* before unlocking the inode. In the case of buffered IO when writeback
|
||||
* finishes (end_bbio_data_write()) we queue the completion, so if the
|
||||
* writeback was triggered by a fast fsync, the fsync might start
|
||||
* logging before ordered extent completion runs in the work queue.
|
||||
*
|
||||
* The fast fsync will log file extent items based on the extent maps it
|
||||
* finds, so if by the time it collects extent maps the ordered extent
|
||||
* completion didn't happen yet, it will log file extent items that
|
||||
* point to unwritten extents, resulting in a corruption if a crash
|
||||
* happens and the log tree is replayed. Note that a fast fsync does not
|
||||
* wait for completion of ordered extents in order to reduce latency.
|
||||
*
|
||||
* Set a flag in the inode so that the next fast fsync will wait for
|
||||
* ordered extents to complete before starting to log.
|
||||
*/
|
||||
if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
|
||||
set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
|
||||
|
||||
if (ret)
|
||||
btrfs_queue_ordered_fn(ordered);
|
||||
return ret;
|
||||
|
Loading…
Reference in New Issue
Block a user