xfs: ensure truncate forces zeroed blocks to disk

A new fsync vs power fail test in xfstests indicated that XFS can
have unreliable data consistency when doing extending truncates that
require block zeroing. The blocks beyond EOF get zeroed in memory,
but we never force those changes to disk before we run the
transaction that extends the file size and exposes those blocks to
userspace. This can result in the blocks not being correctly zeroed
after a crash.

Because in-memory behaviour is correct, tools like fsx don't pick up
any coherency problems - it's not until the filesystem is shutdown
or the system crashes after writing the truncate transaction to the
journal but before the zeroed data in the page cache is flushed that
the issue is exposed.

Fix this by also flushing the dirty data in memory region between
the old size and new size when we've found blocks that need zeroing
in the truncate process.

Reported-by: Liu Bo <bo.li.liu@oracle.com>
cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
Dave Chinner 2015-02-23 22:37:08 +11:00 committed by Dave Chinner
parent dfcc70a8c8
commit 5885ebda87
3 changed files with 29 additions and 30 deletions

View File

@ -397,7 +397,8 @@ STATIC int /* error (positive) */
xfs_zero_last_block( xfs_zero_last_block(
struct xfs_inode *ip, struct xfs_inode *ip,
xfs_fsize_t offset, xfs_fsize_t offset,
xfs_fsize_t isize) xfs_fsize_t isize,
bool *did_zeroing)
{ {
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
@ -425,6 +426,7 @@ xfs_zero_last_block(
zero_len = mp->m_sb.sb_blocksize - zero_offset; zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset) if (isize + zero_len > offset)
zero_len = offset - isize; zero_len = offset - isize;
*did_zeroing = true;
return xfs_iozero(ip, isize, zero_len); return xfs_iozero(ip, isize, zero_len);
} }
@ -443,7 +445,8 @@ int /* error (positive) */
xfs_zero_eof( xfs_zero_eof(
struct xfs_inode *ip, struct xfs_inode *ip,
xfs_off_t offset, /* starting I/O offset */ xfs_off_t offset, /* starting I/O offset */
xfs_fsize_t isize) /* current inode size */ xfs_fsize_t isize, /* current inode size */
bool *did_zeroing)
{ {
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_zero_fsb; xfs_fileoff_t start_zero_fsb;
@ -465,7 +468,7 @@ xfs_zero_eof(
* We only zero a part of that block so it is handled specially. * We only zero a part of that block so it is handled specially.
*/ */
if (XFS_B_FSB_OFFSET(mp, isize) != 0) { if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
error = xfs_zero_last_block(ip, offset, isize); error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
if (error) if (error)
return error; return error;
} }
@ -525,6 +528,7 @@ xfs_zero_eof(
if (error) if (error)
return error; return error;
*did_zeroing = true;
start_zero_fsb = imap.br_startoff + imap.br_blockcount; start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
} }
@ -567,13 +571,15 @@ restart:
* having to redo all checks before. * having to redo all checks before.
*/ */
if (*pos > i_size_read(inode)) { if (*pos > i_size_read(inode)) {
bool zero = false;
if (*iolock == XFS_IOLOCK_SHARED) { if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, *iolock); xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL; *iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, *iolock); xfs_rw_ilock(ip, *iolock);
goto restart; goto restart;
} }
error = xfs_zero_eof(ip, *pos, i_size_read(inode)); error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
if (error) if (error)
return error; return error;
} }

View File

@ -384,10 +384,11 @@ enum xfs_prealloc_flags {
XFS_PREALLOC_INVISIBLE = (1 << 4), XFS_PREALLOC_INVISIBLE = (1 << 4),
}; };
int xfs_update_prealloc_flags(struct xfs_inode *, int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags); enum xfs_prealloc_flags flags);
int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
int xfs_iozero(struct xfs_inode *, loff_t, size_t); xfs_fsize_t isize, bool *did_zeroing);
int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
#define IHOLD(ip) \ #define IHOLD(ip) \

View File

@ -751,6 +751,7 @@ xfs_setattr_size(
int error; int error;
uint lock_flags = 0; uint lock_flags = 0;
uint commit_flags = 0; uint commit_flags = 0;
bool did_zeroing = false;
trace_xfs_setattr(ip); trace_xfs_setattr(ip);
@ -794,20 +795,16 @@ xfs_setattr_size(
return error; return error;
/* /*
* Now we can make the changes. Before we join the inode to the * File data changes must be complete before we start the transaction to
* transaction, take care of the part of the truncation that must be * modify the inode. This needs to be done before joining the inode to
* done without the inode lock. This needs to be done before joining * the transaction because the inode cannot be unlocked once it is a
* the inode to the transaction, because the inode cannot be unlocked * part of the transaction.
* once it is a part of the transaction. *
* Start with zeroing any data block beyond EOF that we may expose on
* file extension.
*/ */
if (newsize > oldsize) { if (newsize > oldsize) {
/* error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
* Do the first part of growing a file: zero any data in the
* last block that is beyond the old EOF. We need to do this
* before the inode is joined to the transaction to modify
* i_size.
*/
error = xfs_zero_eof(ip, newsize, oldsize);
if (error) if (error)
return error; return error;
} }
@ -817,23 +814,18 @@ xfs_setattr_size(
* any previous writes that are beyond the on disk EOF and the new * any previous writes that are beyond the on disk EOF and the new
* EOF that have not been written out need to be written here. If we * EOF that have not been written out need to be written here. If we
* do not write the data out, we expose ourselves to the null files * do not write the data out, we expose ourselves to the null files
* problem. * problem. Note that this includes any block zeroing we did above;
* * otherwise those blocks may not be zeroed after a crash.
* Only flush from the on disk size to the smaller of the in memory
* file size or the new size as that's the range we really care about
* here and prevents waiting for other data not within the range we
* care about here.
*/ */
if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { if (newsize > ip->i_d.di_size &&
(oldsize != ip->i_d.di_size || did_zeroing)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize); ip->i_d.di_size, newsize);
if (error) if (error)
return error; return error;
} }
/* /* Now wait for all direct I/O to complete. */
* Wait for all direct I/O to complete.
*/
inode_dio_wait(inode); inode_dio_wait(inode);
/* /*