2018-06-06 05:42:14 +03:00
// SPDX-License-Identifier: GPL-2.0
2005-04-17 02:20:36 +04:00
/*
2005-11-02 06:58:39 +03:00
* Copyright ( c ) 2000 - 2001 , 2005 Silicon Graphics , Inc .
* All Rights Reserved .
2005-04-17 02:20:36 +04:00
*/
# include "xfs.h"
2019-06-29 05:25:35 +03:00
# include "xfs_shared.h"
2013-10-23 03:51:50 +04:00
# include "xfs_format.h"
2005-11-02 06:38:42 +03:00
# include "xfs_fs.h"
2013-10-23 03:50:10 +04:00
# include "xfs_log_format.h"
# include "xfs_trans_resv.h"
2005-04-17 02:20:36 +04:00
# include "xfs_mount.h"
2017-10-31 22:04:49 +03:00
# include "xfs_errortag.h"
2005-04-17 02:20:36 +04:00
# include "xfs_error.h"
2017-06-21 03:54:47 +03:00
# include "xfs_sysfs.h"
2018-01-23 05:09:48 +03:00
# include "xfs_inode.h"
2005-04-17 02:20:36 +04:00
# ifdef DEBUG
2017-06-21 03:54:46 +03:00
static unsigned int xfs_errortag_random_default [ ] = {
XFS_RANDOM_DEFAULT ,
XFS_RANDOM_IFLUSH_1 ,
XFS_RANDOM_IFLUSH_2 ,
XFS_RANDOM_IFLUSH_3 ,
XFS_RANDOM_IFLUSH_4 ,
XFS_RANDOM_IFLUSH_5 ,
XFS_RANDOM_IFLUSH_6 ,
XFS_RANDOM_DA_READ_BUF ,
XFS_RANDOM_BTREE_CHECK_LBLOCK ,
XFS_RANDOM_BTREE_CHECK_SBLOCK ,
XFS_RANDOM_ALLOC_READ_AGF ,
XFS_RANDOM_IALLOC_READ_AGI ,
XFS_RANDOM_ITOBP_INOTOBP ,
XFS_RANDOM_IUNLINK ,
XFS_RANDOM_IUNLINK_REMOVE ,
XFS_RANDOM_DIR_INO_VALIDATE ,
XFS_RANDOM_BULKSTAT_READ_CHUNK ,
XFS_RANDOM_IODONE_IOERR ,
XFS_RANDOM_STRATREAD_IOERR ,
XFS_RANDOM_STRATCMPL_IOERR ,
XFS_RANDOM_DIOWRITE_IOERR ,
XFS_RANDOM_BMAPIFORMAT ,
XFS_RANDOM_FREE_EXTENT ,
XFS_RANDOM_RMAP_FINISH_ONE ,
XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE ,
XFS_RANDOM_REFCOUNT_FINISH_ONE ,
XFS_RANDOM_BMAP_FINISH_ONE ,
XFS_RANDOM_AG_RESV_CRITICAL ,
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidates
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t to this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
0 , /* XFS_RANDOM_DROP_WRITES has been removed */
2017-06-27 19:52:32 +03:00
XFS_RANDOM_LOG_BAD_CRC ,
2017-08-09 04:21:52 +03:00
XFS_RANDOM_LOG_ITEM_PIN ,
2017-10-18 00:16:29 +03:00
XFS_RANDOM_BUF_LRU_REF ,
2018-05-14 16:34:36 +03:00
XFS_RANDOM_FORCE_SCRUB_REPAIR ,
2018-07-20 19:28:40 +03:00
XFS_RANDOM_FORCE_SUMMARY_RECALC ,
2019-02-07 21:37:16 +03:00
XFS_RANDOM_IUNLINK_FALLBACK ,
2020-05-06 23:29:19 +03:00
XFS_RANDOM_BUF_IOERROR ,
2021-01-23 03:48:15 +03:00
XFS_RANDOM_REDUCE_MAX_IEXTENTS ,
2021-01-23 03:48:17 +03:00
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT ,
2021-03-24 05:05:39 +03:00
XFS_RANDOM_AG_RESV_FAIL ,
2022-05-11 10:01:22 +03:00
XFS_RANDOM_LARP ,
2022-05-11 10:01:23 +03:00
XFS_RANDOM_DA_LEAF_SPLIT ,
2022-05-11 10:01:23 +03:00
XFS_RANDOM_ATTR_LEAF_TO_NODE ,
2022-11-29 04:24:35 +03:00
XFS_RANDOM_WB_DELAY_MS ,
2022-11-29 04:24:36 +03:00
XFS_RANDOM_WRITE_DELAY_MS ,
2024-04-16 00:54:19 +03:00
XFS_RANDOM_EXCHMAPS_FINISH_ONE ,
2017-06-21 03:54:46 +03:00
} ;
2005-04-17 02:20:36 +04:00
2017-06-21 03:54:47 +03:00
/*
 * A sysfs attribute paired with the error tag it controls.  The show/store
 * handlers recover this wrapper from the embedded attribute via to_attr().
 */
struct xfs_errortag_attr {
struct attribute attr ; /* embedded sysfs attribute */
unsigned int tag ; /* error tag handed to xfs_errortag_get/set */
} ;
/* Map an embedded struct attribute back to its xfs_errortag_attr wrapper. */
static inline struct xfs_errortag_attr *
to_attr(
	struct attribute	*attr)
{
	return container_of(attr, struct xfs_errortag_attr, attr);
}
/*
 * Resolve the xfs_mount that owns an errortag kobject: convert the kobject
 * with to_kobj(), then step out to the mount embedding it as
 * m_errortag_kobj.
 */
static inline struct xfs_mount *
to_mp(
	struct kobject		*kobject)
{
	return container_of(to_kobj(kobject), struct xfs_mount,
			    m_errortag_kobj);
}
/*
 * sysfs ->store handler for an errortag attribute.
 *
 * The written text is either the default keyword, which selects the tag's
 * entry in xfs_errortag_random_default[], or an unsigned integer in any base
 * understood by kstrtouint() with base 0.
 *
 * Returns @count on success, the kstrtouint() error for unparsable input, or
 * the xfs_errortag_set() error if the tag is rejected.
 */
STATIC ssize_t
xfs_errortag_attr_store(
	struct kobject		*kobject,
	struct attribute	*attr,
	const char		*buf,
	size_t			count)
{
	struct xfs_mount	*mp = to_mp(kobject);
	struct xfs_errortag_attr *xfs_attr = to_attr(attr);
	int			ret;
	unsigned int		val;

	/*
	 * sysfs_streq() tolerates a single trailing newline, so writing the
	 * keyword with a plain "echo" works just like "echo -n" does; a bare
	 * strcmp() would send the newline-terminated keyword to kstrtouint()
	 * and fail with -EINVAL.
	 */
	if (sysfs_streq(buf, " default ")) {
		val = xfs_errortag_random_default[xfs_attr->tag];
	} else {
		ret = kstrtouint(buf, 0, &val);
		if (ret)
			return ret;
	}

	ret = xfs_errortag_set(mp, xfs_attr->tag, val);
	if (ret)
		return ret;

	/* Report the whole write as consumed. */
	return count;
}
/*
 * sysfs ->show handler for an errortag attribute: print the value currently
 * returned by xfs_errortag_get() for the attribute's tag.
 *
 * sysfs_emit() is used instead of an open-coded snprintf(buf, PAGE_SIZE, ...)
 * because it is the helper intended for show callbacks and bounds the output
 * to the sysfs buffer itself.
 *
 * Returns the number of bytes placed in @buf.
 */
STATIC ssize_t
xfs_errortag_attr_show(
	struct kobject		*kobject,
	struct attribute	*attr,
	char			*buf)
{
	struct xfs_mount	*mp = to_mp(kobject);
	struct xfs_errortag_attr *xfs_attr = to_attr(attr);

	return sysfs_emit(buf, " %u \n ",
			xfs_errortag_get(mp, xfs_attr->tag));
}
/* Route reads and writes of errortag attributes to the handlers above. */
static const struct sysfs_ops xfs_errortag_sysfs_ops = {
	.store	= xfs_errortag_attr_store,
	.show	= xfs_errortag_attr_show,
};
/*
 * Define a static struct xfs_errortag_attr called xfs_errortag_attr_<_name>.
 * The sysfs file name is the stringified _name, the mode is owner-writable
 * and readable by everyone (S_IWUSR | S_IRUGO), and _tag is the error tag
 * the file controls.
 */
# define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
static struct xfs_errortag_attr xfs_errortag_attr_ # # _name = { \
. attr = { . name = __stringify ( _name ) , \
. mode = VERIFY_OCTAL_PERMISSIONS ( S_IWUSR | S_IRUGO ) } , \
. tag = ( _tag ) , \
}
/*
 * Address of the struct attribute embedded in xfs_errortag_attr_<_name>, for
 * use as an element of a struct attribute pointer array.
 */
# define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
/*
 * One attribute object per injectable error tag.  The first argument becomes
 * the sysfs file name, the second is the tag passed to xfs_errortag_get() and
 * xfs_errortag_set().  No attribute is defined for XFS_ERRTAG_DROP_WRITES,
 * which xfs_errortag_valid() rejects as removed.
 */
XFS_ERRORTAG_ATTR_RW ( noerror , XFS_ERRTAG_NOERROR ) ;
XFS_ERRORTAG_ATTR_RW ( iflush1 , XFS_ERRTAG_IFLUSH_1 ) ;
XFS_ERRORTAG_ATTR_RW ( iflush2 , XFS_ERRTAG_IFLUSH_2 ) ;
XFS_ERRORTAG_ATTR_RW ( iflush3 , XFS_ERRTAG_IFLUSH_3 ) ;
XFS_ERRORTAG_ATTR_RW ( iflush4 , XFS_ERRTAG_IFLUSH_4 ) ;
XFS_ERRORTAG_ATTR_RW ( iflush5 , XFS_ERRTAG_IFLUSH_5 ) ;
XFS_ERRORTAG_ATTR_RW ( iflush6 , XFS_ERRTAG_IFLUSH_6 ) ;
XFS_ERRORTAG_ATTR_RW ( dareadbuf , XFS_ERRTAG_DA_READ_BUF ) ;
XFS_ERRORTAG_ATTR_RW ( btree_chk_lblk , XFS_ERRTAG_BTREE_CHECK_LBLOCK ) ;
XFS_ERRORTAG_ATTR_RW ( btree_chk_sblk , XFS_ERRTAG_BTREE_CHECK_SBLOCK ) ;
XFS_ERRORTAG_ATTR_RW ( readagf , XFS_ERRTAG_ALLOC_READ_AGF ) ;
XFS_ERRORTAG_ATTR_RW ( readagi , XFS_ERRTAG_IALLOC_READ_AGI ) ;
XFS_ERRORTAG_ATTR_RW ( itobp , XFS_ERRTAG_ITOBP_INOTOBP ) ;
XFS_ERRORTAG_ATTR_RW ( iunlink , XFS_ERRTAG_IUNLINK ) ;
XFS_ERRORTAG_ATTR_RW ( iunlinkrm , XFS_ERRTAG_IUNLINK_REMOVE ) ;
XFS_ERRORTAG_ATTR_RW ( dirinovalid , XFS_ERRTAG_DIR_INO_VALIDATE ) ;
XFS_ERRORTAG_ATTR_RW ( bulkstat , XFS_ERRTAG_BULKSTAT_READ_CHUNK ) ;
XFS_ERRORTAG_ATTR_RW ( logiodone , XFS_ERRTAG_IODONE_IOERR ) ;
XFS_ERRORTAG_ATTR_RW ( stratread , XFS_ERRTAG_STRATREAD_IOERR ) ;
XFS_ERRORTAG_ATTR_RW ( stratcmpl , XFS_ERRTAG_STRATCMPL_IOERR ) ;
XFS_ERRORTAG_ATTR_RW ( diowrite , XFS_ERRTAG_DIOWRITE_IOERR ) ;
XFS_ERRORTAG_ATTR_RW ( bmapifmt , XFS_ERRTAG_BMAPIFORMAT ) ;
XFS_ERRORTAG_ATTR_RW ( free_extent , XFS_ERRTAG_FREE_EXTENT ) ;
XFS_ERRORTAG_ATTR_RW ( rmap_finish_one , XFS_ERRTAG_RMAP_FINISH_ONE ) ;
XFS_ERRORTAG_ATTR_RW ( refcount_continue_update , XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE ) ;
XFS_ERRORTAG_ATTR_RW ( refcount_finish_one , XFS_ERRTAG_REFCOUNT_FINISH_ONE ) ;
XFS_ERRORTAG_ATTR_RW ( bmap_finish_one , XFS_ERRTAG_BMAP_FINISH_ONE ) ;
XFS_ERRORTAG_ATTR_RW ( ag_resv_critical , XFS_ERRTAG_AG_RESV_CRITICAL ) ;
2017-06-27 19:52:32 +03:00
XFS_ERRORTAG_ATTR_RW ( log_bad_crc , XFS_ERRTAG_LOG_BAD_CRC ) ;
2017-08-09 04:21:52 +03:00
XFS_ERRORTAG_ATTR_RW ( log_item_pin , XFS_ERRTAG_LOG_ITEM_PIN ) ;
2017-10-18 00:16:29 +03:00
XFS_ERRORTAG_ATTR_RW ( buf_lru_ref , XFS_ERRTAG_BUF_LRU_REF ) ;
2018-05-14 16:34:36 +03:00
XFS_ERRORTAG_ATTR_RW ( force_repair , XFS_ERRTAG_FORCE_SCRUB_REPAIR ) ;
2018-07-20 19:28:40 +03:00
XFS_ERRORTAG_ATTR_RW ( bad_summary , XFS_ERRTAG_FORCE_SUMMARY_RECALC ) ;
2019-02-07 21:37:16 +03:00
XFS_ERRORTAG_ATTR_RW ( iunlink_fallback , XFS_ERRTAG_IUNLINK_FALLBACK ) ;
2020-05-06 23:29:19 +03:00
XFS_ERRORTAG_ATTR_RW ( buf_ioerror , XFS_ERRTAG_BUF_IOERROR ) ;
2021-01-23 03:48:15 +03:00
XFS_ERRORTAG_ATTR_RW ( reduce_max_iextents , XFS_ERRTAG_REDUCE_MAX_IEXTENTS ) ;
2021-01-23 03:48:17 +03:00
XFS_ERRORTAG_ATTR_RW ( bmap_alloc_minlen_extent , XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT ) ;
2021-03-24 05:05:39 +03:00
XFS_ERRORTAG_ATTR_RW ( ag_resv_fail , XFS_ERRTAG_AG_RESV_FAIL ) ;
2022-05-11 10:01:22 +03:00
XFS_ERRORTAG_ATTR_RW ( larp , XFS_ERRTAG_LARP ) ;
2022-05-11 10:01:23 +03:00
XFS_ERRORTAG_ATTR_RW ( da_leaf_split , XFS_ERRTAG_DA_LEAF_SPLIT ) ;
2022-05-11 10:01:23 +03:00
XFS_ERRORTAG_ATTR_RW ( attr_leaf_to_node , XFS_ERRTAG_ATTR_LEAF_TO_NODE ) ;
2022-11-29 04:24:35 +03:00
XFS_ERRORTAG_ATTR_RW ( wb_delay_ms , XFS_ERRTAG_WB_DELAY_MS ) ;
2022-11-29 04:24:36 +03:00
XFS_ERRORTAG_ATTR_RW ( write_delay_ms , XFS_ERRTAG_WRITE_DELAY_MS ) ;
2024-04-16 00:54:19 +03:00
XFS_ERRORTAG_ATTR_RW ( exchmaps_finish_one , XFS_ERRTAG_EXCHMAPS_FINISH_ONE ) ;
2017-06-21 03:54:47 +03:00
/*
 * NULL-terminated list of every errortag attribute defined in this file.
 * An attribute only becomes visible in sysfs if it is listed here, so each
 * XFS_ERRORTAG_ATTR_RW() definition needs a matching entry.
 */
static struct attribute * xfs_errortag_attrs [ ] = {
XFS_ERRORTAG_ATTR_LIST ( noerror ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush1 ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush2 ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush3 ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush4 ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush5 ) ,
XFS_ERRORTAG_ATTR_LIST ( iflush6 ) ,
XFS_ERRORTAG_ATTR_LIST ( dareadbuf ) ,
XFS_ERRORTAG_ATTR_LIST ( btree_chk_lblk ) ,
XFS_ERRORTAG_ATTR_LIST ( btree_chk_sblk ) ,
XFS_ERRORTAG_ATTR_LIST ( readagf ) ,
XFS_ERRORTAG_ATTR_LIST ( readagi ) ,
XFS_ERRORTAG_ATTR_LIST ( itobp ) ,
XFS_ERRORTAG_ATTR_LIST ( iunlink ) ,
XFS_ERRORTAG_ATTR_LIST ( iunlinkrm ) ,
XFS_ERRORTAG_ATTR_LIST ( dirinovalid ) ,
XFS_ERRORTAG_ATTR_LIST ( bulkstat ) ,
XFS_ERRORTAG_ATTR_LIST ( logiodone ) ,
XFS_ERRORTAG_ATTR_LIST ( stratread ) ,
XFS_ERRORTAG_ATTR_LIST ( stratcmpl ) ,
XFS_ERRORTAG_ATTR_LIST ( diowrite ) ,
XFS_ERRORTAG_ATTR_LIST ( bmapifmt ) ,
XFS_ERRORTAG_ATTR_LIST ( free_extent ) ,
XFS_ERRORTAG_ATTR_LIST ( rmap_finish_one ) ,
XFS_ERRORTAG_ATTR_LIST ( refcount_continue_update ) ,
XFS_ERRORTAG_ATTR_LIST ( refcount_finish_one ) ,
XFS_ERRORTAG_ATTR_LIST ( bmap_finish_one ) ,
XFS_ERRORTAG_ATTR_LIST ( ag_resv_critical ) ,
2017-06-27 19:52:32 +03:00
XFS_ERRORTAG_ATTR_LIST ( log_bad_crc ) ,
2017-08-09 04:21:52 +03:00
XFS_ERRORTAG_ATTR_LIST ( log_item_pin ) ,
2017-10-18 00:16:29 +03:00
XFS_ERRORTAG_ATTR_LIST ( buf_lru_ref ) ,
2018-05-14 16:34:36 +03:00
XFS_ERRORTAG_ATTR_LIST ( force_repair ) ,
2018-07-20 19:28:40 +03:00
XFS_ERRORTAG_ATTR_LIST ( bad_summary ) ,
2019-02-07 21:37:16 +03:00
XFS_ERRORTAG_ATTR_LIST ( iunlink_fallback ) ,
2020-05-06 23:29:19 +03:00
XFS_ERRORTAG_ATTR_LIST ( buf_ioerror ) ,
2021-01-23 03:48:15 +03:00
XFS_ERRORTAG_ATTR_LIST ( reduce_max_iextents ) ,
2021-01-23 03:48:17 +03:00
XFS_ERRORTAG_ATTR_LIST ( bmap_alloc_minlen_extent ) ,
2021-03-24 05:05:39 +03:00
XFS_ERRORTAG_ATTR_LIST ( ag_resv_fail ) ,
2022-05-11 10:01:22 +03:00
XFS_ERRORTAG_ATTR_LIST ( larp ) ,
2022-05-11 10:01:23 +03:00
XFS_ERRORTAG_ATTR_LIST ( da_leaf_split ) ,
2022-05-11 10:01:23 +03:00
XFS_ERRORTAG_ATTR_LIST ( attr_leaf_to_node ) ,
2022-11-29 04:24:35 +03:00
XFS_ERRORTAG_ATTR_LIST ( wb_delay_ms ) ,
2022-11-29 04:24:36 +03:00
XFS_ERRORTAG_ATTR_LIST ( write_delay_ms ) ,
2024-04-16 00:54:19 +03:00
XFS_ERRORTAG_ATTR_LIST ( exchmaps_finish_one ) ,
2017-06-21 03:54:47 +03:00
NULL ,
} ;
2022-01-03 22:10:18 +03:00
/*
 * Wraps xfs_errortag_attrs in an attribute group and provides the
 * xfs_errortag_groups array referenced by xfs_errortag_ktype.
 */
ATTRIBUTE_GROUPS ( xfs_errortag ) ;
2017-06-21 03:54:47 +03:00
2023-02-10 05:56:48 +03:00
static const struct kobj_type xfs_errortag_ktype = {
2017-06-21 03:54:47 +03:00
. release = xfs_sysfs_release ,
. sysfs_ops = & xfs_errortag_sysfs_ops ,
2022-01-03 22:10:18 +03:00
. default_groups = xfs_errortag_groups ,
2017-06-21 03:54:47 +03:00
} ;
2005-04-17 02:20:36 +04:00
int
2017-06-21 03:54:46 +03:00
xfs_errortag_init (
struct xfs_mount * mp )
2005-04-17 02:20:36 +04:00
{
2022-10-19 00:38:14 +03:00
int ret ;
2024-01-16 01:59:39 +03:00
mp - > m_errortag = kzalloc ( sizeof ( unsigned int ) * XFS_ERRTAG_MAX ,
GFP_KERNEL | __GFP_RETRY_MAYFAIL ) ;
2017-06-21 03:54:46 +03:00
if ( ! mp - > m_errortag )
return - ENOMEM ;
2017-06-21 03:54:47 +03:00
2022-10-19 00:38:14 +03:00
ret = xfs_sysfs_init ( & mp - > m_errortag_kobj , & xfs_errortag_ktype ,
& mp - > m_kobj , " errortag " ) ;
if ( ret )
2024-01-16 01:59:43 +03:00
kfree ( mp - > m_errortag ) ;
2022-10-19 00:38:14 +03:00
return ret ;
2017-06-21 03:54:46 +03:00
}
2005-04-17 02:20:36 +04:00
2017-06-21 03:54:46 +03:00
void
xfs_errortag_del (
struct xfs_mount * mp )
{
2017-06-21 03:54:47 +03:00
xfs_sysfs_del ( & mp - > m_errortag_kobj ) ;
2024-01-16 01:59:43 +03:00
kfree ( mp - > m_errortag ) ;
2017-06-21 03:54:46 +03:00
}
2005-04-17 02:20:36 +04:00
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidating
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t. this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
/*
 * Return true if @error_tag may be used: it has to be below XFS_ERRTAG_MAX
 * and must not be the removed XFS_ERRTAG_DROP_WRITES injection type.
 */
static bool
xfs_errortag_valid(
	unsigned int		error_tag)
{
	return error_tag < XFS_ERRTAG_MAX &&
	       error_tag != XFS_ERRTAG_DROP_WRITES;
}
2022-11-29 04:24:35 +03:00
bool
xfs_errortag_enabled (
struct xfs_mount * mp ,
unsigned int tag )
{
if ( ! mp - > m_errortag )
return false ;
if ( ! xfs_errortag_valid ( tag ) )
return false ;
return mp - > m_errortag [ tag ] ! = 0 ;
}
2017-06-21 03:54:46 +03:00
bool
xfs_errortag_test (
struct xfs_mount * mp ,
const char * expression ,
const char * file ,
int line ,
unsigned int error_tag )
{
unsigned int randfactor ;
2005-04-17 02:20:36 +04:00
2017-06-30 19:46:07 +03:00
/*
* To be able to use error injection anywhere , we need to ensure error
* injection mechanism is already initialized .
*
* Code paths like I / O completion can be called before the
* initialization is complete , but be able to inject errors in such
* places is still useful .
*/
if ( ! mp - > m_errortag )
return false ;
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidates
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t to this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
if ( ! xfs_errortag_valid ( error_tag ) )
return false ;
2017-06-21 03:54:46 +03:00
randfactor = mp - > m_errortag [ error_tag ] ;
2022-10-10 05:44:02 +03:00
if ( ! randfactor | | get_random_u32_below ( randfactor ) )
2017-06-21 03:54:46 +03:00
return false ;
2005-04-17 02:20:36 +04:00
2017-06-21 03:54:46 +03:00
xfs_warn_ratelimited ( mp ,
" Injecting error (%s) at file %s, line %d, on filesystem \" %s \" " ,
2019-11-05 00:58:40 +03:00
expression , file , line , mp - > m_super - > s_id ) ;
2017-06-21 03:54:46 +03:00
return true ;
2005-04-17 02:20:36 +04:00
}
2017-06-21 03:54:47 +03:00
int
xfs_errortag_get (
struct xfs_mount * mp ,
unsigned int error_tag )
{
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidates
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t to this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
if ( ! xfs_errortag_valid ( error_tag ) )
2017-06-21 03:54:47 +03:00
return - EINVAL ;
return mp - > m_errortag [ error_tag ] ;
}
2005-04-17 02:20:36 +04:00
int
2017-06-21 03:54:46 +03:00
xfs_errortag_set (
struct xfs_mount * mp ,
unsigned int error_tag ,
unsigned int tag_value )
2005-04-17 02:20:36 +04:00
{
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidates
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t to this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
if ( ! xfs_errortag_valid ( error_tag ) )
2016-06-21 04:53:28 +03:00
return - EINVAL ;
2017-06-21 03:54:46 +03:00
mp - > m_errortag [ error_tag ] = tag_value ;
return 0 ;
2005-04-17 02:20:36 +04:00
}
int
2017-06-21 03:54:46 +03:00
xfs_errortag_add (
struct xfs_mount * mp ,
unsigned int error_tag )
2005-04-17 02:20:36 +04:00
{
2021-03-22 19:52:02 +03:00
BUILD_BUG_ON ( ARRAY_SIZE ( xfs_errortag_random_default ) ! = XFS_ERRTAG_MAX ) ;
xfs: drop write error injection is unfixable, remove it
With the changes to scan the page cache for dirty data to avoid data
corruptions from partial write cleanup racing with other page cache
operations, the drop writes error injection no longer works the same
way it used to and causes xfs/196 to fail. This is because xfs/196
writes to the file and populates the page cache before it turns on
the error injection and starts failing -overwrites-.
The result is that the original drop-writes code failed writes only
-after- overwriting the data in the cache, followed by invalidates
the cached data, then punching out the delalloc extent from under
that data.
On the surface, this looks fine. The problem is that page cache
invalidation *doesn't guarantee that it removes anything from the
page cache* and it doesn't change the dirty state of the folio. When
block size == page size and we do page aligned IO (as xfs/196 does)
everything happens to align perfectly and page cache invalidation
removes the single page folios that span the written data. Hence the
followup delalloc punch pass does not find cached data over that
range and it can punch the extent out.
IOWs, xfs/196 "works" for block size == page size with the new
code. I say "works", because it actually only works for the case
where IO is page aligned, and no data was read from disk before
writes occur. Because the moment we actually read data first, the
readahead code allocates multipage folios and suddenly the
invalidate code goes back to zeroing subfolio ranges without
changing dirty state.
Hence, with multipage folios in play, block size == page size is
functionally identical to block size < page size behaviour, and
drop-writes is manifestly broken w.r.t to this case. Invalidation of
a subfolio range doesn't result in the folio being removed from the
cache, just the range gets zeroed. Hence after we've sequentially
walked over a folio that we've dirtied (via write data) and then
invalidated, we end up with a dirty folio full of zeroed data.
And because the new code skips punching ranges that have dirty
folios covering them, we end up leaving the delalloc range intact
after failing all the writes. Hence failed writes now end up
writing zeroes to disk in the cases where invalidation zeroes folios
rather than removing them from cache.
This is a fundamental change of behaviour that is needed to avoid
the data corruption vectors that exist in the old write fail path,
and it renders the drop-writes injection non-functional and
unworkable as it stands.
As it is, I think the error injection is also now unnecessary, as
partial writes that need delalloc extent are going to be a lot more
common with stale iomap detection in place. Hence this patch removes
the drop-writes error injection completely. xfs/196 can remain for
testing kernels that don't have this data corruption fix, but those
that do will report:
xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
2022-11-29 01:09:17 +03:00
if ( ! xfs_errortag_valid ( error_tag ) )
2017-06-21 03:54:46 +03:00
return - EINVAL ;
2005-04-17 02:20:36 +04:00
2017-06-21 03:54:46 +03:00
return xfs_errortag_set ( mp , error_tag ,
xfs_errortag_random_default [ error_tag ] ) ;
}
2005-04-17 02:20:36 +04:00
2017-06-21 03:54:46 +03:00
int
xfs_errortag_clearall (
struct xfs_mount * mp )
{
memset ( mp - > m_errortag , 0 , sizeof ( unsigned int ) * XFS_ERRTAG_MAX ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
2008-08-13 10:17:37 +04:00
# endif /* DEBUG */
2005-04-17 02:20:36 +04:00
void
xfs_error_report (
2010-04-13 09:22:08 +04:00
const char * tag ,
int level ,
struct xfs_mount * mp ,
const char * filename ,
int linenum ,
2018-01-08 21:51:25 +03:00
xfs_failaddr_t failaddr )
2005-04-17 02:20:36 +04:00
{
if ( level < = xfs_error_level ) {
2011-03-07 02:02:35 +03:00
xfs_alert_tag ( mp , XFS_PTAG_ERROR_REPORT ,
2015-03-25 06:56:21 +03:00
" Internal error %s at line %d of file %s. Caller %pS " ,
2018-01-08 21:51:25 +03:00
tag , linenum , filename , failaddr ) ;
2005-04-17 02:20:36 +04:00
xfs_stack_trace ( ) ;
}
}
void
xfs_corruption_error (
2010-04-13 09:22:08 +04:00
const char * tag ,
int level ,
struct xfs_mount * mp ,
2019-11-02 19:40:36 +03:00
const void * buf ,
2018-06-04 20:23:54 +03:00
size_t bufsize ,
2010-04-13 09:22:08 +04:00
const char * filename ,
int linenum ,
2018-01-08 21:51:25 +03:00
xfs_failaddr_t failaddr )
2005-04-17 02:20:36 +04:00
{
2019-11-17 21:36:52 +03:00
if ( buf & & level < = xfs_error_level )
2018-06-04 20:23:54 +03:00
xfs_hex_dump ( buf , bufsize ) ;
2018-01-08 21:51:25 +03:00
xfs_error_report ( tag , level , mp , filename , linenum , failaddr ) ;
2011-03-07 02:03:35 +03:00
xfs_alert ( mp , " Corruption detected. Unmount and run xfs_repair " ) ;
2005-04-17 02:20:36 +04:00
}
2014-02-27 08:21:07 +04:00
2019-11-02 19:40:53 +03:00
/*
 * Report metadata corruption of a kind that a verifier cannot detect, for
 * example inconsistent inter-block relationship data.  This function only
 * logs; it does not set bp->b_error.
 *
 * Callers should use xfs_buf_mark_corrupt rather than calling this directly.
 *
 * A stack trace is added only when xfs_error_level is at least
 * XFS_ERRLEVEL_HIGH.
 */
void
xfs_buf_corruption_error(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
		" Metadata corruption detected at %pS, %s block 0x%llx ",
		fa, bp->b_ops->name, xfs_buf_daddr(bp));
	xfs_alert(bp->b_mount, " Unmount and run xfs_repair ");

	if (xfs_error_level < XFS_ERRLEVEL_HIGH)
		return;

	xfs_stack_trace();
}
2014-02-27 08:21:07 +04:00
/*
* Warnings specifically for verifier errors . Differentiate CRC vs . invalid
* values , and omit the stack trace unless the error level is tuned high .
*/
void
2018-03-23 20:06:53 +03:00
xfs_buf_verifier_error (
2018-01-08 21:51:02 +03:00
struct xfs_buf * bp ,
2018-01-08 21:51:03 +03:00
int error ,
2018-03-23 20:06:53 +03:00
const char * name ,
2019-11-02 19:40:36 +03:00
const void * buf ,
2018-03-23 20:06:53 +03:00
size_t bufsz ,
2018-01-08 21:51:03 +03:00
xfs_failaddr_t failaddr )
2014-02-27 08:21:07 +04:00
{
2019-06-29 05:27:29 +03:00
struct xfs_mount * mp = bp - > b_mount ;
2018-01-08 21:51:03 +03:00
xfs_failaddr_t fa ;
2018-03-23 20:06:53 +03:00
int sz ;
2014-02-27 08:21:07 +04:00
2018-01-08 21:51:03 +03:00
fa = failaddr ? failaddr : __return_address ;
__xfs_buf_ioerror ( bp , error , fa ) ;
2018-01-08 21:51:02 +03:00
2019-02-01 20:12:20 +03:00
xfs_alert_tag ( mp , XFS_PTAG_VERIFIER_ERROR ,
" Metadata %s detected at %pS, %s block 0x%llx %s " ,
2014-06-25 08:58:08 +04:00
bp - > b_error = = - EFSBADCRC ? " CRC error " : " corruption " ,
2021-08-19 04:47:05 +03:00
fa , bp - > b_ops - > name , xfs_buf_daddr ( bp ) , name ) ;
2014-02-27 08:21:07 +04:00
xfs_alert ( mp , " Unmount and run xfs_repair " ) ;
if ( xfs_error_level > = XFS_ERRLEVEL_LOW ) {
2018-03-23 20:06:53 +03:00
sz = min_t ( size_t , XFS_CORRUPTION_DUMP_LEN , bufsz ) ;
2018-01-08 21:51:26 +03:00
xfs_alert ( mp , " First %d bytes of corrupted metadata buffer: " ,
2018-03-23 20:06:53 +03:00
sz ) ;
xfs_hex_dump ( buf , sz ) ;
2014-02-27 08:21:07 +04:00
}
if ( xfs_error_level > = XFS_ERRLEVEL_HIGH )
xfs_stack_trace ( ) ;
}
2018-01-23 05:09:48 +03:00
2018-03-23 20:06:53 +03:00
/*
 * Convenience wrapper around xfs_buf_verifier_error() for callers that have
 * no extra name to report and no specific region to dump: the name argument
 * is a blank string and the dump covers XFS_CORRUPTION_DUMP_LEN bytes
 * starting at offset 0 of @bp.
 *
 * @bp:		buffer that failed verification
 * @error:	error code passed through to xfs_buf_verifier_error()
 * @failaddr:	failure address passed through unchanged
 */
void
xfs_verifier_error(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	/*
	 * Plain call rather than "return <expr>;": a return statement with an
	 * expression is not permitted in a function returning void.
	 */
	xfs_buf_verifier_error(bp, error, " ", xfs_buf_offset(bp, 0),
			XFS_CORRUPTION_DUMP_LEN, failaddr);
}
2018-01-23 05:09:48 +03:00
/*
 * Report an inode verifier failure.
 *
 * The alert says "CRC error" when @error is -EFSBADCRC and "corruption"
 * otherwise, and names the inode number plus the caller-supplied @name.  The
 * failure location is @failaddr or, when that is NULL, this function's return
 * address.
 *
 * When @buf is non-NULL and xfs_error_level is at least XFS_ERRLEVEL_LOW, up
 * to XFS_CORRUPTION_DUMP_LEN bytes of it (bounded by @bufsz) are hex dumped.
 * A stack trace is emitted only at XFS_ERRLEVEL_HIGH and above.
 */
void
xfs_inode_verifier_error(
	struct xfs_inode	*ip,
	int			error,
	const char		*name,
	const void		*buf,
	size_t			bufsz,
	xfs_failaddr_t		failaddr)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_failaddr_t		where = failaddr;
	const char		*what;
	int			dump_len;

	if (!where)
		where = __return_address;

	if (error == -EFSBADCRC)
		what = " CRC error ";
	else
		what = " corruption ";

	xfs_alert(mp, " Metadata %s detected at %pS, inode 0x%llx %s ",
		what, where, ip->i_ino, name);
	xfs_alert(mp, " Unmount and run xfs_repair ");

	if (buf && xfs_error_level >= XFS_ERRLEVEL_LOW) {
		dump_len = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz);
		xfs_alert(mp,
			" First %d bytes of corrupted metadata buffer: ",
			dump_len);
		xfs_hex_dump(buf, dump_len);
	}

	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
}