2008-09-06 00:13:11 +04:00
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
# ifndef __TREE_LOG_
# define __TREE_LOG_
2014-04-02 15:51:06 +04:00
# include "ctree.h"
# include "transaction.h"
2009-10-13 21:21:08 +04:00
/*
 * Return value for btrfs_log_dentry_safe() that means we don't need to log
 * it at all.
 *
 * NOTE(review): going by the name, callers presumably skip the log sync when
 * they see this value - confirm against the callers.
 */
# define BTRFS_NO_LOG_SYNC 256
2014-02-20 14:08:58 +04:00
struct btrfs_log_ctx {
int log_ret ;
2014-02-20 14:08:59 +04:00
int log_transid ;
Btrfs: fix data corruption after fast fsync and writeback error
When we do a fast fsync, we start all ordered operations and then while
they're running in parallel we visit the list of modified extent maps
and construct their matching file extent items and write them to the
log btree. After that, in btrfs_sync_log() we wait for all the ordered
operations to finish (via btrfs_wait_logged_extents).
The problem with this is that we were completely ignoring errors that
can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM
or -EIO errors for example. When such error happens, it means we have parts
of the on disk extent that weren't written to, and so we end up logging
file extent items that point to these extents that contain garbage/random
data - so after a crash/reboot plus log replay, we get our inode's metadata
pointing to those extents.
This worked in contrast with the full (non-fast) fsync path, where we
start all ordered operations, wait for them to finish and then write
to the log btree. In this path, after each ordered operation completes
we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return
-EIO if so (via btrfs_wait_ordered_range).
So if an error happens with any ordered operation, just return a -EIO
error to userspace, so that it knows that not all of its previous writes
were durably persisted and the application can take proper action (like
redo the writes for e.g.) - and definitely not leave any file extent items
in the log refer to non fully written extents.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-05 18:14:39 +04:00
int io_err ;
2014-02-20 14:08:58 +04:00
struct list_head list ;
} ;
static inline void btrfs_init_log_ctx ( struct btrfs_log_ctx * ctx )
{
ctx - > log_ret = 0 ;
2014-02-20 14:08:59 +04:00
ctx - > log_transid = 0 ;
Btrfs: fix data corruption after fast fsync and writeback error
When we do a fast fsync, we start all ordered operations and then while
they're running in parallel we visit the list of modified extent maps
and construct their matching file extent items and write them to the
log btree. After that, in btrfs_sync_log() we wait for all the ordered
operations to finish (via btrfs_wait_logged_extents).
The problem with this is that we were completely ignoring errors that
can happen in the extent write path, such as -ENOSPC, a temporary -ENOMEM
or -EIO errors for example. When such error happens, it means we have parts
of the on disk extent that weren't written to, and so we end up logging
file extent items that point to these extents that contain garbage/random
data - so after a crash/reboot plus log replay, we get our inode's metadata
pointing to those extents.
This worked in contrast with the full (non-fast) fsync path, where we
start all ordered operations, wait for them to finish and then write
to the log btree. In this path, after each ordered operation completes
we check if it's flagged with an error (BTRFS_ORDERED_IOERR) and return
-EIO if so (via btrfs_wait_ordered_range).
So if an error happens with any ordered operation, just return a -EIO
error to userspace, so that it knows that not all of its previous writes
were durably persisted and the application can take proper action (like
redo the writes for e.g.) - and definitely not leave any file extent items
in the log refer to non fully written extents.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
2014-09-05 18:14:39 +04:00
ctx - > io_err = 0 ;
2014-02-20 14:08:58 +04:00
INIT_LIST_HEAD ( & ctx - > list ) ;
}
2014-04-02 15:51:06 +04:00
static inline void btrfs_set_log_full_commit ( struct btrfs_fs_info * fs_info ,
struct btrfs_trans_handle * trans )
{
ACCESS_ONCE ( fs_info - > last_trans_log_full_commit ) = trans - > transid ;
}
static inline int btrfs_need_log_full_commit ( struct btrfs_fs_info * fs_info ,
struct btrfs_trans_handle * trans )
{
return ACCESS_ONCE ( fs_info - > last_trans_log_full_commit ) = =
trans - > transid ;
}
2008-09-06 00:13:11 +04:00
/*
 * Tree-log interface.  These functions are only declared here; their
 * definitions are not part of this header.
 */
int btrfs_sync_log(struct btrfs_trans_handle *trans,
		   struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
/*
 * May return BTRFS_NO_LOG_SYNC, which means nothing needs to be logged at
 * all.
 */
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
			  const loff_t start,
			  const loff_t end,
			  struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 const char *name, int name_len,
				 struct inode *dir, u64 index);
int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const char *name, int name_len,
			       struct inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     int for_rename);
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		       struct inode *inode, struct inode *old_dir,
		       struct dentry *parent);
2008-09-06 00:13:11 +04:00
# endif