2012-11-29 08:28:09 +04:00
/*
2012-11-02 12:10:40 +04:00
* fs/f2fs/inode.c
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
*             http://www.samsung.com/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
# include <linux/fs.h>
# include <linux/f2fs_fs.h>
# include <linux/buffer_head.h>
# include <linux/writeback.h>
# include "f2fs.h"
# include "node.h"
2013-04-19 20:28:40 +04:00
# include <trace/events/f2fs.h>
2012-11-02 12:10:40 +04:00
void f2fs_set_inode_flags ( struct inode * inode )
{
unsigned int flags = F2FS_I ( inode ) - > i_flags ;
2014-04-15 10:19:38 +04:00
unsigned int new_fl = 0 ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_SYNC_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_SYNC ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_APPEND_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_APPEND ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_IMMUTABLE_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_IMMUTABLE ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_NOATIME_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_NOATIME ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_DIRSYNC_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_DIRSYNC ;
2015-08-24 05:41:32 +03:00
inode_set_flags ( inode , new_fl ,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC ) ;
2012-11-02 12:10:40 +04:00
}
2013-10-08 13:01:51 +04:00
static void __get_inode_rdev ( struct inode * inode , struct f2fs_inode * ri )
{
if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) | |
S_ISFIFO ( inode - > i_mode ) | | S_ISSOCK ( inode - > i_mode ) ) {
if ( ri - > i_addr [ 0 ] )
2014-01-18 00:44:39 +04:00
inode - > i_rdev =
old_decode_dev ( le32_to_cpu ( ri - > i_addr [ 0 ] ) ) ;
2013-10-08 13:01:51 +04:00
else
2014-01-18 00:44:39 +04:00
inode - > i_rdev =
new_decode_dev ( le32_to_cpu ( ri - > i_addr [ 1 ] ) ) ;
2013-10-08 13:01:51 +04:00
}
}
2015-03-18 03:16:35 +03:00
static bool __written_first_block ( struct f2fs_inode * ri )
{
2015-03-24 22:04:20 +03:00
block_t addr = le32_to_cpu ( ri - > i_addr [ 0 ] ) ;
if ( addr ! = NEW_ADDR & & addr ! = NULL_ADDR )
2015-03-18 03:16:35 +03:00
return true ;
return false ;
}
2013-10-08 13:01:51 +04:00
static void __set_inode_rdev ( struct inode * inode , struct f2fs_inode * ri )
{
if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) ) {
if ( old_valid_dev ( inode - > i_rdev ) ) {
2014-01-18 00:44:39 +04:00
ri - > i_addr [ 0 ] =
cpu_to_le32 ( old_encode_dev ( inode - > i_rdev ) ) ;
2013-10-08 13:01:51 +04:00
ri - > i_addr [ 1 ] = 0 ;
} else {
ri - > i_addr [ 0 ] = 0 ;
2014-01-18 00:44:39 +04:00
ri - > i_addr [ 1 ] =
cpu_to_le32 ( new_encode_dev ( inode - > i_rdev ) ) ;
2013-10-08 13:01:51 +04:00
ri - > i_addr [ 2 ] = 0 ;
}
}
}
2015-01-06 09:28:43 +03:00
static void __recover_inline_status ( struct inode * inode , struct page * ipage )
2014-10-24 06:48:09 +04:00
{
void * inline_data = inline_data_addr ( ipage ) ;
2015-01-06 09:28:43 +03:00
__le32 * start = inline_data ;
__le32 * end = start + MAX_INLINE_DATA / sizeof ( __le32 ) ;
2014-10-24 06:48:09 +04:00
2015-01-06 09:28:43 +03:00
while ( start < end ) {
if ( * start + + ) {
f2fs_wait_on_page_writeback ( ipage , NODE ) ;
2014-10-24 06:48:09 +04:00
2015-01-06 09:28:43 +03:00
set_inode_flag ( F2FS_I ( inode ) , FI_DATA_EXIST ) ;
set_raw_inline ( F2FS_I ( inode ) , F2FS_INODE ( ipage ) ) ;
set_page_dirty ( ipage ) ;
return ;
}
2014-10-24 06:48:09 +04:00
}
2015-01-06 09:28:43 +03:00
return ;
2014-10-24 06:48:09 +04:00
}
2012-11-02 12:10:40 +04:00
static int do_read_inode ( struct inode * inode )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 12:10:40 +04:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct page * node_page ;
struct f2fs_inode * ri ;
/* Check if ino is within scope */
2013-03-17 12:27:20 +04:00
if ( check_nid_range ( sbi , inode - > i_ino ) ) {
f2fs_msg ( inode - > i_sb , KERN_ERR , " bad inode number: %lu " ,
( unsigned long ) inode - > i_ino ) ;
2014-06-12 09:23:41 +04:00
WARN_ON ( 1 ) ;
2013-03-17 12:27:20 +04:00
return - EINVAL ;
}
2012-11-02 12:10:40 +04:00
node_page = get_node_page ( sbi , inode - > i_ino ) ;
if ( IS_ERR ( node_page ) )
return PTR_ERR ( node_page ) ;
2013-12-26 11:30:41 +04:00
ri = F2FS_INODE ( node_page ) ;
2012-11-02 12:10:40 +04:00
inode - > i_mode = le16_to_cpu ( ri - > i_mode ) ;
i_uid_write ( inode , le32_to_cpu ( ri - > i_uid ) ) ;
i_gid_write ( inode , le32_to_cpu ( ri - > i_gid ) ) ;
set_nlink ( inode , le32_to_cpu ( ri - > i_links ) ) ;
inode - > i_size = le64_to_cpu ( ri - > i_size ) ;
inode - > i_blocks = le64_to_cpu ( ri - > i_blocks ) ;
inode - > i_atime . tv_sec = le64_to_cpu ( ri - > i_atime ) ;
inode - > i_ctime . tv_sec = le64_to_cpu ( ri - > i_ctime ) ;
inode - > i_mtime . tv_sec = le64_to_cpu ( ri - > i_mtime ) ;
inode - > i_atime . tv_nsec = le32_to_cpu ( ri - > i_atime_nsec ) ;
inode - > i_ctime . tv_nsec = le32_to_cpu ( ri - > i_ctime_nsec ) ;
inode - > i_mtime . tv_nsec = le32_to_cpu ( ri - > i_mtime_nsec ) ;
inode - > i_generation = le32_to_cpu ( ri - > i_generation ) ;
fi - > i_current_depth = le32_to_cpu ( ri - > i_current_depth ) ;
fi - > i_xattr_nid = le32_to_cpu ( ri - > i_xattr_nid ) ;
fi - > i_flags = le32_to_cpu ( ri - > i_flags ) ;
fi - > flags = 0 ;
fi - > i_advise = ri - > i_advise ;
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 12:52:48 +04:00
fi - > i_pino = le32_to_cpu ( ri - > i_pino ) ;
2014-02-27 13:20:00 +04:00
fi - > i_dir_level = ri - > i_dir_level ;
2013-10-08 13:01:51 +04:00
2015-12-28 22:39:06 +03:00
if ( f2fs_init_extent_tree ( inode , & ri - > i_ext ) )
set_page_dirty ( node_page ) ;
2015-02-05 12:46:29 +03:00
2013-08-08 10:16:22 +04:00
get_inline_info ( fi , ri ) ;
2013-10-08 13:01:51 +04:00
2014-10-24 06:48:09 +04:00
/* check data exist */
if ( f2fs_has_inline_data ( inode ) & & ! f2fs_exist_data ( inode ) )
2015-01-06 09:28:43 +03:00
__recover_inline_status ( inode , node_page ) ;
2014-10-24 06:48:09 +04:00
2013-10-08 13:01:51 +04:00
/* get rdev by using inline_info */
__get_inode_rdev ( inode , ri ) ;
2015-03-18 03:16:35 +03:00
if ( __written_first_block ( ri ) )
set_inode_flag ( F2FS_I ( inode ) , FI_FIRST_BLOCK_WRITTEN ) ;
2012-11-02 12:10:40 +04:00
f2fs_put_page ( node_page , 1 ) ;
2014-12-05 21:51:50 +03:00
2015-07-15 12:28:53 +03:00
stat_inc_inline_xattr ( inode ) ;
2014-12-05 21:51:50 +03:00
stat_inc_inline_inode ( inode ) ;
stat_inc_inline_dir ( inode ) ;
2015-01-06 09:28:43 +03:00
return 0 ;
2012-11-02 12:10:40 +04:00
}
struct inode * f2fs_iget ( struct super_block * sb , unsigned long ino )
{
struct f2fs_sb_info * sbi = F2FS_SB ( sb ) ;
struct inode * inode ;
2013-04-19 20:28:40 +04:00
int ret = 0 ;
2012-11-02 12:10:40 +04:00
inode = iget_locked ( sb , ino ) ;
if ( ! inode )
return ERR_PTR ( - ENOMEM ) ;
2013-04-19 20:28:40 +04:00
if ( ! ( inode - > i_state & I_NEW ) ) {
trace_f2fs_iget ( inode ) ;
2012-11-02 12:10:40 +04:00
return inode ;
2013-04-19 20:28:40 +04:00
}
2012-11-02 12:10:40 +04:00
if ( ino = = F2FS_NODE_INO ( sbi ) | | ino = = F2FS_META_INO ( sbi ) )
goto make_now ;
ret = do_read_inode ( inode ) ;
if ( ret )
goto bad_inode ;
make_now :
if ( ino = = F2FS_NODE_INO ( sbi ) ) {
inode - > i_mapping - > a_ops = & f2fs_node_aops ;
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_ZERO ) ;
} else if ( ino = = F2FS_META_INO ( sbi ) ) {
inode - > i_mapping - > a_ops = & f2fs_meta_aops ;
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_ZERO ) ;
} else if ( S_ISREG ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_file_inode_operations ;
inode - > i_fop = & f2fs_file_operations ;
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
} else if ( S_ISDIR ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_dir_inode_operations ;
inode - > i_fop = & f2fs_dir_operations ;
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
2014-10-18 04:57:29 +04:00
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_HIGH_ZERO ) ;
2012-11-02 12:10:40 +04:00
} else if ( S_ISLNK ( inode - > i_mode ) ) {
2015-04-30 01:10:53 +03:00
if ( f2fs_encrypted_inode ( inode ) )
inode - > i_op = & f2fs_encrypted_symlink_inode_operations ;
else
inode - > i_op = & f2fs_symlink_inode_operations ;
2015-11-17 09:07:57 +03:00
inode_nohighmem ( inode ) ;
2012-11-02 12:10:40 +04:00
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
} else if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) | |
S_ISFIFO ( inode - > i_mode ) | | S_ISSOCK ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_special_inode_operations ;
init_special_inode ( inode , inode - > i_mode , inode - > i_rdev ) ;
} else {
ret = - EIO ;
goto bad_inode ;
}
unlock_new_inode ( inode ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_iget ( inode ) ;
2012-11-02 12:10:40 +04:00
return inode ;
bad_inode :
iget_failed ( inode ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_iget_exit ( inode , ret ) ;
2012-11-02 12:10:40 +04:00
return ERR_PTR ( ret ) ;
}
2016-01-08 00:23:12 +03:00
int update_inode ( struct inode * inode , struct page * node_page )
2012-11-02 12:10:40 +04:00
{
struct f2fs_inode * ri ;
2014-01-10 11:26:14 +04:00
f2fs_wait_on_page_writeback ( node_page , NODE ) ;
2012-11-02 12:10:40 +04:00
2013-12-26 11:30:41 +04:00
ri = F2FS_INODE ( node_page ) ;
2012-11-02 12:10:40 +04:00
ri - > i_mode = cpu_to_le16 ( inode - > i_mode ) ;
ri - > i_advise = F2FS_I ( inode ) - > i_advise ;
ri - > i_uid = cpu_to_le32 ( i_uid_read ( inode ) ) ;
ri - > i_gid = cpu_to_le32 ( i_gid_read ( inode ) ) ;
ri - > i_links = cpu_to_le32 ( inode - > i_nlink ) ;
ri - > i_size = cpu_to_le64 ( i_size_read ( inode ) ) ;
ri - > i_blocks = cpu_to_le64 ( inode - > i_blocks ) ;
2015-02-05 12:46:29 +03:00
2015-06-20 03:53:26 +03:00
if ( F2FS_I ( inode ) - > extent_tree )
set_raw_extent ( & F2FS_I ( inode ) - > extent_tree - > largest ,
& ri - > i_ext ) ;
else
memset ( & ri - > i_ext , 0 , sizeof ( ri - > i_ext ) ) ;
2013-08-08 10:16:22 +04:00
set_raw_inline ( F2FS_I ( inode ) , ri ) ;
2012-11-02 12:10:40 +04:00
ri - > i_atime = cpu_to_le64 ( inode - > i_atime . tv_sec ) ;
ri - > i_ctime = cpu_to_le64 ( inode - > i_ctime . tv_sec ) ;
ri - > i_mtime = cpu_to_le64 ( inode - > i_mtime . tv_sec ) ;
ri - > i_atime_nsec = cpu_to_le32 ( inode - > i_atime . tv_nsec ) ;
ri - > i_ctime_nsec = cpu_to_le32 ( inode - > i_ctime . tv_nsec ) ;
ri - > i_mtime_nsec = cpu_to_le32 ( inode - > i_mtime . tv_nsec ) ;
ri - > i_current_depth = cpu_to_le32 ( F2FS_I ( inode ) - > i_current_depth ) ;
ri - > i_xattr_nid = cpu_to_le32 ( F2FS_I ( inode ) - > i_xattr_nid ) ;
ri - > i_flags = cpu_to_le32 ( F2FS_I ( inode ) - > i_flags ) ;
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 12:52:48 +04:00
ri - > i_pino = cpu_to_le32 ( F2FS_I ( inode ) - > i_pino ) ;
2012-11-02 12:10:40 +04:00
ri - > i_generation = cpu_to_le32 ( inode - > i_generation ) ;
2014-02-27 13:20:00 +04:00
ri - > i_dir_level = F2FS_I ( inode ) - > i_dir_level ;
f2fs: save device node number into f2fs_inode
This patch stores inode->i_rdev into on-disk inode structure.
Alun reported that:
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # mknod mnt/sda1 b 8 1
aspire tmp # mknod mnt/null c 1 3
aspire tmp # mknod mnt/console c 5 1
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 5, 1 Jan 22 18:44 console
crw-r--r-- 1 root root 1, 3 Jan 22 18:44 null
brw-r--r-- 1 root root 8, 1 Jan 22 18:44 sda1
aspire tmp # umount mnt
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 console
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 null
brw-r--r-- 1 root root 0, 0 Jan 22 18:44 sda1
In this report, f2fs lost the major/minor numbers of device files after umount.
The reason was revealed that f2fs does not store the inode->i_rdev to the
on-disk inode data structure.
So, as the other file systems do, f2fs also stores i_rdev into the i_addr fields
in on-disk inode structure without any on-disk layout changes.
Note that, this bug is limited to device files made by mknod().
Reported-and-Tested-by: Alun Jones <alun.linux@ty-penguin.org.uk>
Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-01-23 04:40:23 +04:00
2013-10-08 13:01:51 +04:00
__set_inode_rdev ( inode , ri ) ;
f2fs: fix handling errors got by f2fs_write_inode
Ruslan reported that f2fs hangs with an infinite loop in f2fs_sync_file():
while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
f2fs_write_inode(inode, NULL);
The reason was revealed that the cold flag is not set even thought this inode is
a normal file. Therefore, sync_node_pages() skips to write node blocks since it
only writes cold node blocks.
The cold flag is stored to the node_footer in node block, and whenever a new
node page is allocated, it is set according to its file type, file or directory.
But, after sudden-power-off, when recovering the inode page, f2fs doesn't recover
its cold flag.
So, let's assign the cold flag in more right places.
One more thing:
If f2fs_write_inode() returns an error due to whatever situations, there would
be no dirty node pages so that sync_node_pages() returns zero.
(i.e., zero means nothing was written.)
Reported-by: Ruslan N. Marchenko <me@ruff.mobi>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-19 10:28:39 +04:00
set_cold_node ( inode , node_page ) ;
2013-06-10 04:17:01 +04:00
clear_inode_flag ( F2FS_I ( inode ) , FI_DIRTY_INODE ) ;
2016-01-08 00:23:12 +03:00
2016-01-25 16:57:05 +03:00
/* deleted inode */
if ( inode - > i_nlink = = 0 )
clear_inline_node ( node_page ) ;
2016-01-08 00:23:12 +03:00
return set_page_dirty ( node_page ) ;
2012-11-02 12:10:40 +04:00
}
2016-01-08 00:23:12 +03:00
int update_inode_page ( struct inode * inode )
2012-11-02 12:10:40 +04:00
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 12:10:40 +04:00
struct page * node_page ;
2016-01-08 00:23:12 +03:00
int ret = 0 ;
2014-01-24 04:42:16 +04:00
retry :
2012-11-02 12:10:40 +04:00
node_page = get_node_page ( sbi , inode - > i_ino ) ;
2014-01-24 04:42:16 +04:00
if ( IS_ERR ( node_page ) ) {
int err = PTR_ERR ( node_page ) ;
if ( err = = - ENOMEM ) {
cond_resched ( ) ;
goto retry ;
} else if ( err ! = - ENOENT ) {
f2fs_stop_checkpoint ( sbi ) ;
}
2016-01-08 00:23:12 +03:00
return 0 ;
2014-01-24 04:42:16 +04:00
}
2016-01-08 00:23:12 +03:00
ret = update_inode ( inode , node_page ) ;
2012-11-02 12:10:40 +04:00
f2fs_put_page ( node_page , 1 ) ;
2016-01-08 00:23:12 +03:00
return ret ;
2012-11-02 12:10:40 +04:00
}
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose performance in a multi-threaded environment,
since every type of operation must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of the operation type.
So, let users grab a mutex and perform their jobs in parallel as much as
possible.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an available lock from the array.
- returns the index of the acquired lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
int f2fs_write_inode ( struct inode * inode , struct writeback_control * wbc )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
if ( inode - > i_ino = = F2FS_NODE_INO ( sbi ) | |
inode - > i_ino = = F2FS_META_INO ( sbi ) )
return 0 ;
2013-06-10 04:17:01 +04:00
if ( ! is_inode_flag_set ( F2FS_I ( inode ) , FI_DIRTY_INODE ) )
return 0 ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
/*
2015-09-12 21:25:30 +03:00
* We need to balance fs here to prevent from producing dirty node pages
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
* during the urgent cleaning time when runing out of free sections .
*/
2016-01-08 00:23:12 +03:00
if ( update_inode_page ( inode ) )
2016-01-08 01:15:04 +03:00
f2fs_balance_fs ( sbi , true ) ;
2014-01-24 04:42:16 +04:00
return 0 ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
}
2012-11-29 08:28:09 +04:00
/*
2012-11-02 12:10:40 +04:00
* Called at the last iput ( ) if i_nlink is zero
*/
void f2fs_evict_inode ( struct inode * inode )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2015-06-23 20:36:08 +03:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
nid_t xnid = fi - > i_xattr_nid ;
2015-08-24 12:40:45 +03:00
int err = 0 ;
2012-11-02 12:10:40 +04:00
2014-10-07 04:39:50 +04:00
/* some remained atomic pages should discarded */
2014-12-09 17:08:59 +03:00
if ( f2fs_is_atomic_file ( inode ) )
2014-10-07 04:39:50 +04:00
commit_inmem_pages ( inode , true ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_evict_inode ( inode ) ;
2014-04-04 01:47:49 +04:00
truncate_inode_pages_final ( & inode - > i_data ) ;
2012-11-02 12:10:40 +04:00
if ( inode - > i_ino = = F2FS_NODE_INO ( sbi ) | |
inode - > i_ino = = F2FS_META_INO ( sbi ) )
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formely
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reservation, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skipping needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 08:00:57 +04:00
goto out_clear ;
2012-11-02 12:10:40 +04:00
2014-09-13 02:53:45 +04:00
f2fs_bug_on ( sbi , get_dirty_pages ( inode ) ) ;
2015-12-16 08:09:20 +03:00
remove_dirty_inode ( inode ) ;
2012-11-02 12:10:40 +04:00
2015-06-20 03:53:26 +03:00
f2fs_destroy_extent_tree ( inode ) ;
2012-11-02 12:10:40 +04:00
if ( inode - > i_nlink | | is_bad_inode ( inode ) )
goto no_delete ;
2013-01-29 13:30:07 +04:00
sb_start_intwrite ( inode - > i_sb ) ;
2015-06-23 20:36:08 +03:00
set_inode_flag ( fi , FI_NO_ALLOC ) ;
2012-11-02 12:10:40 +04:00
i_size_write ( inode , 0 ) ;
if ( F2FS_HAS_BLOCKS ( inode ) )
2015-08-24 12:40:45 +03:00
err = f2fs_truncate ( inode , true ) ;
2012-11-02 12:10:40 +04:00
2015-08-24 12:40:45 +03:00
if ( ! err ) {
f2fs_lock_op ( sbi ) ;
err = remove_inode_page ( inode ) ;
f2fs_unlock_op ( sbi ) ;
}
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose performance in a multi-threaded environment,
since every type of operation must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possible.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an available lock from the array.
- returns the index of the acquired lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
2013-01-29 13:30:07 +04:00
sb_end_intwrite ( inode - > i_sb ) ;
2012-11-02 12:10:40 +04:00
no_delete :
2015-07-15 12:28:53 +03:00
stat_dec_inline_xattr ( inode ) ;
2014-10-14 07:00:16 +04:00
stat_dec_inline_dir ( inode ) ;
2014-10-14 21:29:50 +04:00
stat_dec_inline_inode ( inode ) ;
2015-03-19 14:27:51 +03:00
2014-04-30 10:04:39 +04:00
invalidate_mapping_pages ( NODE_MAPPING ( sbi ) , inode - > i_ino , inode - > i_ino ) ;
2014-08-04 05:54:58 +04:00
if ( xnid )
invalidate_mapping_pages ( NODE_MAPPING ( sbi ) , xnid , xnid ) ;
2015-06-23 20:36:08 +03:00
if ( is_inode_flag_set ( fi , FI_APPEND_WRITE ) )
2015-12-15 08:29:47 +03:00
add_ino_entry ( sbi , inode - > i_ino , APPEND_INO ) ;
2015-06-23 20:36:08 +03:00
if ( is_inode_flag_set ( fi , FI_UPDATE_WRITE ) )
2015-12-15 08:29:47 +03:00
add_ino_entry ( sbi , inode - > i_ino , UPDATE_INO ) ;
2015-06-23 20:36:08 +03:00
if ( is_inode_flag_set ( fi , FI_FREE_NID ) ) {
2015-08-24 12:40:45 +03:00
if ( err & & err ! = - ENOENT )
alloc_nid_done ( sbi , inode - > i_ino ) ;
else
alloc_nid_failed ( sbi , inode - > i_ino ) ;
2015-06-23 20:36:08 +03:00
clear_inode_flag ( fi , FI_FREE_NID ) ;
}
2015-08-24 12:40:45 +03:00
if ( err & & err ! = - ENOENT ) {
if ( ! exist_written_data ( sbi , inode - > i_ino , ORPHAN_INO ) ) {
/*
* get here because we failed to release resource
* of inode previously , reminder our user to run fsck
* for fixing .
*/
set_sbi_flag ( sbi , SBI_NEED_FSCK ) ;
f2fs_msg ( sbi - > sb , KERN_WARNING ,
" inode (ino:%lu) resource leak, run fsck "
" to fix this issue! " , inode - > i_ino ) ;
}
}
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and the former
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reserved, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skip needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 08:00:57 +04:00
out_clear :
2015-04-22 06:39:58 +03:00
# ifdef CONFIG_F2FS_FS_ENCRYPTION
2015-06-23 20:36:08 +03:00
if ( fi - > i_crypt_info )
f2fs_free_encryption_info ( inode , fi - > i_crypt_info ) ;
2015-04-22 06:39:58 +03:00
# endif
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and the former
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reserved, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skip needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 08:00:57 +04:00
clear_inode ( inode ) ;
2012-11-02 12:10:40 +04:00
}
2014-09-25 22:55:53 +04:00
/* caller should call f2fs_lock_op() */
void handle_failed_inode ( struct inode * inode )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2015-08-24 12:40:45 +03:00
int err = 0 ;
2014-09-25 22:55:53 +04:00
clear_nlink ( inode ) ;
make_bad_inode ( inode ) ;
unlock_new_inode ( inode ) ;
i_size_write ( inode , 0 ) ;
if ( F2FS_HAS_BLOCKS ( inode ) )
2015-08-24 12:40:45 +03:00
err = f2fs_truncate ( inode , false ) ;
if ( ! err )
err = remove_inode_page ( inode ) ;
2014-09-25 22:55:53 +04:00
2015-08-24 12:40:45 +03:00
/*
* if we skip truncate_node in remove_inode_page bacause we failed
* before , it ' s better to find another way to release resource of
* this inode ( e . g . valid block count , node block or nid ) . Here we
* choose to add this inode to orphan list , so that we can call iput
* for releasing in orphan recovery flow .
*
* Note : we should add inode to orphan list before f2fs_unlock_op ( )
* so we can prevent losing this orphan when encoutering checkpoint
* and following suddenly power - off .
*/
if ( err & & err ! = - ENOENT ) {
err = acquire_orphan_inode ( sbi ) ;
if ( ! err )
add_orphan_inode ( sbi , inode - > i_ino ) ;
}
2014-09-25 22:55:53 +04:00
2015-06-23 20:36:08 +03:00
set_inode_flag ( F2FS_I ( inode ) , FI_FREE_NID ) ;
2014-09-25 22:55:53 +04:00
f2fs_unlock_op ( sbi ) ;
/* iput will drop the inode object */
iput ( inode ) ;
}