2012-11-29 08:28:09 +04:00
/*
2012-11-02 12:10:40 +04:00
* fs / f2fs / inode . c
*
* Copyright ( c ) 2012 Samsung Electronics Co . , Ltd .
* http : //www.samsung.com/
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation .
*/
# include <linux/fs.h>
# include <linux/f2fs_fs.h>
# include <linux/buffer_head.h>
# include <linux/writeback.h>
2014-04-15 10:19:38 +04:00
# include <linux/bitops.h>
2012-11-02 12:10:40 +04:00
# include "f2fs.h"
# include "node.h"
2013-04-19 20:28:40 +04:00
# include <trace/events/f2fs.h>
2012-11-02 12:10:40 +04:00
void f2fs_set_inode_flags ( struct inode * inode )
{
unsigned int flags = F2FS_I ( inode ) - > i_flags ;
2014-04-15 10:19:38 +04:00
unsigned int new_fl = 0 ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_SYNC_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_SYNC ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_APPEND_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_APPEND ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_IMMUTABLE_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_IMMUTABLE ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_NOATIME_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_NOATIME ;
2012-11-02 12:10:40 +04:00
if ( flags & FS_DIRSYNC_FL )
2014-04-15 10:19:38 +04:00
new_fl | = S_DIRSYNC ;
set_mask_bits ( & inode - > i_flags ,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC , new_fl ) ;
2012-11-02 12:10:40 +04:00
}
2013-10-08 13:01:51 +04:00
static void __get_inode_rdev ( struct inode * inode , struct f2fs_inode * ri )
{
if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) | |
S_ISFIFO ( inode - > i_mode ) | | S_ISSOCK ( inode - > i_mode ) ) {
if ( ri - > i_addr [ 0 ] )
2014-01-18 00:44:39 +04:00
inode - > i_rdev =
old_decode_dev ( le32_to_cpu ( ri - > i_addr [ 0 ] ) ) ;
2013-10-08 13:01:51 +04:00
else
2014-01-18 00:44:39 +04:00
inode - > i_rdev =
new_decode_dev ( le32_to_cpu ( ri - > i_addr [ 1 ] ) ) ;
2013-10-08 13:01:51 +04:00
}
}
static void __set_inode_rdev ( struct inode * inode , struct f2fs_inode * ri )
{
if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) ) {
if ( old_valid_dev ( inode - > i_rdev ) ) {
2014-01-18 00:44:39 +04:00
ri - > i_addr [ 0 ] =
cpu_to_le32 ( old_encode_dev ( inode - > i_rdev ) ) ;
2013-10-08 13:01:51 +04:00
ri - > i_addr [ 1 ] = 0 ;
} else {
ri - > i_addr [ 0 ] = 0 ;
2014-01-18 00:44:39 +04:00
ri - > i_addr [ 1 ] =
cpu_to_le32 ( new_encode_dev ( inode - > i_rdev ) ) ;
2013-10-08 13:01:51 +04:00
ri - > i_addr [ 2 ] = 0 ;
}
}
}
2015-01-06 09:28:43 +03:00
static void __recover_inline_status ( struct inode * inode , struct page * ipage )
2014-10-24 06:48:09 +04:00
{
void * inline_data = inline_data_addr ( ipage ) ;
2015-01-06 09:28:43 +03:00
__le32 * start = inline_data ;
__le32 * end = start + MAX_INLINE_DATA / sizeof ( __le32 ) ;
2014-10-24 06:48:09 +04:00
2015-01-06 09:28:43 +03:00
while ( start < end ) {
if ( * start + + ) {
f2fs_wait_on_page_writeback ( ipage , NODE ) ;
2014-10-24 06:48:09 +04:00
2015-01-06 09:28:43 +03:00
set_inode_flag ( F2FS_I ( inode ) , FI_DATA_EXIST ) ;
set_raw_inline ( F2FS_I ( inode ) , F2FS_INODE ( ipage ) ) ;
set_page_dirty ( ipage ) ;
return ;
}
2014-10-24 06:48:09 +04:00
}
2015-01-06 09:28:43 +03:00
return ;
2014-10-24 06:48:09 +04:00
}
2012-11-02 12:10:40 +04:00
static int do_read_inode ( struct inode * inode )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 12:10:40 +04:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct page * node_page ;
struct f2fs_inode * ri ;
/* Check if ino is within scope */
2013-03-17 12:27:20 +04:00
if ( check_nid_range ( sbi , inode - > i_ino ) ) {
f2fs_msg ( inode - > i_sb , KERN_ERR , " bad inode number: %lu " ,
( unsigned long ) inode - > i_ino ) ;
2014-06-12 09:23:41 +04:00
WARN_ON ( 1 ) ;
2013-03-17 12:27:20 +04:00
return - EINVAL ;
}
2012-11-02 12:10:40 +04:00
node_page = get_node_page ( sbi , inode - > i_ino ) ;
if ( IS_ERR ( node_page ) )
return PTR_ERR ( node_page ) ;
2013-12-26 11:30:41 +04:00
ri = F2FS_INODE ( node_page ) ;
2012-11-02 12:10:40 +04:00
inode - > i_mode = le16_to_cpu ( ri - > i_mode ) ;
i_uid_write ( inode , le32_to_cpu ( ri - > i_uid ) ) ;
i_gid_write ( inode , le32_to_cpu ( ri - > i_gid ) ) ;
set_nlink ( inode , le32_to_cpu ( ri - > i_links ) ) ;
inode - > i_size = le64_to_cpu ( ri - > i_size ) ;
inode - > i_blocks = le64_to_cpu ( ri - > i_blocks ) ;
inode - > i_atime . tv_sec = le64_to_cpu ( ri - > i_atime ) ;
inode - > i_ctime . tv_sec = le64_to_cpu ( ri - > i_ctime ) ;
inode - > i_mtime . tv_sec = le64_to_cpu ( ri - > i_mtime ) ;
inode - > i_atime . tv_nsec = le32_to_cpu ( ri - > i_atime_nsec ) ;
inode - > i_ctime . tv_nsec = le32_to_cpu ( ri - > i_ctime_nsec ) ;
inode - > i_mtime . tv_nsec = le32_to_cpu ( ri - > i_mtime_nsec ) ;
inode - > i_generation = le32_to_cpu ( ri - > i_generation ) ;
fi - > i_current_depth = le32_to_cpu ( ri - > i_current_depth ) ;
fi - > i_xattr_nid = le32_to_cpu ( ri - > i_xattr_nid ) ;
fi - > i_flags = le32_to_cpu ( ri - > i_flags ) ;
fi - > flags = 0 ;
fi - > i_advise = ri - > i_advise ;
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 12:52:48 +04:00
fi - > i_pino = le32_to_cpu ( ri - > i_pino ) ;
2014-02-27 13:20:00 +04:00
fi - > i_dir_level = ri - > i_dir_level ;
2013-10-08 13:01:51 +04:00
2015-02-05 12:46:29 +03:00
write_lock ( & fi - > ext_lock ) ;
2012-11-02 12:10:40 +04:00
get_extent_info ( & fi - > ext , ri - > i_ext ) ;
2015-02-05 12:46:29 +03:00
write_unlock ( & fi - > ext_lock ) ;
2013-08-08 10:16:22 +04:00
get_inline_info ( fi , ri ) ;
2013-10-08 13:01:51 +04:00
2014-10-24 06:48:09 +04:00
/* check data exist */
if ( f2fs_has_inline_data ( inode ) & & ! f2fs_exist_data ( inode ) )
2015-01-06 09:28:43 +03:00
__recover_inline_status ( inode , node_page ) ;
2014-10-24 06:48:09 +04:00
2013-10-08 13:01:51 +04:00
/* get rdev by using inline_info */
__get_inode_rdev ( inode , ri ) ;
2012-11-02 12:10:40 +04:00
f2fs_put_page ( node_page , 1 ) ;
2014-12-05 21:51:50 +03:00
stat_inc_inline_inode ( inode ) ;
stat_inc_inline_dir ( inode ) ;
2015-01-06 09:28:43 +03:00
return 0 ;
2012-11-02 12:10:40 +04:00
}
struct inode * f2fs_iget ( struct super_block * sb , unsigned long ino )
{
struct f2fs_sb_info * sbi = F2FS_SB ( sb ) ;
struct inode * inode ;
2013-04-19 20:28:40 +04:00
int ret = 0 ;
2012-11-02 12:10:40 +04:00
inode = iget_locked ( sb , ino ) ;
if ( ! inode )
return ERR_PTR ( - ENOMEM ) ;
2013-04-19 20:28:40 +04:00
if ( ! ( inode - > i_state & I_NEW ) ) {
trace_f2fs_iget ( inode ) ;
2012-11-02 12:10:40 +04:00
return inode ;
2013-04-19 20:28:40 +04:00
}
2012-11-02 12:10:40 +04:00
if ( ino = = F2FS_NODE_INO ( sbi ) | | ino = = F2FS_META_INO ( sbi ) )
goto make_now ;
ret = do_read_inode ( inode ) ;
if ( ret )
goto bad_inode ;
make_now :
if ( ino = = F2FS_NODE_INO ( sbi ) ) {
inode - > i_mapping - > a_ops = & f2fs_node_aops ;
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_ZERO ) ;
} else if ( ino = = F2FS_META_INO ( sbi ) ) {
inode - > i_mapping - > a_ops = & f2fs_meta_aops ;
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_ZERO ) ;
} else if ( S_ISREG ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_file_inode_operations ;
inode - > i_fop = & f2fs_file_operations ;
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
} else if ( S_ISDIR ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_dir_inode_operations ;
inode - > i_fop = & f2fs_dir_operations ;
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
2014-10-18 04:57:29 +04:00
mapping_set_gfp_mask ( inode - > i_mapping , GFP_F2FS_HIGH_ZERO ) ;
2012-11-02 12:10:40 +04:00
} else if ( S_ISLNK ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_symlink_inode_operations ;
inode - > i_mapping - > a_ops = & f2fs_dblock_aops ;
} else if ( S_ISCHR ( inode - > i_mode ) | | S_ISBLK ( inode - > i_mode ) | |
S_ISFIFO ( inode - > i_mode ) | | S_ISSOCK ( inode - > i_mode ) ) {
inode - > i_op = & f2fs_special_inode_operations ;
init_special_inode ( inode , inode - > i_mode , inode - > i_rdev ) ;
} else {
ret = - EIO ;
goto bad_inode ;
}
unlock_new_inode ( inode ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_iget ( inode ) ;
2012-11-02 12:10:40 +04:00
return inode ;
bad_inode :
iget_failed ( inode ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_iget_exit ( inode , ret ) ;
2012-11-02 12:10:40 +04:00
return ERR_PTR ( ret ) ;
}
void update_inode ( struct inode * inode , struct page * node_page )
{
struct f2fs_inode * ri ;
2014-01-10 11:26:14 +04:00
f2fs_wait_on_page_writeback ( node_page , NODE ) ;
2012-11-02 12:10:40 +04:00
2013-12-26 11:30:41 +04:00
ri = F2FS_INODE ( node_page ) ;
2012-11-02 12:10:40 +04:00
ri - > i_mode = cpu_to_le16 ( inode - > i_mode ) ;
ri - > i_advise = F2FS_I ( inode ) - > i_advise ;
ri - > i_uid = cpu_to_le32 ( i_uid_read ( inode ) ) ;
ri - > i_gid = cpu_to_le32 ( i_gid_read ( inode ) ) ;
ri - > i_links = cpu_to_le32 ( inode - > i_nlink ) ;
ri - > i_size = cpu_to_le64 ( i_size_read ( inode ) ) ;
ri - > i_blocks = cpu_to_le64 ( inode - > i_blocks ) ;
2015-02-05 12:46:29 +03:00
read_lock ( & F2FS_I ( inode ) - > ext_lock ) ;
2012-11-02 12:10:40 +04:00
set_raw_extent ( & F2FS_I ( inode ) - > ext , & ri - > i_ext ) ;
2015-02-05 12:46:29 +03:00
read_unlock ( & F2FS_I ( inode ) - > ext_lock ) ;
2013-08-08 10:16:22 +04:00
set_raw_inline ( F2FS_I ( inode ) , ri ) ;
2012-11-02 12:10:40 +04:00
ri - > i_atime = cpu_to_le64 ( inode - > i_atime . tv_sec ) ;
ri - > i_ctime = cpu_to_le64 ( inode - > i_ctime . tv_sec ) ;
ri - > i_mtime = cpu_to_le64 ( inode - > i_mtime . tv_sec ) ;
ri - > i_atime_nsec = cpu_to_le32 ( inode - > i_atime . tv_nsec ) ;
ri - > i_ctime_nsec = cpu_to_le32 ( inode - > i_ctime . tv_nsec ) ;
ri - > i_mtime_nsec = cpu_to_le32 ( inode - > i_mtime . tv_nsec ) ;
ri - > i_current_depth = cpu_to_le32 ( F2FS_I ( inode ) - > i_current_depth ) ;
ri - > i_xattr_nid = cpu_to_le32 ( F2FS_I ( inode ) - > i_xattr_nid ) ;
ri - > i_flags = cpu_to_le32 ( F2FS_I ( inode ) - > i_flags ) ;
f2fs: fix tracking parent inode number
Previously, f2fs didn't track the parent inode number correctly which is stored
in each f2fs_inode. In the case of the following scenario, a bug can be occured.
Let's suppose there are one directory, "/b", and two files, "/a" and "/b/a".
- pino of "/a" is ROOT_INO.
- pino of "/b/a" is DIR_B_INO.
Then,
# sync
: The inode pages of "/a" and "/b/a" contain the parent inode numbers as
ROOT_INO and DIR_B_INO respectively.
# mv /a /b/a
: The parent inode number of "/a" should be changed to DIR_B_INO, but f2fs
didn't do that. Ref. f2fs_set_link().
In order to fix this clearly, I added i_pino in f2fs_inode_info, and whenever
it needs to be changed like in f2fs_add_link() and f2fs_set_link(), it is
updated temporarily in f2fs_inode_info.
And later, f2fs_write_inode() stores the latest information to the inode pages.
For power-off-recovery, f2fs_sync_file() triggers simply f2fs_write_inode().
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-10 12:52:48 +04:00
ri - > i_pino = cpu_to_le32 ( F2FS_I ( inode ) - > i_pino ) ;
2012-11-02 12:10:40 +04:00
ri - > i_generation = cpu_to_le32 ( inode - > i_generation ) ;
2014-02-27 13:20:00 +04:00
ri - > i_dir_level = F2FS_I ( inode ) - > i_dir_level ;
f2fs: save device node number into f2fs_inode
This patch stores inode->i_rdev into on-disk inode structure.
Alun reported that:
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # mknod mnt/sda1 b 8 1
aspire tmp # mknod mnt/null c 1 3
aspire tmp # mknod mnt/console c 5 1
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 5, 1 Jan 22 18:44 console
crw-r--r-- 1 root root 1, 3 Jan 22 18:44 null
brw-r--r-- 1 root root 8, 1 Jan 22 18:44 sda1
aspire tmp # umount mnt
aspire tmp # mount -t f2fs /dev/sdb mnt
aspire tmp # ls -l mnt
total 2
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 console
crw-r--r-- 1 root root 0, 0 Jan 22 18:44 null
brw-r--r-- 1 root root 0, 0 Jan 22 18:44 sda1
In this report, f2fs lost the major/minor numbers of device files after umount.
The reason was revealed that f2fs does not store the inode->i_rdev to the
on-disk inode data structure.
So, as the other file systems do, f2fs also stores i_rdev into the i_addr fields
in on-disk inode structure without any on-disk layout changes.
Note that, this bug is limited to device files made by mknod().
Reported-and-Tested-by: Alun Jones <alun.linux@ty-penguin.org.uk>
Signed-off-by: Changman Lee <cm224.lee@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-01-23 04:40:23 +04:00
2013-10-08 13:01:51 +04:00
__set_inode_rdev ( inode , ri ) ;
f2fs: fix handling errors got by f2fs_write_inode
Ruslan reported that f2fs hangs with an infinite loop in f2fs_sync_file():
while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
f2fs_write_inode(inode, NULL);
The reason was revealed that the cold flag is not set even thought this inode is
a normal file. Therefore, sync_node_pages() skips to write node blocks since it
only writes cold node blocks.
The cold flag is stored to the node_footer in node block, and whenever a new
node page is allocated, it is set according to its file type, file or directory.
But, after sudden-power-off, when recovering the inode page, f2fs doesn't recover
its cold flag.
So, let's assign the cold flag in more right places.
One more thing:
If f2fs_write_inode() returns an error due to whatever situations, there would
be no dirty node pages so that sync_node_pages() returns zero.
(i.e., zero means nothing was written.)
Reported-by: Ruslan N. Marchenko <me@ruff.mobi>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-12-19 10:28:39 +04:00
set_cold_node ( inode , node_page ) ;
2012-11-02 12:10:40 +04:00
set_page_dirty ( node_page ) ;
2013-10-08 13:01:51 +04:00
2013-06-10 04:17:01 +04:00
clear_inode_flag ( F2FS_I ( inode ) , FI_DIRTY_INODE ) ;
2012-11-02 12:10:40 +04:00
}
2014-01-24 04:42:16 +04:00
void update_inode_page ( struct inode * inode )
2012-11-02 12:10:40 +04:00
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 12:10:40 +04:00
struct page * node_page ;
2014-01-24 04:42:16 +04:00
retry :
2012-11-02 12:10:40 +04:00
node_page = get_node_page ( sbi , inode - > i_ino ) ;
2014-01-24 04:42:16 +04:00
if ( IS_ERR ( node_page ) ) {
int err = PTR_ERR ( node_page ) ;
if ( err = = - ENOMEM ) {
cond_resched ( ) ;
goto retry ;
} else if ( err ! = - ENOENT ) {
f2fs_stop_checkpoint ( sbi ) ;
}
return ;
}
2012-11-02 12:10:40 +04:00
update_inode ( inode , node_page ) ;
f2fs_put_page ( node_page , 1 ) ;
}
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose performance in a multi-threaded environment,
since every type of operation must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possible.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an available lock from the array.
- returns the index of the lock variable that was obtained.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
int f2fs_write_inode ( struct inode * inode , struct writeback_control * wbc )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
if ( inode - > i_ino = = F2FS_NODE_INO ( sbi ) | |
inode - > i_ino = = F2FS_META_INO ( sbi ) )
return 0 ;
2013-06-10 04:17:01 +04:00
if ( ! is_inode_flag_set ( F2FS_I ( inode ) , FI_DIRTY_INODE ) )
return 0 ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
/*
* We need to lock here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections .
*/
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 14:08:30 +04:00
f2fs_lock_op ( sbi ) ;
2014-01-24 04:42:16 +04:00
update_inode_page ( inode ) ;
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 14:08:30 +04:00
f2fs_unlock_op ( sbi ) ;
2013-08-15 15:17:01 +04:00
if ( wbc )
f2fs_balance_fs ( sbi ) ;
2014-01-24 04:42:16 +04:00
return 0 ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
}
2012-11-29 08:28:09 +04:00
/*
2012-11-02 12:10:40 +04:00
* Called at the last iput ( ) if i_nlink is zero
*/
void f2fs_evict_inode ( struct inode * inode )
{
2014-09-03 02:31:18 +04:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2014-08-04 05:54:58 +04:00
nid_t xnid = F2FS_I ( inode ) - > i_xattr_nid ;
2012-11-02 12:10:40 +04:00
2014-10-07 04:39:50 +04:00
/* some remained atomic pages should discarded */
2014-12-09 17:08:59 +03:00
if ( f2fs_is_atomic_file ( inode ) )
2014-10-07 04:39:50 +04:00
commit_inmem_pages ( inode , true ) ;
2013-04-19 20:28:40 +04:00
trace_f2fs_evict_inode ( inode ) ;
2014-04-04 01:47:49 +04:00
truncate_inode_pages_final ( & inode - > i_data ) ;
2012-11-02 12:10:40 +04:00
if ( inode - > i_ino = = F2FS_NODE_INO ( sbi ) | |
inode - > i_ino = = F2FS_META_INO ( sbi ) )
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formely
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reservation, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skipping needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 08:00:57 +04:00
goto out_clear ;
2012-11-02 12:10:40 +04:00
2014-09-13 02:53:45 +04:00
f2fs_bug_on ( sbi , get_dirty_pages ( inode ) ) ;
2012-11-02 12:10:40 +04:00
remove_dirty_dir_inode ( inode ) ;
if ( inode - > i_nlink | | is_bad_inode ( inode ) )
goto no_delete ;
2013-01-29 13:30:07 +04:00
sb_start_intwrite ( inode - > i_sb ) ;
2012-11-02 12:10:40 +04:00
set_inode_flag ( F2FS_I ( inode ) , FI_NO_ALLOC ) ;
i_size_write ( inode , 0 ) ;
if ( F2FS_HAS_BLOCKS ( inode ) )
f2fs_truncate ( inode ) ;
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 14:08:30 +04:00
f2fs_lock_op ( sbi ) ;
2012-11-02 12:10:40 +04:00
remove_inode_page ( inode ) ;
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 14:08:30 +04:00
f2fs_unlock_op ( sbi ) ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every type of operation must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possible.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an available lock from the array.
- returns the index of the acquired lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 11:21:29 +04:00
2013-01-29 13:30:07 +04:00
sb_end_intwrite ( inode - > i_sb ) ;
2012-11-02 12:10:40 +04:00
no_delete :
2014-10-14 07:00:16 +04:00
stat_dec_inline_dir ( inode ) ;
2014-10-14 21:29:50 +04:00
stat_dec_inline_inode ( inode ) ;
f2fs: enable rb-tree extent cache
This patch enables rb-tree based extent cache in f2fs.
When we mount with "-o extent_cache", f2fs will try to add recently accessed
page-block mappings into rb-tree based extent cache as much as possible, instead
of original one extent info cache.
In this way, f2fs can support a more effective cache between the dnode page cache and
the disk. It provides a high hit ratio in the cache with less memory when the dnode
page cache is reclaimed in a low-memory environment.
Storage: Sandisk sd card 64g
1.append write file (offset: 0, size: 128M);
2.override write file (offset: 2M, size: 1M);
3.override write file (offset: 4M, size: 1M);
...
4.override write file (offset: 48M, size: 1M);
...
5.override write file (offset: 112M, size: 1M);
6.sync
7.echo 3 > /proc/sys/vm/drop_caches
8.read file (size:128M, unit: 4k, count: 32768)
(time dd if=/mnt/f2fs/128m bs=4k count=32768)
Extent Hit Ratio:
before patched
Hit Ratio 121 / 1071 1071 / 1071
Performance:
before patched
real 0m37.051s 0m35.556s
user 0m0.040s 0m0.026s
sys 0m2.990s 0m2.251s
Memory Cost:
before patched
Tree Count: 0 1 (size: 24 bytes)
Node Count: 0 45 (size: 1440 bytes)
v3:
o retest and give more details of the test results.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-02-05 12:57:31 +03:00
f2fs_destroy_extent_tree ( inode ) ;
2014-04-30 10:04:39 +04:00
invalidate_mapping_pages ( NODE_MAPPING ( sbi ) , inode - > i_ino , inode - > i_ino ) ;
2014-08-04 05:54:58 +04:00
if ( xnid )
invalidate_mapping_pages ( NODE_MAPPING ( sbi ) , xnid , xnid ) ;
2014-07-25 18:40:59 +04:00
if ( is_inode_flag_set ( F2FS_I ( inode ) , FI_APPEND_WRITE ) )
add_dirty_inode ( sbi , inode - > i_ino , APPEND_INO ) ;
if ( is_inode_flag_set ( F2FS_I ( inode ) , FI_UPDATE_WRITE ) )
add_dirty_inode ( sbi , inode - > i_ino , UPDATE_INO ) ;
f2fs: avoid use invalid mapping of node_inode when evict meta inode
Andrey Tsyvarev reported:
"Using memory error detector reveals the following use-after-free error
in 3.15.0:
AddressSanitizer: heap-use-after-free in f2fs_evict_inode
Read of size 8 by thread T22279:
[<ffffffffa02d8702>] f2fs_evict_inode+0x102/0x2e0 [f2fs]
[<ffffffff812359af>] evict+0x15f/0x290
[< inlined >] iput+0x196/0x280 iput_final
[<ffffffff812369a6>] iput+0x196/0x280
[<ffffffffa02dc416>] f2fs_put_super+0xd6/0x170 [f2fs]
[<ffffffff81210095>] generic_shutdown_super+0xc5/0x1b0
[<ffffffff812105fd>] kill_block_super+0x4d/0xb0
[<ffffffff81210a86>] deactivate_locked_super+0x66/0x80
[<ffffffff81211c98>] deactivate_super+0x68/0x80
[<ffffffff8123cc88>] mntput_no_expire+0x198/0x250
[< inlined >] SyS_umount+0xe9/0x1a0 SYSC_umount
[<ffffffff8123f1c9>] SyS_umount+0xe9/0x1a0
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
Freed by thread T3:
[<ffffffffa02dc337>] f2fs_i_callback+0x27/0x30 [f2fs]
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_reclaim
[< inlined >] rcu_process_callbacks+0x2d6/0x930 rcu_do_batch
[< inlined >] rcu_process_callbacks+0x2d6/0x930 invoke_rcu_callbacks
[< inlined >] rcu_process_callbacks+0x2d6/0x930 __rcu_process_callbacks
[<ffffffff810fd266>] rcu_process_callbacks+0x2d6/0x930
[<ffffffff8107cce2>] __do_softirq+0x142/0x380
[<ffffffff8107cf50>] run_ksoftirqd+0x30/0x50
[<ffffffff810b2a87>] smpboot_thread_fn+0x197/0x280
[<ffffffff810a8238>] kthread+0x148/0x160
[<ffffffff81cc8d4c>] ret_from_fork+0x7c/0xb0
Allocated by thread T22276:
[<ffffffffa02dc7dd>] f2fs_alloc_inode+0x2d/0x170 [f2fs]
[<ffffffff81235e2a>] iget_locked+0x10a/0x230
[<ffffffffa02d7495>] f2fs_iget+0x35/0xa80 [f2fs]
[<ffffffffa02e2393>] f2fs_fill_super+0xb53/0xff0 [f2fs]
[<ffffffff81211bce>] mount_bdev+0x1de/0x240
[<ffffffffa02dbce0>] f2fs_mount+0x10/0x20 [f2fs]
[<ffffffff81212a85>] mount_fs+0x55/0x220
[<ffffffff8123c026>] vfs_kern_mount+0x66/0x200
[< inlined >] do_mount+0x2b4/0x1120 do_new_mount
[<ffffffff812400d4>] do_mount+0x2b4/0x1120
[< inlined >] SyS_mount+0xb2/0x110 SYSC_mount
[<ffffffff812414a2>] SyS_mount+0xb2/0x110
[<ffffffff81cc8df9>] system_call_fastpath+0x16/0x1b
The buggy address ffff8800587866c8 is located 48 bytes inside
of 680-byte region [ffff880058786698, ffff880058786940)
Memory state around the buggy address:
ffff880058786100: ffffffff ffffffff ffffffff ffffffff
ffff880058786200: ffffffff ffffffff ffffffrr rrrrrrrr
ffff880058786300: rrrrrrrr rrffffff ffffffff ffffffff
ffff880058786400: ffffffff ffffffff ffffffff ffffffff
ffff880058786500: ffffffff ffffffff ffffffff fffffffr
>ffff880058786600: rrrrrrrr rrrrrrrr rrrfffff ffffffff
^
ffff880058786700: ffffffff ffffffff ffffffff ffffffff
ffff880058786800: ffffffff ffffffff ffffffff ffffffff
ffff880058786900: ffffffff rrrrrrrr rrrrrrrr rrrr....
ffff880058786a00: ........ ........ ........ ........
ffff880058786b00: ........ ........ ........ ........
Legend:
f - 8 freed bytes
r - 8 redzone bytes
. - 8 allocated bytes
x=1..7 - x allocated bytes + (8-x) redzone bytes
Investigation shows, that f2fs_evict_inode, when called for
'meta_inode', uses invalidate_mapping_pages() for 'node_inode'.
But 'node_inode' is deleted before 'meta_inode' in f2fs_put_super via
iput().
It seems that in common usage scenario this use-after-free is benign,
because 'node_inode' remains partially valid data even after
kmem_cache_free().
But things may change if, while 'meta_inode' is evicted in one f2fs
filesystem, another (mounted) f2fs filesystem requests inode from cache,
and formerly
'node_inode' of the first filesystem is returned."
Nids for both meta_inode and node_inode are reserved, so it's not necessary
for us to invalidate pages which will never be allocated.
To fix this issue, let's skip needlessly invalidating pages for
{meta,node}_inode in f2fs_evict_inode.
Reported-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Tested-by: Andrey Tsyvarev <tsyvarev@ispras.ru>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-07-25 08:00:57 +04:00
out_clear :
clear_inode ( inode ) ;
2012-11-02 12:10:40 +04:00
}
2014-09-25 22:55:53 +04:00
/*
 * Clean up @inode after a failure and release it.
 *
 * NOTE(review): presumably used when creating a new inode fails part-way;
 * the inode is still unlocked via unlock_new_inode() and its nid is handed
 * to alloc_nid_failed() -- confirm against the callers.
 *
 * caller should call f2fs_lock_op() beforehand; this function calls
 * f2fs_unlock_op() itself before dropping the inode with iput().
 */
void handle_failed_inode ( struct inode * inode )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
/* zero the link count and mark the inode bad before unlocking it */
clear_nlink ( inode ) ;
make_bad_inode ( inode ) ;
unlock_new_inode ( inode ) ;
/* reset the size to 0 and, if the inode has blocks, truncate it */
i_size_write ( inode , 0 ) ;
if ( F2FS_HAS_BLOCKS ( inode ) )
f2fs_truncate ( inode ) ;
remove_inode_page ( inode ) ;
2014-10-24 06:48:09 +04:00
/*
 * Drop the inline data/dentry flags.
 * NOTE(review): presumably so the later stat_dec_inline_*() accounting
 * in the evict path does not count this inode -- confirm.
 */
clear_inode_flag ( F2FS_I ( inode ) , FI_INLINE_DATA ) ;
2014-10-14 07:00:16 +04:00
clear_inode_flag ( F2FS_I ( inode ) , FI_INLINE_DENTRY ) ;
2014-09-25 22:55:53 +04:00
/* report the inode's nid as a failed allocation, then release the op lock */
alloc_nid_failed ( sbi , inode - > i_ino ) ;
f2fs_unlock_op ( sbi ) ;
/* iput will drop the inode object */
iput ( inode ) ;
}