2012-11-29 08:28:09 +04:00
/*
2012-11-02 12:13:32 +04:00
 * fs/f2fs/recovery.c
 *
 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
 *             http://www.samsung.com/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
*/
# include <linux/fs.h>
# include <linux/f2fs_fs.h>
# include "f2fs.h"
# include "node.h"
# include "segment.h"
2014-09-16 03:46:08 +04:00
/*
 * Roll forward recovery scenarios.
 *
 * [Term] F: fsync_mark, D: dentry_mark
 *
 * 1. inode(x) | CP | inode(x) | dnode(F)
 * -> Update the latest inode(x).
 *
 * 2. inode(x) | CP | inode(F) | dnode(F)
 * -> No problem.
 *
 * 3. inode(x) | CP | dnode(F) | inode(x)
 * -> Recover to the latest dnode(F), and drop the last inode(x).
 *
 * 4. inode(x) | CP | dnode(F) | inode(F)
 * -> No problem.
 *
 * 5. CP | inode(x) | dnode(F)
 * -> The inode(DF) was missing. Should drop this dnode(F).
 *
 * 6. CP | inode(DF) | dnode(F)
 * -> No problem.
 *
 * 7. CP | dnode(F) | inode(DF)
 * -> If f2fs_iget fails, then goto next to find inode(DF).
 *
 * 8. CP | dnode(F) | inode(x)
 * -> If f2fs_iget fails, then goto next to find inode(DF).
 *    But it will fail due to no inode(DF).
*/
2012-11-02 12:13:32 +04:00
/*
 * Slab cache for struct fsync_inode_entry objects. It is created at the start
 * of recover_fsync_data() and destroyed before that function returns.
 */
static struct kmem_cache * fsync_entry_slab ;
bool space_for_roll_forward ( struct f2fs_sb_info * sbi )
{
if ( sbi - > last_valid_block_count + sbi - > alloc_valid_block_count
> sbi - > user_block_count )
return false ;
return true ;
}
/*
 * Look up the fsync entry whose inode number equals @ino.
 *
 * @head: list of struct fsync_inode_entry, linked through ->list
 * @ino:  inode number to search for
 *
 * Returns the first matching entry, or NULL if the list holds none.
 */
static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
								nid_t ino)
{
	struct fsync_inode_entry *cur;

	list_for_each_entry(cur, head, list) {
		if (cur->inode->i_ino != ino)
			continue;
		return cur;
	}

	return NULL;
}
2014-09-12 01:29:06 +04:00
static int recover_dentry ( struct inode * inode , struct page * ipage )
2012-11-02 12:13:32 +04:00
{
2013-12-26 11:30:41 +04:00
struct f2fs_inode * raw_inode = F2FS_INODE ( ipage ) ;
2013-05-15 11:40:02 +04:00
nid_t pino = le32_to_cpu ( raw_inode - > i_pino ) ;
2013-05-28 04:19:22 +04:00
struct f2fs_dir_entry * de ;
2013-01-26 01:15:43 +04:00
struct qstr name ;
2012-11-02 12:13:32 +04:00
struct page * page ;
2013-05-28 04:19:22 +04:00
struct inode * dir , * einode ;
2012-11-02 12:13:32 +04:00
int err = 0 ;
2014-04-15 06:19:28 +04:00
dir = f2fs_iget ( inode - > i_sb , pino ) ;
if ( IS_ERR ( dir ) ) {
err = PTR_ERR ( dir ) ;
goto out ;
}
2013-01-26 01:15:43 +04:00
name . len = le32_to_cpu ( raw_inode - > i_namelen ) ;
name . name = raw_inode - > i_name ;
2013-12-23 07:12:21 +04:00
if ( unlikely ( name . len > F2FS_NAME_LEN ) ) {
WARN_ON ( 1 ) ;
err = - ENAMETOOLONG ;
2014-06-06 22:05:03 +04:00
goto out_err ;
2013-12-23 07:12:21 +04:00
}
2013-05-28 04:19:22 +04:00
retry :
de = f2fs_find_entry ( dir , & name , & page ) ;
2014-08-08 04:06:18 +04:00
if ( de & & inode - > i_ino = = le32_to_cpu ( de - > ino ) ) {
clear_inode_flag ( F2FS_I ( inode ) , FI_INC_LINK ) ;
2013-09-24 18:40:57 +04:00
goto out_unmap_put ;
2014-08-08 04:06:18 +04:00
}
2013-05-28 04:19:22 +04:00
if ( de ) {
einode = f2fs_iget ( inode - > i_sb , le32_to_cpu ( de - > ino ) ) ;
if ( IS_ERR ( einode ) ) {
WARN_ON ( 1 ) ;
2014-04-28 13:58:34 +04:00
err = PTR_ERR ( einode ) ;
if ( err = = - ENOENT )
2013-05-28 04:19:22 +04:00
err = - EEXIST ;
2013-09-24 18:40:57 +04:00
goto out_unmap_put ;
}
2014-09-03 02:31:18 +04:00
err = acquire_orphan_inode ( F2FS_I_SB ( inode ) ) ;
2013-09-24 18:40:57 +04:00
if ( err ) {
iput ( einode ) ;
goto out_unmap_put ;
2013-05-28 04:19:22 +04:00
}
2014-09-24 14:17:04 +04:00
f2fs_delete_entry ( de , page , dir , einode ) ;
2013-05-28 04:19:22 +04:00
iput ( einode ) ;
goto retry ;
2012-11-02 12:13:32 +04:00
}
2013-05-28 04:19:22 +04:00
err = __f2fs_add_link ( dir , & name , inode ) ;
2014-06-06 22:05:03 +04:00
if ( err )
goto out_err ;
if ( is_inode_flag_set ( F2FS_I ( dir ) , FI_DELAY_IPUT ) ) {
iput ( dir ) ;
} else {
add_dirty_dir_inode ( dir ) ;
set_inode_flag ( F2FS_I ( dir ) , FI_DELAY_IPUT ) ;
}
2013-09-24 18:40:57 +04:00
goto out ;
out_unmap_put :
2014-11-22 03:36:28 +03:00
f2fs_dentry_kunmap ( dir , page ) ;
2013-09-24 18:40:57 +04:00
f2fs_put_page ( page , 0 ) ;
2014-06-06 22:05:03 +04:00
out_err :
iput ( dir ) ;
2012-11-02 12:13:32 +04:00
out :
2014-01-18 00:44:39 +04:00
f2fs_msg ( inode - > i_sb , KERN_NOTICE ,
" %s: ino = %x, name = %s, dir = %lx, err = %d " ,
__func__ , ino_of_node ( ipage ) , raw_inode - > i_name ,
2013-05-23 14:02:13 +04:00
IS_ERR ( dir ) ? 0 : dir - > i_ino , err ) ;
2012-11-02 12:13:32 +04:00
return err ;
}
2014-09-12 01:29:06 +04:00
static void recover_inode ( struct inode * inode , struct page * page )
2012-11-02 12:13:32 +04:00
{
2014-09-16 03:46:08 +04:00
struct f2fs_inode * raw = F2FS_INODE ( page ) ;
inode - > i_mode = le16_to_cpu ( raw - > i_mode ) ;
i_size_write ( inode , le64_to_cpu ( raw - > i_size ) ) ;
inode - > i_atime . tv_sec = le64_to_cpu ( raw - > i_mtime ) ;
inode - > i_ctime . tv_sec = le64_to_cpu ( raw - > i_ctime ) ;
inode - > i_mtime . tv_sec = le64_to_cpu ( raw - > i_mtime ) ;
inode - > i_atime . tv_nsec = le32_to_cpu ( raw - > i_mtime_nsec ) ;
inode - > i_ctime . tv_nsec = le32_to_cpu ( raw - > i_ctime_nsec ) ;
inode - > i_mtime . tv_nsec = le32_to_cpu ( raw - > i_mtime_nsec ) ;
2013-05-16 10:04:49 +04:00
f2fs_msg ( inode - > i_sb , KERN_NOTICE , " recover_inode: ino = %x, name = %s " ,
2014-09-12 01:29:06 +04:00
ino_of_node ( page ) , F2FS_INODE ( page ) - > i_name ) ;
2012-11-02 12:13:32 +04:00
}
static int find_fsync_dnodes ( struct f2fs_sb_info * sbi , struct list_head * head )
{
2013-08-09 10:03:21 +04:00
unsigned long long cp_ver = cur_cp_version ( F2FS_CKPT ( sbi ) ) ;
2012-11-02 12:13:32 +04:00
struct curseg_info * curseg ;
2014-09-12 00:49:55 +04:00
struct page * page = NULL ;
2012-11-02 12:13:32 +04:00
block_t blkaddr ;
int err = 0 ;
/* get node pages in the current segment */
curseg = CURSEG_I ( sbi , CURSEG_WARM_NODE ) ;
2014-02-27 15:52:21 +04:00
blkaddr = NEXT_FREE_BLKADDR ( sbi , curseg ) ;
2012-11-02 12:13:32 +04:00
2014-12-08 10:02:52 +03:00
ra_meta_pages ( sbi , blkaddr , 1 , META_POR ) ;
2012-11-02 12:13:32 +04:00
while ( 1 ) {
struct fsync_inode_entry * entry ;
2014-09-23 22:23:01 +04:00
if ( blkaddr < MAIN_BLKADDR ( sbi ) | | blkaddr > = MAX_BLKADDR ( sbi ) )
2014-09-12 00:49:55 +04:00
return 0 ;
2012-11-02 12:13:32 +04:00
2014-12-08 10:02:52 +03:00
page = get_meta_page ( sbi , blkaddr ) ;
2013-03-08 16:29:23 +04:00
2013-03-20 14:01:06 +04:00
if ( cp_ver ! = cpver_of_node ( page ) )
2013-05-16 10:04:49 +04:00
break ;
2012-11-02 12:13:32 +04:00
if ( ! is_fsync_dnode ( page ) )
goto next ;
entry = get_fsync_inode ( head , ino_of_node ( page ) ) ;
if ( entry ) {
if ( IS_INODE ( page ) & & is_dent_dnode ( page ) )
set_inode_flag ( F2FS_I ( entry - > inode ) ,
FI_INC_LINK ) ;
} else {
if ( IS_INODE ( page ) & & is_dent_dnode ( page ) ) {
2013-03-20 14:01:06 +04:00
err = recover_inode_page ( sbi , page ) ;
if ( err )
2013-05-16 10:04:49 +04:00
break ;
2012-11-02 12:13:32 +04:00
}
/* add this fsync inode to the list */
2014-09-12 01:29:06 +04:00
entry = kmem_cache_alloc ( fsync_entry_slab , GFP_F2FS_ZERO ) ;
2012-11-02 12:13:32 +04:00
if ( ! entry ) {
err = - ENOMEM ;
2013-05-16 10:04:49 +04:00
break ;
2012-11-02 12:13:32 +04:00
}
2014-09-16 03:46:08 +04:00
/*
* CP | dnode ( F ) | inode ( DF )
* For this case , we should not give up now .
*/
2012-11-02 12:13:32 +04:00
entry - > inode = f2fs_iget ( sbi - > sb , ino_of_node ( page ) ) ;
if ( IS_ERR ( entry - > inode ) ) {
err = PTR_ERR ( entry - > inode ) ;
2012-12-22 07:10:12 +04:00
kmem_cache_free ( fsync_entry_slab , entry ) ;
2014-09-16 03:46:08 +04:00
if ( err = = - ENOENT )
goto next ;
2013-05-16 10:04:49 +04:00
break ;
2012-11-02 12:13:32 +04:00
}
2012-12-22 07:10:12 +04:00
list_add_tail ( & entry - > list , head ) ;
2012-11-02 12:13:32 +04:00
}
2013-05-15 05:49:13 +04:00
entry - > blkaddr = blkaddr ;
2014-09-12 01:29:06 +04:00
if ( IS_INODE ( page ) ) {
entry - > last_inode = blkaddr ;
if ( is_dent_dnode ( page ) )
entry - > last_dentry = blkaddr ;
}
2012-11-02 12:13:32 +04:00
next :
/* check next segment */
blkaddr = next_blkaddr_of_node ( page ) ;
2014-09-12 00:49:55 +04:00
f2fs_put_page ( page , 1 ) ;
2014-12-08 10:02:52 +03:00
ra_meta_pages_cond ( sbi , blkaddr ) ;
2012-11-02 12:13:32 +04:00
}
2014-09-12 00:49:55 +04:00
f2fs_put_page ( page , 1 ) ;
2012-11-02 12:13:32 +04:00
return err ;
}
2013-06-27 05:28:54 +04:00
static void destroy_fsync_dnodes ( struct list_head * head )
2012-11-02 12:13:32 +04:00
{
2013-01-20 19:02:58 +04:00
struct fsync_inode_entry * entry , * tmp ;
list_for_each_entry_safe ( entry , tmp , head , list ) {
2012-11-02 12:13:32 +04:00
iput ( entry - > inode ) ;
list_del ( & entry - > list ) ;
kmem_cache_free ( fsync_entry_slab , entry ) ;
}
}
2013-05-22 03:20:01 +04:00
static int check_index_in_prev_nodes ( struct f2fs_sb_info * sbi ,
2013-05-22 03:02:02 +04:00
block_t blkaddr , struct dnode_of_data * dn )
2012-11-02 12:13:32 +04:00
{
struct seg_entry * sentry ;
unsigned int segno = GET_SEGNO ( sbi , blkaddr ) ;
2014-02-04 08:01:10 +04:00
unsigned short blkoff = GET_BLKOFF_FROM_SEG0 ( sbi , blkaddr ) ;
2014-01-28 09:54:07 +04:00
struct f2fs_summary_block * sum_node ;
2012-11-02 12:13:32 +04:00
struct f2fs_summary sum ;
2014-01-28 09:54:07 +04:00
struct page * sum_page , * node_page ;
2013-05-22 03:02:02 +04:00
nid_t ino , nid ;
2012-11-02 12:13:32 +04:00
struct inode * inode ;
2013-08-12 16:08:03 +04:00
unsigned int offset ;
2012-11-02 12:13:32 +04:00
block_t bidx ;
int i ;
sentry = get_seg_entry ( sbi , segno ) ;
if ( ! f2fs_test_bit ( blkoff , sentry - > cur_valid_map ) )
2013-05-22 03:20:01 +04:00
return 0 ;
2012-11-02 12:13:32 +04:00
/* Get the previous summary */
for ( i = CURSEG_WARM_DATA ; i < = CURSEG_COLD_DATA ; i + + ) {
struct curseg_info * curseg = CURSEG_I ( sbi , i ) ;
if ( curseg - > segno = = segno ) {
sum = curseg - > sum_blk - > entries [ blkoff ] ;
2014-01-28 09:54:07 +04:00
goto got_it ;
2012-11-02 12:13:32 +04:00
}
}
2014-01-28 09:54:07 +04:00
sum_page = get_sum_page ( sbi , segno ) ;
sum_node = ( struct f2fs_summary_block * ) page_address ( sum_page ) ;
sum = sum_node - > entries [ blkoff ] ;
f2fs_put_page ( sum_page , 1 ) ;
got_it :
2013-05-22 03:02:02 +04:00
/* Use the locked dnode page and inode */
nid = le32_to_cpu ( sum . nid ) ;
if ( dn - > inode - > i_ino = = nid ) {
struct dnode_of_data tdn = * dn ;
tdn . nid = nid ;
tdn . node_page = dn - > inode_page ;
2013-06-24 02:47:23 +04:00
tdn . ofs_in_node = le16_to_cpu ( sum . ofs_in_node ) ;
2013-05-22 03:02:02 +04:00
truncate_data_blocks_range ( & tdn , 1 ) ;
2013-05-22 03:20:01 +04:00
return 0 ;
2013-05-22 03:02:02 +04:00
} else if ( dn - > nid = = nid ) {
struct dnode_of_data tdn = * dn ;
2013-06-24 02:47:23 +04:00
tdn . ofs_in_node = le16_to_cpu ( sum . ofs_in_node ) ;
2013-05-22 03:02:02 +04:00
truncate_data_blocks_range ( & tdn , 1 ) ;
2013-05-22 03:20:01 +04:00
return 0 ;
2013-05-22 03:02:02 +04:00
}
2012-11-02 12:13:32 +04:00
/* Get the node page */
2013-05-22 03:02:02 +04:00
node_page = get_node_page ( sbi , nid ) ;
2013-05-22 03:20:01 +04:00
if ( IS_ERR ( node_page ) )
return PTR_ERR ( node_page ) ;
2013-08-12 16:08:03 +04:00
offset = ofs_of_node ( node_page ) ;
2012-11-02 12:13:32 +04:00
ino = ino_of_node ( node_page ) ;
f2fs_put_page ( node_page , 1 ) ;
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 19:35:58 +04:00
if ( ino ! = dn - > inode - > i_ino ) {
/* Deallocate previous index in the node page */
inode = f2fs_iget ( sbi - > sb , ino ) ;
if ( IS_ERR ( inode ) )
return PTR_ERR ( inode ) ;
} else {
inode = dn - > inode ;
}
2012-12-22 07:09:43 +04:00
2013-08-12 16:08:03 +04:00
bidx = start_bidx_of_node ( offset , F2FS_I ( inode ) ) +
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 19:35:58 +04:00
le16_to_cpu ( sum . ofs_in_node ) ;
2013-08-12 16:08:03 +04:00
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 19:35:58 +04:00
if ( ino ! = dn - > inode - > i_ino ) {
truncate_hole ( inode , bidx , bidx + 1 ) ;
iput ( inode ) ;
} else {
struct dnode_of_data tdn ;
set_new_dnode ( & tdn , inode , dn - > inode_page , NULL , 0 ) ;
if ( get_dnode_of_data ( & tdn , bidx , LOOKUP_NODE ) )
return 0 ;
if ( tdn . data_blkaddr ! = NULL_ADDR )
truncate_data_blocks_range ( & tdn , 1 ) ;
f2fs_put_page ( tdn . node_page , 1 ) ;
}
2013-05-22 03:20:01 +04:00
return 0 ;
2012-11-02 12:13:32 +04:00
}
2013-03-20 14:01:06 +04:00
/*
 * Replay one fsync-marked node page onto the current node of @inode.
 *
 * @sbi:     f2fs super block info
 * @inode:   inode being recovered
 * @page:    node page found in the roll-forward chain
 * @blkaddr: block address of @page
 *
 * Step 1 recovers xattrs, step 2 inline data, step 3 the data block indices:
 * every index in @page that differs from the current node and is neither
 * NEW_ADDR nor NULL_ADDR is detached from its previous owner and recovered
 * into the current node. Finally the node footer is copied over from @page
 * and the node page is marked dirty.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
					struct page *page, block_t blkaddr)
{
	struct f2fs_inode_info *fi = F2FS_I(inode);
	unsigned int start, end;
	struct dnode_of_data dn;
	struct f2fs_summary sum;
	struct node_info ni;
	int err = 0, recovered = 0;

	/* step 1: recover xattr */
	if (IS_INODE(page)) {
		recover_inline_xattr(inode, page);
	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
		recover_xattr_data(inode, page, blkaddr);
		goto out;
	}

	/* step 2: recover inline data */
	if (recover_inline_data(inode, page))
		goto out;

	/* step 3: recover data indices */
	start = start_bidx_of_node(ofs_of_node(page), fi);
	end = start + ADDRS_PER_PAGE(page, fi);

	f2fs_lock_op(sbi);

	set_new_dnode(&dn, inode, NULL, NULL, 0);

	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
	if (err)
		goto unlock;

	f2fs_wait_on_page_writeback(dn.node_page, NODE);

	get_node_info(sbi, dn.nid, &ni);
	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));

	for (; start < end; start++, dn.ofs_in_node++) {
		block_t src = datablock_addr(dn.node_page, dn.ofs_in_node);
		block_t dest = datablock_addr(page, dn.ofs_in_node);

		/* Nothing to replay for this index. */
		if (src == dest || dest == NEW_ADDR || dest == NULL_ADDR)
			continue;

		if (src == NULL_ADDR) {
			err = reserve_new_block(&dn);
			/* We should not get -ENOSPC */
			f2fs_bug_on(sbi, err);
		}

		/* Check the previous node page having this index */
		err = check_index_in_prev_nodes(sbi, dest, &dn);
		if (err)
			goto put_dnode;

		set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);

		/* write dummy data page */
		recover_data_page(sbi, NULL, &sum, src, dest);
		dn.data_blkaddr = dest;
		update_extent_cache(&dn);
		recovered++;
	}

	/* write node page in place */
	set_summary(&sum, dn.nid, 0, 0);
	if (IS_INODE(dn.node_page))
		sync_inode_page(&dn);

	copy_node_footer(dn.node_page, page);
	fill_node_footer(dn.node_page, dn.nid, ni.ino,
					ofs_of_node(page), false);
	set_page_dirty(dn.node_page);
put_dnode:
	f2fs_put_dnode(&dn);
unlock:
	f2fs_unlock_op(sbi);
out:
	f2fs_msg(sbi->sb, KERN_NOTICE,
		" recover_data: ino = %lx, recovered = %d blocks, err = %d ",
		inode->i_ino, recovered, err);
	return err;
}
2013-03-20 14:01:06 +04:00
static int recover_data ( struct f2fs_sb_info * sbi ,
2012-11-02 12:13:32 +04:00
struct list_head * head , int type )
{
2013-08-09 10:03:21 +04:00
unsigned long long cp_ver = cur_cp_version ( F2FS_CKPT ( sbi ) ) ;
2012-11-02 12:13:32 +04:00
struct curseg_info * curseg ;
2014-09-12 00:49:55 +04:00
struct page * page = NULL ;
2013-03-20 14:01:06 +04:00
int err = 0 ;
2012-11-02 12:13:32 +04:00
block_t blkaddr ;
/* get node pages in the current segment */
curseg = CURSEG_I ( sbi , type ) ;
blkaddr = NEXT_FREE_BLKADDR ( sbi , curseg ) ;
while ( 1 ) {
struct fsync_inode_entry * entry ;
2014-09-23 22:23:01 +04:00
if ( blkaddr < MAIN_BLKADDR ( sbi ) | | blkaddr > = MAX_BLKADDR ( sbi ) )
2014-09-12 00:49:55 +04:00
break ;
2012-11-02 12:13:32 +04:00
2014-12-08 10:02:52 +03:00
ra_meta_pages_cond ( sbi , blkaddr ) ;
page = get_meta_page ( sbi , blkaddr ) ;
2013-03-08 16:29:23 +04:00
2014-09-12 00:49:55 +04:00
if ( cp_ver ! = cpver_of_node ( page ) ) {
f2fs_put_page ( page , 1 ) ;
2013-05-20 05:26:09 +04:00
break ;
2014-09-12 00:49:55 +04:00
}
2012-11-02 12:13:32 +04:00
entry = get_fsync_inode ( head , ino_of_node ( page ) ) ;
if ( ! entry )
goto next ;
2014-09-16 03:46:08 +04:00
/*
* inode ( x ) | CP | inode ( x ) | dnode ( F )
* In this case , we can lose the latest inode ( x ) .
2014-09-12 01:29:06 +04:00
* So , call recover_inode for the inode update .
2014-09-16 03:46:08 +04:00
*/
2014-09-12 01:29:06 +04:00
if ( entry - > last_inode = = blkaddr )
recover_inode ( entry - > inode , page ) ;
if ( entry - > last_dentry = = blkaddr ) {
err = recover_dentry ( entry - > inode , page ) ;
if ( err ) {
f2fs_put_page ( page , 1 ) ;
break ;
}
}
2013-03-20 14:01:06 +04:00
err = do_recover_data ( sbi , entry - > inode , page , blkaddr ) ;
2014-09-12 00:49:55 +04:00
if ( err ) {
f2fs_put_page ( page , 1 ) ;
2013-05-20 05:26:09 +04:00
break ;
2014-09-12 00:49:55 +04:00
}
2012-11-02 12:13:32 +04:00
if ( entry - > blkaddr = = blkaddr ) {
iput ( entry - > inode ) ;
list_del ( & entry - > list ) ;
kmem_cache_free ( fsync_entry_slab , entry ) ;
}
next :
/* check next segment */
blkaddr = next_blkaddr_of_node ( page ) ;
2014-09-12 00:49:55 +04:00
f2fs_put_page ( page , 1 ) ;
2012-11-02 12:13:32 +04:00
}
2013-03-20 14:01:06 +04:00
if ( ! err )
allocate_new_segments ( sbi ) ;
return err ;
2012-11-02 12:13:32 +04:00
}
2013-03-20 14:01:06 +04:00
int recover_fsync_data ( struct f2fs_sb_info * sbi )
2012-11-02 12:13:32 +04:00
{
2014-07-26 02:47:25 +04:00
struct curseg_info * curseg = CURSEG_I ( sbi , CURSEG_WARM_NODE ) ;
2012-11-02 12:13:32 +04:00
struct list_head inode_list ;
2014-07-26 02:47:25 +04:00
block_t blkaddr ;
2013-03-20 14:01:06 +04:00
int err ;
2013-10-23 08:39:32 +04:00
bool need_writecp = false ;
2012-11-02 12:13:32 +04:00
fsync_entry_slab = f2fs_kmem_cache_create ( " f2fs_fsync_inode_entry " ,
2014-03-07 14:43:28 +04:00
sizeof ( struct fsync_inode_entry ) ) ;
2013-12-06 10:00:58 +04:00
if ( ! fsync_entry_slab )
2013-03-20 14:01:06 +04:00
return - ENOMEM ;
2012-11-02 12:13:32 +04:00
INIT_LIST_HEAD ( & inode_list ) ;
/* step #1: find fsynced inode numbers */
2013-10-23 08:39:32 +04:00
sbi - > por_doing = true ;
2014-07-26 02:47:25 +04:00
2014-08-14 03:30:46 +04:00
/* prevent checkpoint */
mutex_lock ( & sbi - > cp_mutex ) ;
2014-07-26 02:47:25 +04:00
blkaddr = NEXT_FREE_BLKADDR ( sbi , curseg ) ;
2013-03-20 14:01:06 +04:00
err = find_fsync_dnodes ( sbi , & inode_list ) ;
if ( err )
2012-11-02 12:13:32 +04:00
goto out ;
if ( list_empty ( & inode_list ) )
goto out ;
2013-10-23 08:39:32 +04:00
need_writecp = true ;
2013-09-24 05:26:24 +04:00
2012-11-02 12:13:32 +04:00
/* step #2: recover data */
2013-03-20 14:01:06 +04:00
err = recover_data ( sbi , & inode_list , CURSEG_WARM_NODE ) ;
2014-08-08 21:18:43 +04:00
if ( ! err )
2014-09-03 02:52:58 +04:00
f2fs_bug_on ( sbi , ! list_empty ( & inode_list ) ) ;
2012-11-02 12:13:32 +04:00
out :
2013-06-27 05:28:54 +04:00
destroy_fsync_dnodes ( & inode_list ) ;
2012-11-02 12:13:32 +04:00
kmem_cache_destroy ( fsync_entry_slab ) ;
2014-07-26 02:47:25 +04:00
2014-09-12 00:49:55 +04:00
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range ( META_MAPPING ( sbi ) ,
2014-09-23 22:23:01 +04:00
MAIN_BLKADDR ( sbi ) < < PAGE_CACHE_SHIFT , - 1 ) ;
2014-09-12 00:49:55 +04:00
2014-07-26 02:47:25 +04:00
if ( err ) {
truncate_inode_pages_final ( NODE_MAPPING ( sbi ) ) ;
truncate_inode_pages_final ( META_MAPPING ( sbi ) ) ;
}
2013-10-23 08:39:32 +04:00
sbi - > por_doing = false ;
2014-07-26 02:47:25 +04:00
if ( err ) {
discard_next_dnode ( sbi , blkaddr ) ;
/* Flush all the NAT/SIT pages */
while ( get_pages ( sbi , F2FS_DIRTY_META ) )
sync_meta_pages ( sbi , META , LONG_MAX ) ;
2014-08-14 03:30:46 +04:00
set_ckpt_flags ( sbi - > ckpt , CP_ERROR_FLAG ) ;
mutex_unlock ( & sbi - > cp_mutex ) ;
2014-07-26 02:47:25 +04:00
} else if ( need_writecp ) {
2014-09-21 08:57:51 +04:00
struct cp_control cpc = {
. reason = CP_SYNC ,
} ;
2014-08-14 03:30:46 +04:00
mutex_unlock ( & sbi - > cp_mutex ) ;
2014-09-21 08:57:51 +04:00
write_checkpoint ( sbi , & cpc ) ;
2014-08-14 03:30:46 +04:00
} else {
mutex_unlock ( & sbi - > cp_mutex ) ;
2014-07-26 02:47:25 +04:00
}
2013-03-20 14:01:06 +04:00
return err ;
2012-11-02 12:13:32 +04:00
}