From 1618e6e297082def6350887e1c6c606749716fac Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 30 Aug 2018 21:33:31 +0800 Subject: [PATCH 01/62] f2fs: add additional sanity check in f2fs_acl_from_disk() Add additinal sanity check for irregular case(e.g. corruption). If size of extended attribution is smaller than size of acl header, then return -EINVAL. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 111824199a88..20caf341701d 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -53,6 +53,9 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1); const char *end = value + size; + if (size < sizeof(struct f2fs_acl_header)) + return ERR_PTR(-EINVAL); + if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION)) return ERR_PTR(-EINVAL); From 7d20c8abb2edcf962ca857d51f4d0f9cd4b19053 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Sep 2018 03:52:17 +0800 Subject: [PATCH 02/62] f2fs: fix to avoid NULL pointer dereference on se->discard_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://bugzilla.kernel.org/show_bug.cgi?id=200951 These is a NULL pointer dereference issue reported in bugzilla: Hi, in the setup there is a SATA SSD connected to a SATA-to-USB bridge. The disc is "Samsung SSD 850 PRO 256G" which supports TRIM. There are four partitions: sda1: FAT /boot sda2: F2FS / sda3: F2FS /home sda4: F2FS The bridge is ASMT1153e which uses the "uas" driver. There is no TRIM pass-through, so, when mounting it reports: mounting with "discard" option, but the device does not support discard The USB host is USB3.0 and UASP capable. It is the one on RK3399. Given this everything works fine, except there is no TRIM support. In order to enable TRIM a new UDEV rule is added [1]: /etc/udev/rules.d/10-sata-bridge-trim.rules: ACTION=="add|change", ATTRS{idVendor}=="174c", ATTRS{idProduct}=="55aa", SUBSYSTEM=="scsi_disk", ATTR{provisioning_mode}="unmap" After reboot any F2FS write hangs forever and dmesg reports: Unable to handle kernel NULL pointer dereference Also tested on a x86_64 system: works fine even with TRIM enabled. same disc same bridge different usb host controller different cpu architecture not root filesystem Regards, Vicenç. [1] Post #5 in https://bbs.archlinux.org/viewtopic.php?id=236280 Unable to handle kernel NULL pointer dereference at virtual address 000000000000003e Mem abort info: ESR = 0x96000004 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000626e3122 [000000000000003e] pgd=0000000000000000 Internal error: Oops: 96000004 [#1] SMP Modules linked in: overlay snd_soc_hdmi_codec rc_cec dw_hdmi_i2s_audio dw_hdmi_cec snd_soc_simple_card snd_soc_simple_card_utils snd_soc_rockchip_i2s rockchip_rga snd_soc_rockchip_pcm rockchipdrm videobuf2_dma_sg v4l2_mem2mem rtc_rk808 videobuf2_memops analogix_dp videobuf2_v4l2 videobuf2_common dw_hdmi dw_wdt cec rc_core videodev drm_kms_helper media drm rockchip_thermal rockchip_saradc realtek drm_panel_orientation_quirks syscopyarea sysfillrect sysimgblt fb_sys_fops dwmac_rk stmmac_platform stmmac pwm_bl squashfs loop crypto_user gpio_keys hid_kensington CPU: 5 PID: 957 Comm: nvim Not tainted 4.19.0-rc1-1-ARCH #1 Hardware name: Sapphire-RK3399 Board (DT) pstate: 00000005 (nzcv daif -PAN -UAO) pc : update_sit_entry+0x304/0x4b0 lr : update_sit_entry+0x108/0x4b0 sp : ffff00000ca13bd0 x29: ffff00000ca13bd0 x28: 000000000000003e x27: 0000000000000020 x26: 0000000000080000 x25: 0000000000000048 x24: ffff8000ebb85cf8 x23: 0000000000000253 x22: 00000000ffffffff x21: 00000000000535f2 x20: 00000000ffffffdf x19: ffff8000eb9e6800 x18: ffff8000eb9e6be8 x17: 0000000007ce6926 x16: 000000001c83ffa8 x15: 0000000000000000 x14: ffff8000f602df90 x13: 0000000000000006 x12: 0000000000000040 x11: 0000000000000228 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 00000000000535f2 x6 : ffff8000ebff3440 x5 : ffff8000ebff3440 x4 : ffff8000ebe3a6c8 x3 : 00000000ffffffff x2 : 0000000000000020 x1 : 0000000000000000 x0 : ffff8000eb9e5800 Process nvim (pid: 957, stack limit = 0x0000000063a78320) Call trace: update_sit_entry+0x304/0x4b0 f2fs_invalidate_blocks+0x98/0x140 truncate_node+0x90/0x400 f2fs_remove_inode_page+0xe8/0x340 f2fs_evict_inode+0x2b0/0x408 evict+0xe0/0x1e0 iput+0x160/0x260 do_unlinkat+0x214/0x298 __arm64_sys_unlinkat+0x3c/0x68 el0_svc_handler+0x94/0x118 el0_svc+0x8/0xc Code: f9400800 b9488400 36080140 f9400f01 (387c4820) ---[ end trace a0f21a307118c477 ]--- The reason is it is possible to enable discard flag on block queue via UDEV, but during mount, f2fs will initialize se->discard_map only if this flag is set, once the flag is set after mount, f2fs may dereference NULL pointer on se->discard_map. So this patch does below changes to fix this issue: - initialize and update se->discard_map all the time. - don't clear DISCARD option if device has no QUEUE_FLAG_DISCARD flag during mount. - don't issue small discard on zoned block device. - introduce some functions to enhance the readability. Signed-off-by: Chao Yu Tested-by: Vicente Bergas Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 3 +-- fs/f2fs/f2fs.h | 15 ++++++++--- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 65 ++++++++++++++++++++--------------------------- fs/f2fs/super.c | 16 +++--------- 5 files changed, 46 insertions(+), 55 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 214a968962a1..ebe649d9793c 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -190,8 +190,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); - if (f2fs_discard_en(sbi)) - si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index abf925664d9c..848700610b79 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3399,11 +3399,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, } #endif -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + return f2fs_sb_has_blkzoned(sbi->sb); +} - return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5474aaa274b9..ca66b3eb197a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1978,7 +1978,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!f2fs_hw_support_discard(F2FS_SB(sb))) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30779aaa9dba..c372ff755818 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1725,11 +1725,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) return false; if (!force) { - if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -1835,7 +1835,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, dirty_i->nr_dirty[PRE]--; } - if (!test_opt(sbi, DISCARD)) + if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && @@ -2025,8 +2025,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - !f2fs_test_and_set_bit(offset, se->discard_map)) + if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ @@ -2054,8 +2053,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -2671,7 +2669,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ - if (test_opt(sbi, DISCARD)) + if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); @@ -3762,13 +3760,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - if (f2fs_discard_en(sbi)) { - sit_i->sentries[start].discard_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, - GFP_KERNEL); - if (!sit_i->sentries[start].discard_map) - return -ENOMEM; - } + sit_i->sentries[start].discard_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -3916,18 +3912,16 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) total_node_blocks += se->valid_blocks; /* build discard map only one time */ - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; } if (sbi->segs_per_sec > 1) @@ -3965,16 +3959,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } if (sbi->segs_per_sec > 1) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 896b885f504e..50411fb25756 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -360,7 +360,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -415,14 +414,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - q = bdev_get_queue(sb->s_bdev); - if (blk_queue_discard(q)) { - set_opt(sbi, DISCARD); - } else if (!f2fs_sb_has_blkzoned(sb)) { - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + set_opt(sbi, DISCARD); break; case Opt_nodiscard: if (f2fs_sb_has_blkzoned(sb)) { @@ -1032,7 +1024,8 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1399,8 +1392,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) - set_opt(sbi, DISCARD); + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi->sb)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else From abde73c718293b83c23b2ca137f18b3f8650a553 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 31 Aug 2018 15:09:26 +0530 Subject: [PATCH 03/62] f2fs: fix unnecessary periodic wakeup of discard thread when dev is busy When dev is busy, discard thread wake up timeout can be aligned with the exact time that it needs to wait for dev to come out of busy. This helps to avoid unnecessary periodic wakeups and thus save some power. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c372ff755818..f36a4b71595f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1564,6 +1564,8 @@ static int issue_discard_thread(void *data) struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; + unsigned long interval = sbi->interval_time[REQ_TIME] * HZ; + long delta; set_freezable(); @@ -1600,7 +1602,11 @@ static int issue_discard_thread(void *data) __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1){ - wait_ms = dpolicy.mid_interval; + delta = (sbi->last_time[REQ_TIME] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); + else + wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } From 22d7ea1364140eaafb272875ff40e95c85a75bdf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Aug 2018 17:17:47 +0800 Subject: [PATCH 04/62] Revert "f2fs: use printk_ratelimited for f2fs_msg" Don't limit printing log, so that we will not miss any key messages. This reverts commit a36c106dffb616250117efb1cab271c19a8f94ff. In addition, we use printk_ratelimited to avoid too many log prints. - error injection - discard submission failure Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 848700610b79..b3de6c5b4f24 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1306,9 +1306,9 @@ struct f2fs_sb_info { }; #ifdef CONFIG_F2FS_FAULT_INJECTION -#define f2fs_show_injection_info(type) \ - printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, f2fs_fault_name[type], \ +#define f2fs_show_injection_info(type) \ + printk_ratelimited("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f36a4b71595f..34e05777941e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -905,9 +905,9 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, dc->error = 0; if (dc->error) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard(%u, %u, %u) failed, ret: %d", - dc->lstart, dc->start, dc->len, dc->error); + printk_ratelimited( + "%sF2FS-fs: Issue discard(%u, %u, %u) failed, ret: %d", + KERN_INFO, dc->lstart, dc->start, dc->len, dc->error); __detach_discard_cmd(dcc, dc); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 50411fb25756..18cf1d8c9e25 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -207,7 +207,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } From 0ded69f632bb717be9aeea3ae74e29050fcb060c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 22 Aug 2018 21:18:00 -0700 Subject: [PATCH 05/62] f2fs: avoid wrong decrypted data from disk 1. Create a file in an encrypted directory 2. Do GC & drop caches 3. Read stale data before its bio for metapage was not issued yet Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++-------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 3 +-- fs/f2fs/segment.c | 6 +++++- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 382c1ef9a9e4..8c204f896c22 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -565,9 +565,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_block_writeback(sbi, blkaddr); } return bio; @@ -582,6 +579,9 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, if (IS_ERR(bio)) return PTR_ERR(bio); + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, blkaddr); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; @@ -1558,6 +1558,12 @@ submit_and_realloc: } } + /* + * If the page is under writeback, we need to wait for + * its completion to see the correct decrypted data. + */ + f2fs_wait_on_block_writeback(inode, block_nr); + if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; @@ -1625,7 +1631,7 @@ static int encrypt_one_page(struct f2fs_io_info *fio) return 0; /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(inode, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -2382,10 +2388,6 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); - /* wait for GCed page writeback via META_MAPPING */ - if (f2fs_post_read_required(inode)) - f2fs_wait_on_block_writeback(sbi, blkaddr); - if (len == PAGE_SIZE || PageUptodate(page)) return 0; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b3de6c5b4f24..ad26ee0b1542 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2942,7 +2942,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ca66b3eb197a..e21a1923056d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -112,8 +112,7 @@ mapped: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed page writeback via META_MAPPING */ - if (f2fs_post_read_required(inode)) - f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 34e05777941e..187c848a65b8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3195,10 +3195,14 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) +void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *cpage; + if (!f2fs_post_read_required(inode)) + return; + if (!is_valid_data_blkaddr(sbi, blkaddr)) return; From 5ce805869cbed93267ed26552ff76e30f05c91f7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Sep 2018 11:40:12 -0700 Subject: [PATCH 06/62] f2fs: submit bio after shutdown Sometimes, some merged IOs could get a chance to be submitted, resulting in system hang in shutdown test. This issues IOs all the time after shutdown. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8c204f896c22..26f38b224bb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -533,6 +533,8 @@ skip: if (fio->in_list) goto next; out: + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + __submit_merged_bio(io); up_write(&io->io_rwsem); } From cda9cc595f0bb6ffa51a4efc4b6533dfa4039b4c Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 26 Jun 2018 13:12:43 +0800 Subject: [PATCH 07/62] f2fs: report error if quota off error during umount Now, we depend on fsck to ensure quota file data is ok, so we scan whole partition if checkpoint without umount flag. It's same for quota off error case, which may make quota file data inconsistent. generic/019 reports below error: __quota_error: 1160 callbacks suppressed Quota error (device zram1): write_blk: dquota write failed Quota error (device zram1): qtree_write_dquot: Error -28 occurred while creating quota Quota error (device zram1): write_blk: dquota write failed Quota error (device zram1): qtree_write_dquot: Error -28 occurred while creating quota Quota error (device zram1): write_blk: dquota write failed Quota error (device zram1): qtree_write_dquot: Error -28 occurred while creating quota Quota error (device zram1): write_blk: dquota write failed Quota error (device zram1): qtree_write_dquot: Error -28 occurred while creating quota Quota error (device zram1): write_blk: dquota write failed Quota error (device zram1): qtree_write_dquot: Error -28 occurred while creating quota VFS: Busy inodes after unmount of zram1. Self-destruct in 5 seconds. Have a nice day... If we failed in below path due to fail to write dquot block, we will miss to release quota inode, fix it. - f2fs_put_super - f2fs_quota_off_umount - f2fs_quota_off - f2fs_quota_sync <-- failed - dquot_quota_off <-- missed to call Signed-off-by: Yunlei He Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 18cf1d8c9e25..afb11715127b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1844,7 +1844,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, type); + err = f2fs_quota_sync(sb, type); + if (err) + goto out_put; err = dquot_quota_off(sb, type); if (err || f2fs_sb_has_quota_ino(sb)) @@ -1863,9 +1865,20 @@ out_put: void f2fs_quota_off_umount(struct super_block *sb) { int type; + int err; - for (type = 0; type < MAXQUOTAS; type++) - f2fs_quota_off(sb, type); + for (type = 0; type < MAXQUOTAS; type++) { + err = f2fs_quota_off(sb, type); + if (err) { + int ret = dquot_quota_off(sb, type); + + f2fs_msg(sb, KERN_ERR, + "Fail to turn off disk quota " + "(type: %d, err: %d, ret:%d), Please " + "run fsck to fix it.", type, err, ret); + set_sbi_flag(F2FS_SB(sb), SBI_NEED_FSCK); + } + } } static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) From 1378752b9921e60749eaf18ec6c47b33f9001abb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Aug 2018 17:11:05 +0800 Subject: [PATCH 08/62] f2fs: fix to flush all dirty inodes recovered in readonly fs generic/417 reported as blow: ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/devf2fs/inode.c:695! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 1 PID: 21697 Comm: umount Tainted: G W O 4.18.0-rc2+ #39 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: f2fs_evict_inode+0x556/0x580 [f2fs] Call Trace: ? _raw_spin_unlock+0x2c/0x50 evict+0xa8/0x170 dispose_list+0x34/0x40 evict_inodes+0x118/0x120 generic_shutdown_super+0x41/0x100 ? rcu_read_lock_sched_held+0x97/0xa0 kill_block_super+0x22/0x50 kill_f2fs_super+0x6f/0x80 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x81/0xa0 exit_to_usermode_loop+0x59/0xa7 do_fast_syscall_32+0x1f5/0x22c entry_SYSENTER_32+0x53/0x86 EIP: f2fs_evict_inode+0x556/0x580 [f2fs] It can simply reproduced with scripts: Enable quota feature during mkfs. Testcase1: 1. mkfs.f2fs /dev/zram0 2. mount -t f2fs /dev/zram0 /mnt/f2fs 3. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4k" -c "fsync" 4. godown /mnt/f2fs 5. umount /mnt/f2fs 6. mount -t f2fs -o ro /dev/zram0 /mnt/f2fs 7. umount /mnt/f2fs Testcase2: 1. mkfs.f2fs /dev/zram0 2. mount -t f2fs /dev/zram0 /mnt/f2fs 3. touch /mnt/f2fs/file 4. create process[pid = x] do: a) open /mnt/f2fs/file; b) unlink /mnt/f2fs/file 5. godown -f /mnt/f2fs 6. kill process[pid = x] 7. umount /mnt/f2fs 8. mount -t f2fs -o ro /dev/zram0 /mnt/f2fs 9. umount /mnt/f2fs The reason is: during recovery, i_{c,m}time of inode will be updated, then the inode can be set dirty w/o being tracked in sbi->inode_list[DIRTY_META] global list, so later write_checkpoint will not flush such dirty inode into node page. Once umount is called, sync_filesystem() in generic_shutdown_super() will skip syncng dirty inodes due to sb_rdonly check, leaving dirty inodes there. To solve this issue, during umount, add remove SB_RDONLY flag in sb->s_flags, to make sure sync_filesystem() will not be skipped. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 ++ fs/f2fs/f2fs.h | 1 + fs/f2fs/recovery.c | 14 +++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index e8b6b89bddb8..59d0472013f4 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -696,6 +696,8 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); out: + set_sbi_flag(sbi, SBI_IS_RECOVERED); + #ifdef CONFIG_QUOTA /* Turn quotas off */ if (quota_enabled) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ad26ee0b1542..3a8a60e6844c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1088,6 +1088,7 @@ enum { SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ + SBI_IS_RECOVERED, /* recovered orphan/data */ }; enum { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 95511ed11a22..e3aa6eee7a8b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -697,11 +697,15 @@ skip: /* let's drop all the directory inodes for clean checkpoint */ destroy_fsync_dnodes(&dir_list); - if (!err && need_writecp) { - struct cp_control cpc = { - .reason = CP_RECOVERY, - }; - err = f2fs_write_checkpoint(sbi, &cpc); + if (need_writecp) { + set_sbi_flag(sbi, SBI_IS_RECOVERED); + + if (!err) { + struct cp_control cpc = { + .reason = CP_RECOVERY, + }; + err = f2fs_write_checkpoint(sbi, &cpc); + } } kmem_cache_destroy(fsync_entry_slab); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index afb11715127b..968746e23feb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3180,6 +3180,9 @@ static void kill_f2fs_super(struct super_block *sb) }; f2fs_write_checkpoint(sbi, &cpc); } + + if (is_sbi_flag_set(sbi, SBI_IS_RECOVERED) && f2fs_readonly(sb)) + sb->s_flags &= ~SB_RDONLY; } kill_block_super(sb); } From 313ed62a3ddc2cbe16087630910b41a87bfd9a1f Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 31 Aug 2018 22:33:50 +0800 Subject: [PATCH 09/62] f2fs: cache NULL when both default_acl and acl are NULL default_acl and acl of newly created inode will be initiated as ACL_NOT_CACHED in vfs function inode_init_always() and later will be updated by calling xxx_init_acl() in specific filesystems. Howerver, when default_acl and acl are NULL then they keep the value of ACL_NOT_CACHED, this patch tries to cache NULL for acl/default_acl in this case. Signed-off-by: Chengguang Xu Acked-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 20caf341701d..cd82e1ce5d67 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -397,12 +397,16 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; From 0b2103e886e6de9802e1170e57c573443286a483 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Sep 2018 14:54:01 +0800 Subject: [PATCH 10/62] f2fs: fix memory leak of write_io in fill_super() It needs to release memory allocated for sbi->write_io in error path, otherwise, it will cause memory leak. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 968746e23feb..2f55713f7f99 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2871,7 +2871,7 @@ try_onemore: GFP_KERNEL); if (!sbi->write_io[i]) { err = -ENOMEM; - goto free_options; + goto free_bio_info; } for (j = HOT; j < n; j++) { From 4a70e255449c9a13eed7a6eeecc85a1ea63cef76 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Sep 2018 14:54:02 +0800 Subject: [PATCH 11/62] f2fs: fix memory leak of percpu counter in fill_super() In fill_super -> init_percpu_info, we should destroy percpu counter in error path, otherwise memory allcoated for percpu counter will leak. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2f55713f7f99..b2acb7f751f1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2458,8 +2458,12 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) if (err) return err; - return percpu_counter_init(&sbi->total_valid_inode_count, 0, + err = percpu_counter_init(&sbi->total_valid_inode_count, 0, GFP_KERNEL); + if (err) + percpu_counter_destroy(&sbi->alloc_valid_block_count); + + return err; } #ifdef CONFIG_BLK_DEV_ZONED From 042be0f849e5fc24116d0afecfaf926eed5cac63 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Sep 2018 20:34:12 +0800 Subject: [PATCH 12/62] f2fs: fix to do sanity check with current segment number https://bugzilla.kernel.org/show_bug.cgi?id=200219 Reproduction way: - mount image - run poc code - umount image F2FS-fs (loop1): Bitmap was wrongly set, blk:15364 ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/devf2fs/segment.c:2061! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 2 PID: 17686 Comm: umount Tainted: G W O 4.18.0-rc2+ #39 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: update_sit_entry+0x459/0x4e0 [f2fs] Code: e8 1c b5 fd ff 0f 0b 0f 0b 8b 45 e4 c7 44 24 08 9c 7a 6c f8 c7 44 24 04 bc 4a 6c f8 89 44 24 0c 8b 06 89 04 24 e8 f7 b4 fd ff <0f> 0b 8b 45 e4 0f b6 d2 89 54 24 10 c7 44 24 08 60 7a 6c f8 c7 44 EAX: 00000032 EBX: 000000f8 ECX: 00000002 EDX: 00000001 ESI: d7177000 EDI: f520fe68 EBP: d6477c6c ESP: d6477c34 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010282 CR0: 80050033 CR2: b7fbe000 CR3: 2a99b3c0 CR4: 000406f0 Call Trace: f2fs_allocate_data_block+0x124/0x580 [f2fs] do_write_page+0x78/0x150 [f2fs] f2fs_do_write_node_page+0x25/0xa0 [f2fs] __write_node_page+0x2bf/0x550 [f2fs] f2fs_sync_node_pages+0x60e/0x6d0 [f2fs] ? sync_inode_metadata+0x2f/0x40 ? f2fs_write_checkpoint+0x28f/0x7d0 [f2fs] ? up_write+0x1e/0x80 f2fs_write_checkpoint+0x2a9/0x7d0 [f2fs] ? mark_held_locks+0x5d/0x80 ? _raw_spin_unlock_irq+0x27/0x50 kill_f2fs_super+0x68/0x90 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x81/0xa0 exit_to_usermode_loop+0x59/0xa7 do_fast_syscall_32+0x1f5/0x22c entry_SYSENTER_32+0x53/0x86 EIP: 0xb7f95c51 Code: c1 1e f7 ff ff 89 e5 8b 55 08 85 d2 8b 81 64 cd ff ff 74 02 89 02 5d c3 8b 0c 24 c3 8b 1c 24 c3 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76 EAX: 00000000 EBX: 0871ab90 ECX: bfb2cd00 EDX: 00000000 ESI: 00000000 EDI: 0871ab90 EBP: 0871ab90 ESP: bfb2cd7c DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246 Modules linked in: f2fs(O) crc32_generic bnep rfcomm bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq pcbc joydev aesni_intel snd_seq_device aes_i586 snd_timer crypto_simd snd cryptd soundcore mac_hid serio_raw video i2c_piix4 parport_pc ppdev lp parport hid_generic psmouse usbhid hid e1000 [last unloaded: f2fs] ---[ end trace d423f83982cfcdc5 ]--- The reason is, different log headers using the same segment, once one log's next block address is used by another log, it will cause panic as above. Main area: 24 segs, 24 secs 24 zones - COLD data: 0, 0, 0 - WARM data: 1, 1, 1 - HOT data: 20, 20, 20 - Dir dnode: 22, 22, 22 - File dnode: 22, 22, 22 - Indir nodes: 21, 21, 21 So this patch adds sanity check to detect such condition to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2acb7f751f1..32402f0d9d40 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2325,7 +2325,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int segment_count_main; unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; - int i; + int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -2366,11 +2366,43 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = i; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u) and Data segment (%u)" + " has the same segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); From e1293bdfa01dbde03d9ac15b7ceea71ba10b89c3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 7 Sep 2018 19:49:07 +0800 Subject: [PATCH 13/62] f2fs: plug readahead IO in readdir() Add a plug to merge readahead IO in readdir(), expecting it can reduce bio count before submitting to block layer. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ecc3a4e2be96..bd0348cc860f 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -784,9 +784,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de = NULL; struct fscrypt_str de_name = FSTR_INIT(NULL, 0); struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); + struct blk_plug plug; + bool readdir_ra = sbi->readdir_ra == 1; + int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); + if (readdir_ra) + blk_start_plug(&plug); + while (bit_pos < d->max) { bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); if (bit_pos >= d->max) @@ -806,29 +812,33 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int err; err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return err; + goto out; de_name = *fstr; fstr->len = save_len; } if (!dir_emit(ctx, de_name.name, de_name.len, - le32_to_cpu(de->ino), d_type)) - return 1; + le32_to_cpu(de->ino), d_type)) { + err = 1; + goto out; + } - if (sbi->readdir_ra == 1) + if (readdir_ra) f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return 0; +out: + if (readdir_ra) + blk_finish_plug(&plug); + return err; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) From b430f7263673eab1dc40e662ae3441a9619d16b8 Mon Sep 17 00:00:00 2001 From: Zhikang Zhang Date: Mon, 10 Sep 2018 16:18:25 +0800 Subject: [PATCH 14/62] f2fs: avoid sleeping under spin_lock In the call trace below, we might sleep in function dput(). So in order to avoid sleeping under spin_lock, we remove f2fs_mark_inode_dirty_sync from __try_update_largest_extent && __drop_largest_extent. BUG: sleeping function called from invalid context at fs/dcache.c:796 Call trace: dump_backtrace+0x0/0x3f4 show_stack+0x24/0x30 dump_stack+0xe0/0x138 ___might_sleep+0x2a8/0x2c8 __might_sleep+0x78/0x10c dput+0x7c/0x750 block_dump___mark_inode_dirty+0x120/0x17c __mark_inode_dirty+0x344/0x11f0 f2fs_mark_inode_dirty_sync+0x40/0x50 __insert_extent_tree+0x2e0/0x2f4 f2fs_update_extent_tree_range+0xcf4/0xde8 f2fs_update_extent_cache+0x114/0x12c f2fs_update_data_blkaddr+0x40/0x50 write_data_page+0x150/0x314 do_write_data_page+0x648/0x2318 __write_data_page+0xdb4/0x1640 f2fs_write_cache_pages+0x768/0xafc __f2fs_write_data_pages+0x590/0x1218 f2fs_write_data_pages+0x64/0x74 do_writepages+0x74/0xe4 __writeback_single_inode+0xdc/0x15f0 writeback_sb_inodes+0x574/0xc98 __writeback_inodes_wb+0x190/0x204 wb_writeback+0x730/0xf14 wb_check_old_data_flush+0x1bc/0x1c8 wb_workfn+0x554/0xf74 process_one_work+0x440/0x118c worker_thread+0xac/0x974 kthread+0x1a0/0x1c8 ret_from_fork+0x10/0x1c Signed-off-by: Zhikang Zhang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 51 ++++++++++++++++++++++++++---------------- fs/f2fs/f2fs.h | 7 +++--- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 231b77ef5a53..a70cd2580eae 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -308,14 +308,13 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, return count - atomic_read(&et->node_cnt); } -static void __drop_largest_extent(struct inode *inode, +static void __drop_largest_extent(struct extent_tree *et, pgoff_t fofs, unsigned int len) { - struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { - largest->len = 0; - f2fs_mark_inode_dirty_sync(inode, true); + if (fofs < et->largest.fofs + et->largest.len && + fofs + len > et->largest.fofs) { + et->largest.len = 0; + et->largest_updated = true; } } @@ -416,12 +415,11 @@ out: return ret; } -static struct extent_node *__try_merge_extent_node(struct inode *inode, +static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, struct extent_node *next_ex) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -443,7 +441,7 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, if (!en) return NULL; - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); spin_lock(&sbi->extent_lock); if (!list_empty(&en->list)) { @@ -454,12 +452,11 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, return en; } -static struct extent_node *__insert_extent_tree(struct inode *inode, +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -476,7 +473,7 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); /* update in global extent list */ spin_lock(&sbi->extent_lock); @@ -497,6 +494,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; + bool updated = false; if (!et) return; @@ -517,7 +515,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, * drop largest extent before lookup, in case it's already * been shrunk from extent tree */ - __drop_largest_extent(inode, fofs, len); + __drop_largest_extent(et, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, @@ -550,7 +548,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(inode, et, &ei, + en1 = __insert_extent_tree(sbi, et, &ei, NULL, NULL); next_en = en1; } else { @@ -570,7 +568,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(inode, et, en); + __try_update_largest_extent(et, en); else __release_extent_node(sbi, et, en); @@ -590,15 +588,16 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, len); - if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) - __insert_extent_tree(inode, et, &ei, + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - __drop_largest_extent(inode, 0, UINT_MAX); + et->largest.len = 0; + et->largest_updated = true; set_inode_flag(inode, FI_NO_EXTENT); } } @@ -606,7 +605,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) __free_extent_tree(sbi, et); + if (et->largest_updated) { + et->largest_updated = false; + updated = true; + } + write_unlock(&et->lock); + + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -705,6 +712,7 @@ void f2fs_drop_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree; + bool updated = false; if (!f2fs_may_extent_tree(inode)) return; @@ -713,8 +721,13 @@ void f2fs_drop_extent_tree(struct inode *inode) write_lock(&et->lock); __free_extent_tree(sbi, et); - __drop_largest_extent(inode, 0, UINT_MAX); + if (et->largest.len) { + et->largest.len = 0; + updated = true; + } write_unlock(&et->lock); + if (updated) + f2fs_mark_inode_dirty_sync(inode, true); } void f2fs_destroy_extent_tree(struct inode *inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3a8a60e6844c..88b8d5073581 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -572,6 +572,7 @@ struct extent_tree { struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ atomic_t node_cnt; /* # of extent node in rb-tree*/ + bool largest_updated; /* largest extent updated */ }; /* @@ -754,12 +755,12 @@ static inline bool __is_front_mergeable(struct extent_info *cur, } extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); -static inline void __try_update_largest_extent(struct inode *inode, - struct extent_tree *et, struct extent_node *en) +static inline void __try_update_largest_extent(struct extent_tree *et, + struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode, true); + et->largest_updated = true; } } From c8e927579e00a182eda07e4c45df9c8c699c8ded Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 11 Sep 2018 08:54:21 +0900 Subject: [PATCH 15/62] f2fs: fix setattr project check upon fssetxattr ioctl Currently, project quota could be changed by fssetxattr ioctl, and existed permission check inode_owner_or_capable() is obviously not enough, just think that common users could change project id of file, that could make users to break project quota easily. This patch try to follow same regular of xfs project quota: "Project Quota ID state is only allowed to change from within the init namespace. Enforce that restriction only if we are trying to change the quota ID state. Everything else is allowed in user namespaces." Besides that, check and set project id'state should be an atomic operation, protect whole operation with inode lock. Signed-off-by: Wang Shilong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 60 +++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e21a1923056d..61f95731c858 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2616,34 +2616,26 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) return 0; - err = mnt_want_write_file(filp); - if (err) - return err; - err = -EPERM; - inode_lock(inode); - /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) - goto out_unlock; + return err; ipage = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto out_unlock; - } + if (IS_ERR(ipage)) + return PTR_ERR(ipage); if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, i_projid)) { err = -EOVERFLOW; f2fs_put_page(ipage, 1); - goto out_unlock; + return err; } f2fs_put_page(ipage, 1); err = dquot_initialize(inode); if (err) - goto out_unlock; + return err; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { @@ -2657,9 +2649,6 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) inode->i_ctime = current_time(inode); out_dirty: f2fs_mark_inode_dirty_sync(inode, true); -out_unlock: - inode_unlock(inode); - mnt_drop_write_file(filp); return err; } #else @@ -2735,6 +2724,30 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) return 0; } +static int f2fs_ioctl_check_project(struct inode *inode, struct fsxattr *fa) +{ + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (__kprojid_val(F2FS_I(inode)->i_projid) != fa->fsx_projid) + return -EINVAL; + + if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) { + if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT)) + return -EINVAL; + } else { + if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + return 0; +} + static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -2762,19 +2775,20 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) return err; inode_lock(inode); + err = f2fs_ioctl_check_project(inode, &fa); + if (err) + goto out; flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | (flags & F2FS_FL_XFLAG_VISIBLE); err = __f2fs_ioc_setflags(inode, flags); - inode_unlock(inode); - mnt_drop_write_file(filp); if (err) - return err; + goto out; err = f2fs_ioc_setproject(filp, fa.fsx_projid); - if (err) - return err; - - return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; } int f2fs_pin_file_control(struct inode *inode, bool inc) From 4cb037ec3f754fdcb0f1efe9cc3269206dd2adef Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 12 Sep 2018 13:32:52 +0800 Subject: [PATCH 16/62] f2fs: surround fault_injection related option parsing using CONFIG_F2FS_FAULT_INJECTION It's a little bit strange when fault_injection related options fail with -EINVAL which were already disabled from config, so surround all fault_injection related option parsing code using CONFIG_F2FS_FAULT_INJECTION. Meanwhile, slightly change warning message to keep consistency with option POSIX_ACL and FS_XATTR. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 32402f0d9d40..3106da1d9be6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -594,28 +594,31 @@ static int parse_options(struct super_block *sb, char *options) } F2FS_OPTION(sbi).write_io_size_bits = arg; break; +#ifdef CONFIG_F2FS_FAULT_INJECTION case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); set_opt(sbi, FAULT_INJECTION); -#else - f2fs_msg(sb, KERN_INFO, - "FAULT_INJECTION was not selected"); -#endif break; + case Opt_fault_type: if (args->from && match_int(args, &arg)) return -EINVAL; -#ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); -#else - f2fs_msg(sb, KERN_INFO, - "FAULT_INJECTION was not selected"); -#endif break; +#else + case Opt_fault_injection: + f2fs_msg(sb, KERN_INFO, + "fault_injection options not supported"); + break; + + case Opt_fault_type: + f2fs_msg(sb, KERN_INFO, + "fault_type options not supported"); + break; +#endif case Opt_lazytime: sb->s_flags |= SB_LAZYTIME; break; From 7c1a000d466235c875a989971cfda344e6bb1166 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Sep 2018 09:16:07 +0800 Subject: [PATCH 17/62] f2fs: add SPDX license identifiers Remove the verbose license text from f2fs files and replace them with SPDX tags. This does not change the license of any of the code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 5 +---- fs/f2fs/acl.h | 5 +---- fs/f2fs/checkpoint.c | 5 +---- fs/f2fs/data.c | 5 +---- fs/f2fs/debug.c | 5 +---- fs/f2fs/dir.c | 5 +---- fs/f2fs/extent_cache.c | 5 +---- fs/f2fs/f2fs.h | 5 +---- fs/f2fs/file.c | 5 +---- fs/f2fs/gc.c | 5 +---- fs/f2fs/gc.h | 5 +---- fs/f2fs/hash.c | 5 +---- fs/f2fs/inline.c | 4 +--- fs/f2fs/inode.c | 5 +---- fs/f2fs/namei.c | 5 +---- fs/f2fs/node.c | 5 +---- fs/f2fs/node.h | 5 +---- fs/f2fs/recovery.c | 5 +---- fs/f2fs/segment.c | 5 +---- fs/f2fs/segment.h | 5 +---- fs/f2fs/shrinker.c | 5 +---- fs/f2fs/super.c | 5 +---- fs/f2fs/sysfs.c | 5 +---- fs/f2fs/trace.c | 5 +---- fs/f2fs/trace.h | 5 +---- fs/f2fs/xattr.c | 5 +---- fs/f2fs/xattr.h | 5 +---- include/linux/f2fs_fs.h | 5 +---- 28 files changed, 28 insertions(+), 111 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index cd82e1ce5d67..fa707cdd4120 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/acl.c * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext2/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include "f2fs.h" diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 2c685185c24d..b96823c59b15 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/acl.h * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext2/acl.h * * Copyright (C) 2001-2003 Andreas Gruenbacher, - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_ACL_H__ #define __F2FS_ACL_H__ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 59d0472013f4..d312d2829d5a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 26f38b224bb2..c8c4b54e2bbf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/data.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ebe649d9793c..d3c402183e3c 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs debugging statistics * @@ -5,10 +6,6 @@ * http://www.samsung.com/ * Copyright (c) 2012 Linux Foundation * Copyright (c) 2012 Greg Kroah-Hartman - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bd0348cc860f..c77a58038709 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/dir.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index a70cd2580eae..904ad7ba5a45 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs extent cache support * @@ -5,10 +6,6 @@ * Copyright (c) 2015 Samsung Electronics * Authors: Jaegeuk Kim * Chao Yu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 88b8d5073581..079f525d5764 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/f2fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 61f95731c858..357422a4c319 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/file.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c8d00422237..a4c1a419611d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/gc.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index c8619e408009..bbac9d3787bd 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/gc.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define GC_THREAD_MIN_WB_PAGES 1 /* * a threshold to determine diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index eb2e031ea887..cc82f142f811 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/hash.c * @@ -7,10 +8,6 @@ * Portions of this code from linux/fs/ext3/hash.c * * Copyright (C) 2002 by Theodore Ts'o - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 115dc219344b..425d740f87fd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inline.c * Copyright (c) 2013, Intel Corporation * Authors: Huajun Li * Haicheng Li - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 959df2249875..86e7333d60c1 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/inode.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1f67e389169f..9ad451ac2cec 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/namei.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dd2e45a661aa..fa2381c0bc47 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 0f4db7a61254..1c73d879a9bc 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ #define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e3aa6eee7a8b..56d34193a74b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/recovery.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 187c848a65b8..9a8d7d415a74 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index b3d9e317ff0c..086150028c6d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 36cfd816c160..9e13db994fdf 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs shrinker support * the basic infra was copied from fs/ubifs/shrinker.c * * Copyright (c) 2015 Motorola Mobility * Copyright (c) 2015 Jaegeuk Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3106da1d9be6..8c536105d5ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/super.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81c0e5337443..c8924c02accd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs sysfs interface * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * Copyright (c) 2017 Chao Yu - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index a1fcd00bbb2b..ce2a5eb210b6 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs IO tracer * * Copyright (c) 2014 Motorola Mobility * Copyright (c) 2014 Jaegeuk Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h index 67db24ac1e85..e8075fc5b228 100644 --- a/fs/f2fs/trace.h +++ b/fs/f2fs/trace.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * f2fs IO tracer * * Copyright (c) 2014 Motorola Mobility * Copyright (c) 2014 Jaegeuk Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_TRACE_H__ #define __F2FS_TRACE_H__ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 77a010e625f5..7261245c208d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/xattr.c * @@ -13,10 +14,6 @@ * suggestion of Luka Renko . * xattr consolidation Copyright (c) 2004 James Morris , * Red Hat Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include #include diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index dbcd1d16e669..67db134da0f5 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/xattr.h * @@ -9,10 +10,6 @@ * On-disk format of extended attributes for the ext2 filesystem. * * (C) 2001 Andreas Gruenbacher, - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __F2FS_XATTR_H__ #define __F2FS_XATTR_H__ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f70f8ac9c4f4..1d4b196291d6 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * include/linux/f2fs_fs.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _LINUX_F2FS_FS_H #define _LINUX_F2FS_FS_H From 6f5c2ed0a26fae6904a88622c126dfb9369548a3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Sep 2018 09:22:29 +0800 Subject: [PATCH 18/62] f2fs: split IO error injection according to RW This patch adds to support injecting error for write IO, this can simulate IO error like fail_make_request or dm_flakey does. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 3 ++- fs/f2fs/data.c | 10 ++++++++-- fs/f2fs/f2fs.h | 3 ++- fs/f2fs/super.c | 3 ++- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index e5edd29687b5..bde3e91e5372 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -172,9 +172,10 @@ fault_type=%d Support configuring fault injection type, should be FAULT_DIR_DEPTH 0x000000100 FAULT_EVICT_INODE 0x000000200 FAULT_TRUNCATE 0x000000400 - FAULT_IO 0x000000800 + FAULT_READ_IO 0x000000800 FAULT_CHECKPOINT 0x000001000 FAULT_DISCARD 0x000002000 + FAULT_WRITE_IO 0x000004000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c8c4b54e2bbf..57c0823d22e0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -123,8 +123,9 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { - if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { - f2fs_show_injection_info(FAULT_IO); + if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), + FAULT_READ_IO)) { + f2fs_show_injection_info(FAULT_READ_IO); bio->bi_status = BLK_STS_IOERR; } @@ -145,6 +146,11 @@ static void f2fs_write_end_io(struct bio *bio) struct bio_vec *bvec; int i; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { + f2fs_show_injection_info(FAULT_WRITE_IO); + bio->bi_status = BLK_STS_IOERR; + } + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 079f525d5764..b676b82312e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,9 +50,10 @@ enum { FAULT_DIR_DEPTH, FAULT_EVICT_INODE, FAULT_TRUNCATE, - FAULT_IO, + FAULT_READ_IO, FAULT_CHECKPOINT, FAULT_DISCARD, + FAULT_WRITE_IO, FAULT_MAX, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c536105d5ef..945468968d4e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -50,9 +50,10 @@ char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", [FAULT_TRUNCATE] = "truncate fail", - [FAULT_IO] = "IO error", + [FAULT_READ_IO] = "read IO error", [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", + [FAULT_WRITE_IO] = "write IO error", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, From a7d10cf3e4e3e308da01462a1ef8008233ee523d Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 19 Sep 2018 14:18:47 +0530 Subject: [PATCH 19/62] f2fs: add new idle interval timing for discard and gc paths This helps to control the frequency of submission of discard and GC requests independently, based on the need. Suggested-by: Chao Yu Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 17 +++++++++++++- fs/f2fs/f2fs.h | 30 ++++++++++++++++++++++--- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 14 +++++------- fs/f2fs/super.c | 2 ++ fs/f2fs/sysfs.c | 5 +++++ 6 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 94a24aedcdb2..3ac41774ad3c 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -121,7 +121,22 @@ What: /sys/fs/f2fs//idle_interval Date: January 2016 Contact: "Jaegeuk Kim" Description: - Controls the idle timing. + Controls the idle timing for all paths other than + discard and gc path. + +What: /sys/fs/f2fs//discard_idle_interval +Date: September 2018 +Contact: "Chao Yu" +Contact: "Sahitya Tummala" +Description: + Controls the idle timing for discard path. + +What: /sys/fs/f2fs//gc_idle_interval +Date: September 2018 +Contact: "Chao Yu" +Contact: "Sahitya Tummala" +Description: + Controls the idle timing for gc path. What: /sys/fs/f2fs//iostat_enable Date: August 2017 diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b676b82312e0..125f8b79a8c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1093,6 +1093,8 @@ enum { enum { CP_TIME, REQ_TIME, + DISCARD_TIME, + GC_TIME, MAX_TIME, }; @@ -1344,7 +1346,15 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { - sbi->last_time[type] = jiffies; + unsigned long now = jiffies; + + sbi->last_time[type] = now; + + /* DISCARD_TIME and GC_TIME are based on REQ_TIME */ + if (type == REQ_TIME) { + sbi->last_time[DISCARD_TIME] = now; + sbi->last_time[GC_TIME] = now; + } } static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) @@ -1354,7 +1364,21 @@ static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) return time_after(jiffies, sbi->last_time[type] + interval); } -static inline bool is_idle(struct f2fs_sb_info *sbi) +static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, + int type) +{ + unsigned long interval = sbi->interval_time[type] * HZ; + unsigned int wait_ms = 0; + long delta; + + delta = (sbi->last_time[type] + interval) - jiffies; + if (delta > 0) + wait_ms = jiffies_to_msecs(delta); + + return wait_ms; +} + +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { struct block_device *bdev = sbi->sb->s_bdev; struct request_queue *q = bdev_get_queue(bdev); @@ -1363,7 +1387,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) return false; - return f2fs_time_over(sbi, REQ_TIME); + return f2fs_time_over(sbi, type); } /* diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a4c1a419611d..77ffa8045a3b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -80,7 +80,7 @@ static int gc_thread_func(void *data) if (!mutex_trylock(&sbi->gc_mutex)) goto next; - if (!is_idle(sbi)) { + if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); goto next; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9a8d7d415a74..97a4fae75651 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -508,7 +508,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi) && + if (!is_idle(sbi, REQ_TIME) && (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; @@ -1308,7 +1308,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, if (dc->state != D_PREP) goto next; - if (dpolicy->io_aware && !is_idle(sbi)) { + if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } @@ -1368,7 +1368,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->io_aware && i < dpolicy->io_aware_gran && - !is_idle(sbi)) { + !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } @@ -1561,8 +1561,6 @@ static int issue_discard_thread(void *data) struct discard_policy dpolicy; unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; int issued; - unsigned long interval = sbi->interval_time[REQ_TIME] * HZ; - long delta; set_freezable(); @@ -1599,10 +1597,8 @@ static int issue_discard_thread(void *data) __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1){ - delta = (sbi->last_time[REQ_TIME] + interval) - jiffies; - if (delta > 0) - wait_ms = jiffies_to_msecs(delta); - else + wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); + if (!wait_ms) wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 945468968d4e..c112403b3ca7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2461,6 +2461,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c8924c02accd..f5a545437b81 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -404,6 +404,9 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, + interval_time[DISCARD_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -457,6 +460,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(discard_idle_interval), + ATTR_LIST(gc_idle_interval), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), From f84262b0862d43b71b3e80a036cdd9d82e620367 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Sep 2018 15:45:19 -0700 Subject: [PATCH 20/62] f2fs: avoid infinite loop in f2fs_alloc_nid If we have an error in f2fs_build_free_nids, we're able to fall into a loop to find free nids. Suggested-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fa2381c0bc47..d62f53096b55 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2350,8 +2350,9 @@ retry: spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - f2fs_build_free_nids(sbi, true, false); - goto retry; + if (!f2fs_build_free_nids(sbi, true, false)) + goto retry; + return false; } /* From dc4cd1257c86451cec3e8e352cc376348e4f4af4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 20 Sep 2018 17:41:30 +0800 Subject: [PATCH 21/62] f2fs: fix to recover inode's uid/gid during POR Step to reproduce this bug: 1. logon as root 2. mount -t f2fs /dev/sdd /mnt; 3. touch /mnt/file; 4. chown system /mnt/file; chgrp system /mnt/file; 5. xfs_io -f /mnt/file -c "fsync"; 6. godown /mnt; 7. umount /mnt; 8. mount -t f2fs /dev/sdd /mnt; After step 8) we will expect file's uid/gid are all system, but during recovery, these two fields were not been recovered, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 56d34193a74b..41f2c0fe6d8e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -209,6 +209,8 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); + i_uid_write(inode, le32_to_cpu(raw->i_uid)); + i_gid_write(inode, le32_to_cpu(raw->i_gid)); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); From c6b1867b1da3b1203b4c49988afeebdcbdf65499 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 22 Sep 2018 22:43:09 +0800 Subject: [PATCH 22/62] f2fs: fix remount problem of option io_bits Currently we show mount option "io_bits=%u" as "io_size=%uKB", it will cause option parsing problem(unrecognized mount option) in remount. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c112403b3ca7..c47b1ef2685a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1330,7 +1330,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) from_kgid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resgid)); if (F2FS_IO_SIZE_BITS(sbi)) - seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); + seq_printf(seq, ",io_bits=%u", + F2FS_OPTION(sbi).write_io_size_bits); #ifdef CONFIG_F2FS_FAULT_INJECTION if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", From d83d0f5ba8532e649146ac32ae47167a28d98c84 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Sep 2018 13:25:04 -0700 Subject: [PATCH 23/62] f2fs: report ENOENT correctly in f2fs_rename This fixes wrong error report in f2fs_rename. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9ad451ac2cec..ded185baa9ae 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -820,7 +820,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; bool is_old_inline = f2fs_has_inline_dentry(old_dir); - int err = -ENOENT; + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; @@ -844,6 +844,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -1009,7 +1010,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; struct f2fs_dir_entry *old_entry, *new_entry; int old_nlink = 0, new_nlink = 0; - int err = -ENOENT; + int err; if (unlikely(f2fs_cp_error(sbi))) return -EIO; @@ -1030,6 +1031,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out; + err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) From 0a4daae5ffea39f5015334e4d18a6a80b447cae4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Sep 2018 15:28:40 -0700 Subject: [PATCH 24/62] f2fs: update i_size after DIO completion This is related to ee70daaba82d ("xfs: update i_size after unwritten conversion in dio completion") If we update i_size during dio_write, dio_read can read out stale data, which breaks xfstests/465. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 15 +++++++-------- fs/f2fs/f2fs.h | 1 + 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 57c0823d22e0..41102c227eee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -881,7 +881,6 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) struct f2fs_summary sum; struct node_info ni; block_t old_blkaddr; - pgoff_t fofs; blkcnt_t count = 1; int err; @@ -910,12 +909,10 @@ alloc: old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); - /* update i_size */ - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + - dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) - f2fs_i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_SHIFT)); + /* + * i_size will be updated by direct_IO. Otherwise, we'll get stale + * data from unwritten block via dio_read. + */ return 0; } @@ -1081,6 +1078,8 @@ next_block: last_ofs_in_node = dn.ofs_in_node; } } else { + WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO && + flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); if (!err) @@ -1260,7 +1259,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DEFAULT, NULL, + F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type( inode->i_write_hint)); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 125f8b79a8c1..b32fd9f7da95 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -599,6 +599,7 @@ enum { F2FS_GET_BLOCK_DEFAULT, F2FS_GET_BLOCK_FIEMAP, F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_DIO, F2FS_GET_BLOCK_PRE_DIO, F2FS_GET_BLOCK_PRE_AIO, F2FS_GET_BLOCK_PRECACHE, From f4474aa6e5e901ee4af21f39f1b9115aaaaec503 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:35:58 +0800 Subject: [PATCH 25/62] f2fs: fix to recover inode's project id during POR Testcase to reproduce this bug: 1. mkfs.f2fs -O extra_attr -O project_quota /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. touch /mnt/f2fs/file 4. sync 5. chattr -p 1 /mnt/f2fs/file 6. xfs_io -f /mnt/f2fs/file -c "fsync" 7. godown /mnt/f2fs 8. umount /mnt/f2fs 9. mount -t f2fs /dev/sdd /mnt/f2fs 10. lsattr -p /mnt/f2fs/file 0 -----------------N- /mnt/f2fs/file But actually, we expect the correct result is: 1 -----------------N- /mnt/f2fs/file The reason is we didn't recover inode.i_projid field during mount, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 41f2c0fe6d8e..2b59e37d9093 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -211,6 +211,19 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); i_uid_write(inode, le32_to_cpu(raw->i_uid)); i_gid_write(inode, le32_to_cpu(raw->i_gid)); + + if (raw->i_inline & F2FS_EXTRA_ATTR) { + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), + i_projid)) { + projid_t i_projid; + + i_projid = (projid_t)le32_to_cpu(raw->i_projid); + F2FS_I(inode)->i_projid = + make_kprojid(&init_user_ns, i_projid); + } + } + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); From 19c73a691ccf6fb2f12d4e9cf9830023966cec88 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:35:59 +0800 Subject: [PATCH 26/62] f2fs: fix to recover inode's i_flags during POR Testcase to reproduce this bug: 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. touch /mnt/f2fs/file 4. sync 5. chattr +A /mnt/f2fs/file 6. xfs_io -f /mnt/f2fs/file -c "fsync" 7. godown /mnt/f2fs 8. umount /mnt/f2fs 9. mount -t f2fs /dev/sdd /mnt/f2fs 10. lsattr /mnt/f2fs/file -----------------N- /mnt/f2fs/file But actually, we expect the corrct result is: -------A---------N- /mnt/f2fs/file The reason is we didn't recover inode.i_flags field during mount, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2b59e37d9093..d8c169c38417 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -233,6 +233,7 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); F2FS_I(inode)->i_advise = raw->i_advise; + F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); recover_inline_flags(inode, raw); From 7de36cf3e4087207f42a88992f8cb615a1bd902e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:36:00 +0800 Subject: [PATCH 27/62] f2fs: fix to recover inode's i_gc_failures during POR inode.i_gc_failures is used to indicate that skip count of migrating on blocks of inode, we should guarantee it can be recovered in sudden power-off case. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d8c169c38417..f11eefc24818 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -234,6 +234,8 @@ static void recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); + F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = + le16_to_cpu(raw->i_gc_failures); recover_inline_flags(inode, raw); From 5cd1f387a13b5188b4edb4c834310302a85a6ea2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:36:01 +0800 Subject: [PATCH 28/62] f2fs: fix to recover inode's crtime during POR Testcase to reproduce this bug: 1. mkfs.f2fs -O extra_attr -O inode_crtime /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. touch /mnt/f2fs/file 4. xfs_io -f /mnt/f2fs/file -c "fsync" 5. godown /mnt/f2fs 6. umount /mnt/f2fs 7. mount -t f2fs /dev/sdd /mnt/f2fs 8. xfs_io -f /mnt/f2fs/file -c "statx -r" stat.btime.tv_sec = 0 stat.btime.tv_nsec = 0 This patch fixes to recover inode creation time fields during mount. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d62f53096b55..8f4e12819c94 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2558,6 +2558,13 @@ retry: F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), i_projid)) dst->i_projid = src->i_projid; + + if (f2fs_sb_has_inode_crtime(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_crtime_nsec)) { + dst->i_crtime = src->i_crtime; + dst->i_crtime_nsec = src->i_crtime_nsec; + } } new_ni = old_ni; From 4a1728cad6340bfbe17bd17fd158b2165cd99508 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:36:03 +0800 Subject: [PATCH 29/62] f2fs: mark inode dirty explicitly in recover_inode() Mark inode dirty explicitly in the end of recover_inode() to make sure that all recoverable fields can be persisted later. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index f11eefc24818..fb24a6d734e9 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -239,6 +239,8 @@ static void recover_inode(struct inode *inode, struct page *page) recover_inline_flags(inode, raw); + f2fs_mark_inode_dirty_sync(inode, true); + if (file_enc_name(inode)) name = ""; else From edc55aaf0d1712b54a3704dd58423c7e495534fe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Sep 2018 17:36:06 -0700 Subject: [PATCH 30/62] f2fs: avoid f2fs_bug_on if f2fs_get_meta_page_nofail got EIO This patch avoids BUG_ON when f2fs_get_meta_page_nofail got EIO during xfstests/generic/475. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 10 +++++----- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 12 ++++++++++++ fs/f2fs/node.c | 32 ++++++++++++++++++++++++-------- fs/f2fs/recovery.c | 2 ++ fs/f2fs/segment.c | 3 +++ 6 files changed, 47 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d312d2829d5a..f4b9b81e3ed6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -119,11 +119,8 @@ retry: if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; - f2fs_stop_checkpoint(sbi, false); - f2fs_bug_on(sbi, 1); } - return page; } @@ -1496,7 +1493,10 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - f2fs_flush_nat_entries(sbi, cpc); + err = f2fs_flush_nat_entries(sbi, cpc); + if (err) + goto stop; + f2fs_flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ @@ -1505,7 +1505,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_release_discard_addrs(sbi); else f2fs_clear_prefree_segments(sbi, cpc); - +stop: unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b32fd9f7da95..dcfa6a94676e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2911,7 +2911,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_node_manager_caches(void); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 77ffa8045a3b..247b31e73303 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1066,6 +1066,18 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* reference all summary page */ while (segno < end_segno) { sum_page = f2fs_get_sum_page(sbi, segno++); + if (IS_ERR(sum_page)) { + int err = PTR_ERR(sum_page); + + end_segno = segno - 1; + for (segno = start_segno; segno < end_segno; segno++) { + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_put_page(sum_page, 0); + f2fs_put_page(sum_page, 0); + } + return err; + } unlock_page(sum_page); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8f4e12819c94..3994b44541b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -126,6 +126,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_current_nat_page(sbi, nid); + if (IS_ERR(src_page)) + return src_page; dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); @@ -2265,15 +2267,19 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - ret = scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + } else { + ret = scan_nat_page(sbi, page, nid); + f2fs_put_page(page, 1); + } if (ret) { up_read(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, !mount); f2fs_msg(sbi->sb, KERN_ERR, "NAT is corrupt, run fsck to fix it"); - return -EINVAL; + return ret; } } @@ -2708,7 +2714,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, __clear_bit_le(nat_index, nm_i->full_nat_bits); } -static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, +static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2732,6 +2738,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = page_address(page); f2fs_bug_on(sbi, !nat_blk); } @@ -2777,12 +2786,13 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); kmem_cache_free(nat_entry_set_slab, set); } + return 0; } /* * This function is called during the checkpointing process. */ -void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2792,6 +2802,7 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int found; nid_t set_idx = 0; LIST_HEAD(sets); + int err = 0; /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ if (enabled_nat_bits(sbi, cpc)) { @@ -2801,7 +2812,7 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (!nm_i->dirty_nat_cnt) - return; + return 0; down_write(&nm_i->nat_tree_lock); @@ -2824,11 +2835,16 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* flush dirty nats in nat entry set */ - list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set, cpc); + list_for_each_entry_safe(set, tmp, &sets, set_list) { + err = __flush_nat_entry_set(sbi, set, cpc); + if (err) + break; + } up_write(&nm_i->nat_tree_lock); /* Allow dirty nats by node block allocation in write_begin */ + + return err; } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fb24a6d734e9..93e3b6c46aae 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -375,6 +375,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } sum_page = f2fs_get_sum_page(sbi, segno); + if (IS_ERR(sum_page)) + return PTR_ERR(sum_page); sum_node = (struct f2fs_summary_block *)page_address(sum_page); sum = sum_node->entries[blkoff]; f2fs_put_page(sum_page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 97a4fae75651..d33e9d0d5d47 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2429,6 +2429,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type) __next_free_blkoff(sbi, curseg, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); + f2fs_bug_on(sbi, IS_ERR(sum_page)); sum_node = (struct f2fs_summary_block *)page_address(sum_page); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_put_page(sum_page, 1); @@ -3903,6 +3904,8 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); + if (IS_ERR(page)) + return PTR_ERR(page); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); From 61f7725aa148ee870436a29d3a24d5c00ab7e9af Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 25 Sep 2018 15:25:21 -0700 Subject: [PATCH 31/62] f2fs: return correct errno in f2fs_gc This fixes overriding error number in f2fs_gc. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 247b31e73303..f541aa3ff7d5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1253,7 +1253,7 @@ stop: put_gc_inode(&gc_list); - if (sync) + if (sync && !ret) ret = sec_freed ? 0 : -EAGAIN; return ret; } From 89d13c38501df730cbb2e02c4499da1b5187119d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Sep 2018 22:15:31 -0700 Subject: [PATCH 32/62] f2fs: fix missing up_read This patch fixes missing up_read call. Fixes: c9b60788fc76 ("f2fs: fix to do sanity check with block address in main area") Cc: # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3994b44541b4..acb819b8fc42 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1541,8 +1541,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, } if (__is_valid_data_blkaddr(ni.blk_addr) && - !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) { + up_read(&sbi->node_write); goto redirty_out; + } if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; From 095680f24f2673d860fd1d3d2f54f40f330b4c63 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Sep 2018 00:24:39 -0700 Subject: [PATCH 33/62] f2fs: keep lazytime on remount This patch fixes losing lazytime when remounting f2fs. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c47b1ef2685a..218695e44bd4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1550,6 +1550,7 @@ skip: (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_gc: if (need_restart_gc) { From b63e7be5908cb757fc6b98fb9534dabd78b5338a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Sep 2018 18:31:27 +0800 Subject: [PATCH 34/62] f2fs: add to account meta IO This patch supports to account meta IO, it enables to show write IO from f2fs more comprehensively via 'status' debugfs entry. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 13 +++++++++++++ fs/f2fs/f2fs.h | 15 +++++++++++++++ fs/f2fs/segment.c | 1 + 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index d3c402183e3c..da1cabbc4973 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -118,6 +118,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } + for (i = META_CP; i < META_MAX; i++) + si->meta_count[i] = atomic_read(&sbi->meta_count[i]); + for (i = 0; i < 2; i++) { si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; @@ -329,6 +332,13 @@ static int stat_show(struct seq_file *s, void *v) si->prefree_count, si->free_segs, si->free_secs); seq_printf(s, "CP calls: %d (BG: %d)\n", si->cp_count, si->bg_cp_count); + seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]); + seq_printf(s, " - sit blocks : %u\n", + si->meta_count[META_SIT]); + seq_printf(s, " - nat blocks : %u\n", + si->meta_count[META_NAT]); + seq_printf(s, " - ssa blocks : %u\n", + si->meta_count[META_SSA]); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -441,6 +451,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + int i; si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL); if (!si) @@ -466,6 +477,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_inode, 0); atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + for (i = META_CP; i < META_MAX; i++) + atomic_set(&sbi->meta_count[i], 0); atomic_set(&sbi->aw_cnt, 0); atomic_set(&sbi->vw_cnt, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dcfa6a94676e..46cdb532e47d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -201,6 +201,7 @@ enum { META_NAT, META_SIT, META_SSA, + META_MAX, META_POR, DATA_GENERIC, META_GENERIC, @@ -1260,6 +1261,7 @@ struct f2fs_sb_info { */ #ifdef CONFIG_F2FS_STAT_FS struct f2fs_stat_info *stat_info; /* FS status information */ + atomic_t meta_count[META_MAX]; /* # of meta blocks */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ atomic_t inplace_count; /* # of inplace update */ @@ -3123,6 +3125,7 @@ struct f2fs_stat_info { int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; + unsigned int meta_count[META_MAX]; unsigned int segment_count[2]; unsigned int block_count[2]; unsigned int inplace_count; @@ -3174,6 +3177,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (f2fs_has_inline_dentry(inode)) \ (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) +#define stat_inc_meta_count(sbi, blkaddr) \ + do { \ + if (blkaddr < SIT_I(sbi)->sit_base_addr) \ + atomic_inc(&(sbi)->meta_count[META_CP]); \ + else if (blkaddr < NM_I(sbi)->nat_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SIT]); \ + else if (blkaddr < SM_I(sbi)->ssa_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_NAT]); \ + else if (blkaddr < SM_I(sbi)->main_blkaddr) \ + atomic_inc(&(sbi)->meta_count[META_SSA]); \ + } while (0) #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ @@ -3261,6 +3275,7 @@ void f2fs_destroy_root_stats(void); #define stat_inc_volatile_write(inode) do { } while (0) #define stat_dec_volatile_write(inode) do { } while (0) #define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_meta_count(sbi, blkaddr) do { } while (0) #define stat_inc_seg_type(sbi, curseg) do { } while (0) #define stat_inc_block_count(sbi, curseg) do { } while (0) #define stat_inc_inplace_blocks(sbi) do { } while (0) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d33e9d0d5d47..3fa8870569cc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3018,6 +3018,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page, ClearPageError(page); f2fs_submit_page_write(&fio); + stat_inc_meta_count(sbi, page->index); f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } From 274bd9ba39425610fdb9a6827602197a5cd27cd8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Sep 2018 18:31:28 +0800 Subject: [PATCH 35/62] f2fs: add to account skip count of background GC This patch adds to account skip count of background GC, and show stat info via 'status' debugfs entry. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++++ fs/f2fs/f2fs.h | 7 +++++++ fs/f2fs/gc.c | 14 +++++++++++--- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index da1cabbc4973..75bc62edc4c1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -101,6 +101,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID]; si->bg_gc = sbi->bg_gc; + si->io_skip_bggc = sbi->io_skip_bggc; + si->other_skip_bggc = sbi->other_skip_bggc; si->skipped_atomic_files[BG_GC] = sbi->skipped_atomic_files[BG_GC]; si->skipped_atomic_files[FG_GC] = sbi->skipped_atomic_files[FG_GC]; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -355,6 +357,8 @@ static int stat_show(struct seq_file *s, void *v) si->skipped_atomic_files[BG_GC] + si->skipped_atomic_files[FG_GC], si->skipped_atomic_files[BG_GC]); + seq_printf(s, "BG skip : IO: %u, Other: %u\n", + si->io_skip_bggc, si->other_skip_bggc); seq_puts(s, "\nExtent Cache:\n"); seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n", si->hit_largest, si->hit_cached, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 46cdb532e47d..19243678d5d9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1277,6 +1277,8 @@ struct f2fs_sb_info { atomic_t max_aw_cnt; /* max # of atomic writes */ atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ + unsigned int io_skip_bggc; /* skip background gc for in-flight IO */ + unsigned int other_skip_bggc; /* skip background gc for other reasons */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif spinlock_t stat_lock; /* lock for stat operations */ @@ -3104,6 +3106,7 @@ struct f2fs_stat_info { int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; + unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; int nr_discard_cmd; @@ -3141,6 +3144,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) +#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++) +#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++) #define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) #define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) @@ -3257,6 +3262,8 @@ void f2fs_destroy_root_stats(void); #define stat_inc_bg_cp_count(si) do { } while (0) #define stat_inc_call_count(si) do { } while (0) #define stat_inc_bggc_count(si) do { } while (0) +#define stat_io_skip_bggc_count(sbi) do { } while (0) +#define stat_other_skip_bggc_count(sbi) do { } while (0) #define stat_inc_dirty_inode(sbi, type) do { } while (0) #define stat_dec_dirty_inode(sbi, type) do { } while (0) #define stat_inc_total_hit(sb) do { } while (0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f541aa3ff7d5..99ed8a5d9249 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -40,13 +40,16 @@ static int gc_thread_func(void *data) if (gc_th->gc_wake) gc_th->gc_wake = 0; - if (try_to_freeze()) + if (try_to_freeze()) { + stat_other_skip_bggc_count(sbi); continue; + } if (kthread_should_stop()) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { increase_sleep_time(gc_th, &wait_ms); + stat_other_skip_bggc_count(sbi); continue; } @@ -55,8 +58,10 @@ static int gc_thread_func(void *data) f2fs_stop_checkpoint(sbi, false); } - if (!sb_start_write_trylock(sbi->sb)) + if (!sb_start_write_trylock(sbi->sb)) { + stat_other_skip_bggc_count(sbi); continue; + } /* * [GC triggering condition] @@ -77,12 +82,15 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!mutex_trylock(&sbi->gc_mutex)) + if (!mutex_trylock(&sbi->gc_mutex)) { + stat_other_skip_bggc_count(sbi); goto next; + } if (!is_idle(sbi, GC_TIME)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); + stat_io_skip_bggc_count(sbi); goto next; } From d440c52d3151a28358f4c2d52d8583a0aa54ab83 Mon Sep 17 00:00:00 2001 From: Junling Zheng Date: Fri, 28 Sep 2018 20:25:56 +0800 Subject: [PATCH 36/62] f2fs: support superblock checksum Now we support crc32 checksum for superblock. Reviewed-by: Chao Yu Signed-off-by: Junling Zheng Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 28 ++++++++++++++++++++++++++++ fs/f2fs/sysfs.c | 7 +++++++ include/linux/f2fs_fs.h | 3 ++- 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19243678d5d9..668836c2d678 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -148,6 +148,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_INODE_CRTIME 0x0100 #define F2FS_FEATURE_LOST_FOUND 0x0200 #define F2FS_FEATURE_VERITY 0x0400 /* reserved */ +#define F2FS_FEATURE_SB_CHKSUM 0x0800 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3431,6 +3432,7 @@ F2FS_FEATURE_FUNCS(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_FUNCS(quota_ino, QUOTA_INO); F2FS_FEATURE_FUNCS(inode_crtime, INODE_CRTIME); F2FS_FEATURE_FUNCS(lost_found, LOST_FOUND); +F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 218695e44bd4..a44913224e3b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2178,6 +2178,26 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, (bh->b_data + F2FS_SUPER_OFFSET); struct super_block *sb = sbi->sb; unsigned int blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + /* Check checksum_offset and crc in superblock */ + if (le32_to_cpu(raw_super->feature) & F2FS_FEATURE_SB_CHKSUM) { + crc_offset = le32_to_cpu(raw_super->checksum_offset); + if (crc_offset != + offsetof(struct f2fs_super_block, crc)) { + f2fs_msg(sb, KERN_INFO, + "Invalid SB checksum offset: %zu", + crc_offset); + return 1; + } + crc = le32_to_cpu(raw_super->crc); + if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) { + f2fs_msg(sb, KERN_INFO, + "Invalid SB checksum value: %u", crc); + return 1; + } + } if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { f2fs_msg(sb, KERN_INFO, @@ -2635,6 +2655,7 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi, int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { struct buffer_head *bh; + __u32 crc = 0; int err; if ((recover && f2fs_readonly(sbi->sb)) || @@ -2643,6 +2664,13 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return -EROFS; } + /* we should update superblock crc here */ + if (!recover && f2fs_sb_has_sb_chksum(sbi->sb)) { + crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), + offsetof(struct f2fs_super_block, crc)); + F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); + } + /* write back-up superblock first */ bh = sb_bread(sbi->sb, sbi->valid_super_block ? 0 : 1); if (!bh) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f5a545437b81..b777cbdd796b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -117,6 +117,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_lost_found(sb)) len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); + if (f2fs_sb_has_sb_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "sb_checksum"); len += snprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -334,6 +337,7 @@ enum feat_id { FEAT_QUOTA_INO, FEAT_INODE_CRTIME, FEAT_LOST_FOUND, + FEAT_SB_CHECKSUM, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -350,6 +354,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_QUOTA_INO: case FEAT_INODE_CRTIME: case FEAT_LOST_FOUND: + case FEAT_SB_CHECKSUM: return snprintf(buf, PAGE_SIZE, "supported\n"); } return 0; @@ -434,6 +439,7 @@ F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR); F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO); F2FS_FEATURE_RO_ATTR(inode_crtime, FEAT_INODE_CRTIME); F2FS_FEATURE_RO_ATTR(lost_found, FEAT_LOST_FOUND); +F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -493,6 +499,7 @@ static struct attribute *f2fs_feat_attrs[] = { ATTR_LIST(quota_ino), ATTR_LIST(inode_crtime), ATTR_LIST(lost_found), + ATTR_LIST(sb_checksum), NULL, }; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 1d4b196291d6..1db13ff9a3f4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -109,7 +109,8 @@ struct f2fs_super_block { struct f2fs_device devs[MAX_DEVICES]; /* device list */ __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */ __u8 hot_ext_count; /* # of hot file extension */ - __u8 reserved[314]; /* valid reserved region */ + __u8 reserved[310]; /* valid reserved region */ + __le32 crc; /* checksum of superblock */ } __packed; /* From bab475c5414e8d1fa182fd17ae966864e9c85741 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Sep 2018 23:41:16 +0800 Subject: [PATCH 37/62] Revert: "f2fs: check last page index in cached bio to decide submission" There is one case that we can leave bio in f2fs, result in hanging page writeback waiter. Thread A Thread B - f2fs_write_cache_pages - f2fs_submit_page_write page #0 cached in bio #0 of cold log - f2fs_submit_page_write page #1 cached in bio #1 of warm log - f2fs_write_cache_pages - f2fs_submit_page_write bio is full, submit bio #1 contain page #1 - f2fs_submit_merged_write_cond(, page #1) fail to submit bio #0 due to page #1 is not in any cached bios. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 38 +++++++++++++++++++------------------- fs/f2fs/f2fs.h | 4 ++-- fs/f2fs/node.c | 11 +++++------ fs/f2fs/segment.c | 11 +++++------ 5 files changed, 32 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f4b9b81e3ed6..97b429ba2911 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -276,8 +276,7 @@ static int __f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_write_cond(sbi, page->mapping->host, - 0, page->index, META); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META); unlock_page(page); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 41102c227eee..ea16cadd416e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -322,8 +322,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, - struct inode *inode, nid_t ino, pgoff_t idx) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) { struct bio_vec *bvec; struct page *target; @@ -332,7 +332,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, if (!io->bio) return false; - if (!inode && !ino) + if (!inode && !page && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -342,11 +342,10 @@ static bool __has_merged_page(struct f2fs_bio_info *io, else target = fscrypt_control_page(bvec->bv_page); - if (idx != target->index) - continue; - if (inode && inode == target->mapping->host) return true; + if (page && page == target) + return true; if (ino && ino == ino_of_node(target)) return true; } @@ -355,7 +354,8 @@ static bool __has_merged_page(struct f2fs_bio_info *io, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - nid_t ino, pgoff_t idx, enum page_type type) + struct page *page, nid_t ino, + enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); enum temp_type temp; @@ -366,7 +366,7 @@ static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, io = sbi->write_io[btype] + temp; down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, ino, idx); + ret = __has_merged_page(io, inode, page, ino); up_read(&io->io_rwsem); /* TODO: use HOT temp only for meta pages now. */ @@ -397,12 +397,12 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, } static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type, bool force) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, bool force) { enum temp_type temp; - if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + if (!force && !has_merged_page(sbi, inode, page, ino, type)) return; for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { @@ -421,10 +421,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) } void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type) + struct inode *inode, struct page *page, + nid_t ino, enum page_type type) { - __submit_merged_write_cond(sbi, inode, ino, idx, type, false); + __submit_merged_write_cond(sbi, inode, page, ino, type, false); } void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) @@ -1952,7 +1952,7 @@ out: ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); clear_inode_flag(inode, FI_HOT_DATA); f2fs_remove_dirty_inode(inode); submitted = NULL; @@ -2010,10 +2010,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; + int nwritten = 0; pagevec_init(&pvec); @@ -2116,7 +2116,7 @@ continue_unlock: done = 1; break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -2138,9 +2138,9 @@ continue_unlock: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (last_idx != ULONG_MAX) + if (nwritten) f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, - 0, last_idx, DATA); + NULL, 0, DATA); return ret; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 668836c2d678..f633662bf645 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3032,8 +3032,8 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, - struct inode *inode, nid_t ino, pgoff_t idx, - enum page_type type); + struct inode *inode, struct page *page, + nid_t ino, enum page_type type); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); void f2fs_submit_page_write(struct f2fs_io_info *fio); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index acb819b8fc42..b1e3ff8147f6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1565,8 +1565,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, up_read(&sbi->node_write); if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, - page->index, NODE); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); submitted = NULL; } @@ -1631,13 +1630,13 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unsigned int *seq_id) { pgoff_t index; - pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_pages; + int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1715,7 +1714,7 @@ continue_unlock: f2fs_put_page(last_page, 0); break; } else if (submitted) { - last_idx = page->index; + nwritten++; } if (page == last_page) { @@ -1741,8 +1740,8 @@ continue_unlock: goto retry; } out: - if (last_idx != ULONG_MAX) - f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); + if (nwritten) + f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); return ret ? -EIO: 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 3fa8870569cc..e16dae0f0a5b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -371,7 +371,7 @@ static int __f2fs_commit_inmem_pages(struct inode *inode) .io_type = FS_DATA_IO, }; struct list_head revoke_list; - pgoff_t last_idx = ULONG_MAX; + bool submit_bio = false; int err = 0; INIT_LIST_HEAD(&revoke_list); @@ -406,14 +406,14 @@ retry: } /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - last_idx = page->index; + submit_bio = true; } unlock_page(page); list_move_tail(&cur->list, &revoke_list); } - if (last_idx != ULONG_MAX) - f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); + if (submit_bio) + f2fs_submit_merged_write_cond(sbi, inode, NULL, 0, DATA); if (err) { /* @@ -3181,8 +3181,7 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_write_cond(sbi, page->mapping->host, - 0, page->index, type); + f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); if (ordered) wait_on_page_writeback(page); else From 39a8695824510a951ded696d69b8dea3c720b109 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Sep 2018 18:33:18 +0800 Subject: [PATCH 38/62] f2fs: refactor ->page_mkwrite() flow Thread A Thread B - f2fs_vm_page_mkwrite - f2fs_setattr - down_write(i_mmap_sem) - truncate_setsize - f2fs_truncate - up_write(i_mmap_sem) - f2fs_reserve_block reserve NEW_ADDR - skip dirty page due to truncation 1. we don't need to rserve new block address for a truncated page. 2. dn.data_blkaddr is used out of node page lock coverage. Refactor ->page_mkwrite() flow to fix above issues: - use __do_map_lock() to avoid racing checkpoint() - lock data page in prior to dnode page - cover f2fs_reserve_block with i_mmap_sem lock - wait page writeback before zeroing page Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 45 ++++++++++++++++++++++----------------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ea16cadd416e..3f01bc2d73eb 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -972,7 +972,7 @@ map_blocks: return err; } -static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { if (lock) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f633662bf645..05217aad4c0b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3055,6 +3055,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); +void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 357422a4c319..6db808c80813 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -47,7 +47,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; + struct dnode_of_data dn = { .node_changed = false }; int err; if (unlikely(f2fs_cp_error(sbi))) { @@ -59,19 +59,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - /* block allocation */ - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_reserve_block(&dn, page->index); - if (err) { - f2fs_unlock_op(sbi); - goto out; - } - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - - f2fs_balance_fs(sbi, dn.node_changed); - file_update_time(vmf->vma->vm_file); down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); @@ -83,11 +70,28 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } + /* block allocation */ + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_block(&dn, page->index); + f2fs_put_dnode(&dn); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + if (err) { + unlock_page(page); + goto out_sem; + } + + /* fill the page */ + f2fs_wait_on_page_writeback(page, DATA, false); + + /* wait for GCed page writeback via META_MAPPING */ + f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); + /* * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) - goto mapped; + goto out_sem; /* page is wholly or partially inside EOF */ if (((loff_t)(page->index + 1) << PAGE_SHIFT) > @@ -104,16 +108,11 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); trace_f2fs_vm_page_mkwrite(page, DATA); -mapped: - /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA, false); - - /* wait for GCed page writeback via META_MAPPING */ - f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); -out: + + f2fs_balance_fs(sbi, dn.node_changed); + sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); err: From f847c699cff3f050286ee0a08632046468e7a511 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 27 Sep 2018 18:34:52 +0800 Subject: [PATCH 39/62] f2fs: allow out-place-update for direct IO in LFS mode Normally, DIO uses in-pllace-update, but in LFS mode, f2fs doesn't allow triggering any in-place-update writes, so we fallback direct write to buffered write, result in bad performance of large size write. This patch adds to support triggering out-place-update for direct IO to enhance its performance. Note that it needs to exclude direct read IO during direct write, since new data writing to new block address will no be valid until write finished. storage: zram time xfs_io -f -d /mnt/f2fs/file -c "pwrite 0 1073741824" -c "fsync" Before: real 0m13.061s user 0m0.327s sys 0m12.486s After: real 0m6.448s user 0m0.228s sys 0m6.212s Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 44 +++++++++++++++++++++++++++++++++++--------- fs/f2fs/f2fs.h | 45 +++++++++++++++++++++++++++++++++++++++++---- fs/f2fs/file.c | 3 ++- 3 files changed, 78 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3f01bc2d73eb..02d5ce888a4a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -893,7 +893,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); - if (dn->data_blkaddr == NEW_ADDR) + if (dn->data_blkaddr != NULL_ADDR) goto alloc; if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) @@ -947,7 +947,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) if (direct_io) { map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, WRITE) ? + flag = f2fs_force_buffered_io(inode, iocb, from) ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; goto map_blocks; @@ -1066,7 +1066,15 @@ next_block: goto sync_out; } - if (!is_valid_data_blkaddr(sbi, blkaddr)) { + if (is_valid_data_blkaddr(sbi, blkaddr)) { + /* use out-place-update for driect IO under LFS mode */ + if (test_opt(sbi, LFS) && create && + flag == F2FS_GET_BLOCK_DIO) { + err = __allocate_data_block(&dn, map->m_seg_type); + if (!err) + set_inode_flag(inode, FI_APPEND_WRITE); + } + } else { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -2486,36 +2494,53 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; int rw = iov_iter_rw(iter); int err; enum rw_hint hint = iocb->ki_hint; int whint_mode = F2FS_OPTION(sbi).whint_mode; + bool do_opu; err = check_direct_IO(inode, iter, offset); if (err) return err < 0 ? err : 0; - if (f2fs_force_buffered_io(inode, rw)) + if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; + do_opu = allow_outplace_dio(inode, iocb, iter); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); if (rw == WRITE && whint_mode == WHINT_MODE_OFF) iocb->ki_hint = WRITE_LIFE_NOT_SET; - if (!down_read_trylock(&F2FS_I(inode)->i_gc_rwsem[rw])) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { iocb->ki_hint = hint; err = -EAGAIN; goto out; } - down_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[rw]); + iocb->ki_hint = hint; + err = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[rw]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); } err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - up_read(&F2FS_I(inode)->i_gc_rwsem[rw]); + + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + + up_read(&fi->i_gc_rwsem[rw]); if (rw == WRITE) { if (whint_mode == WHINT_MODE_OFF) @@ -2523,7 +2548,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err > 0) { f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, err); - set_inode_flag(inode, FI_UPDATE_WRITE); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); } else if (err < 0) { f2fs_write_failed(mapping, offset + count); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 05217aad4c0b..95d9edd8ff6e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -8,6 +8,7 @@ #ifndef _LINUX_F2FS_H #define _LINUX_F2FS_H +#include #include #include #include @@ -3491,11 +3492,47 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) +static inline int block_unaligned_IO(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) { - return (f2fs_post_read_required(inode) || - (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); + unsigned int i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned int blocksize_mask = (1 << i_blkbits) - 1; + loff_t offset = iocb->ki_pos; + unsigned long align = offset | iov_iter_alignment(iter); + + return align & blocksize_mask; +} + +static inline int allow_outplace_dio(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + return (test_opt(sbi, LFS) && (rw == WRITE) && + !block_unaligned_IO(inode, iocb, iter)); +} + +static inline bool f2fs_force_buffered_io(struct inode *inode, + struct kiocb *iocb, struct iov_iter *iter) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int rw = iov_iter_rw(iter); + + if (f2fs_post_read_required(inode)) + return true; + if (sbi->s_ndevs) + return true; + /* + * for blkzoned device, fallback direct IO to buffered IO, so + * all IOs can be serialized by log-structured write. + */ + if (f2fs_sb_has_blkzoned(sbi->sb)) + return true; + if (test_opt(sbi, LFS) && (rw == WRITE) && + block_unaligned_IO(inode, iocb, iter)) + return true; + return false; } #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6db808c80813..e29715ea736f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3001,7 +3001,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!f2fs_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)) || f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, WRITE)) { + f2fs_force_buffered_io(inode, + iocb, from)) { clear_inode_flag(inode, FI_NO_PREALLOC); inode_unlock(inode); From fb7d70db305a1446864227abf711b756568f8242 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 25 Sep 2018 13:54:33 -0700 Subject: [PATCH 40/62] f2fs: clear PageError on the read path When running fault injection test, I hit somewhat wrong behavior in f2fs_gc -> gc_data_segment(): 0. fault injection generated some PageError'ed pages 1. gc_data_segment -> f2fs_get_read_data_page(REQ_RAHEAD) 2. move_data_page -> f2fs_get_lock_data_page() -> f2f_get_read_data_page() -> f2fs_submit_page_read() -> submit_bio(READ) -> return EIO due to PageError -> fail to move data Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 02d5ce888a4a..873f9ea7769f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -77,7 +77,8 @@ static void __read_end_io(struct bio *bio) /* PG_error was set if any post_read step failed */ if (bio->bi_status || PageError(page)) { ClearPageUptodate(page); - SetPageError(page); + /* will re-read again later */ + ClearPageError(page); } else { SetPageUptodate(page); } @@ -591,6 +592,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, bio_put(bio); return -EFAULT; } + ClearPageError(page); __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1579,6 +1581,7 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + ClearPageError(page); last_block_in_bio = block_nr; goto next_page; set_error_page: From 4354994f097d068a894aa1a0860da54571df3582 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Mon, 20 Aug 2018 19:21:43 -0700 Subject: [PATCH 41/62] f2fs: checkpoint disabling Note that, it requires "f2fs: return correct errno in f2fs_gc". This adds a lightweight non-persistent snapshotting scheme to f2fs. To use, mount with the option checkpoint=disable, and to return to normal operation, remount with checkpoint=enable. If the filesystem is shut down before remounting with checkpoint=enable, it will revert back to its apparent state when it was first mounted with checkpoint=disable. This is useful for situations where you wish to be able to roll back the state of the disk in case of some critical failure. Signed-off-by: Daniel Rosenberg [Jaegeuk Kim: use SB_RDONLY instead of MS_RDONLY] Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 5 ++ fs/f2fs/checkpoint.c | 12 +++ fs/f2fs/data.c | 14 +++- fs/f2fs/debug.c | 3 +- fs/f2fs/f2fs.h | 18 ++++- fs/f2fs/file.c | 12 ++- fs/f2fs/gc.c | 9 ++- fs/f2fs/inode.c | 6 +- fs/f2fs/namei.c | 19 +++++ fs/f2fs/segment.c | 98 +++++++++++++++++++++- fs/f2fs/segment.h | 15 ++++ fs/f2fs/super.c | 126 ++++++++++++++++++++++++++++- include/linux/f2fs_fs.h | 1 + 13 files changed, 324 insertions(+), 14 deletions(-) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index bde3e91e5372..e46c2147ddf8 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -212,6 +212,11 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix", non-atomic files likewise "nobarrier" mount option. test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt context. The fake fscrypt context is used by xfstests. +checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable" + to reenable checkpointing. Is enabled by default. While + disabled, any unmounting or unexpected shutdowns will cause + the filesystem contents to appear as they did when the + filesystem was mounted with that option. ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 97b429ba2911..eb6ac79640f8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1210,6 +1210,11 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) + __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); @@ -1417,6 +1422,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + sbi->unusable_block_count = 0; __set_cp_next_pack(sbi); /* @@ -1441,6 +1447,12 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (cpc->reason != CP_PAUSE) + return 0; + f2fs_msg(sbi->sb, KERN_WARNING, + "Start checkpoint disabled!"); + } mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 873f9ea7769f..9ef6f1f01eda 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -537,7 +537,8 @@ skip: if (fio->in_list) goto next; out: - if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) + if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_is_checkpoint_ready(sbi)) __submit_merged_bio(io); up_write(&io->io_rwsem); } @@ -1703,6 +1704,10 @@ static inline bool check_inplace_update_policy(struct inode *inode, is_inode_flag_set(inode, FI_NEED_IPU)) return true; + if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + !f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; + return false; } @@ -1733,6 +1738,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + f2fs_is_checkpointed_data(sbi, fio->old_blkaddr))) + return true; } return false; } @@ -2353,6 +2361,10 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); + err = f2fs_is_checkpoint_ready(sbi); + if (err) + goto fail; + if ((f2fs_is_atomic_file(inode) && !f2fs_available_free_memory(sbi, INMEM_PAGES)) || is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 75bc62edc4c1..026e10f30889 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -272,7 +272,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n", si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", - f2fs_cp_error(si->sbi) ? "Error": "Good"); + is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? + "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 95d9edd8ff6e..ff540f523a3b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -99,6 +99,7 @@ extern char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_QUOTA 0x00400000 #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 +#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -178,6 +179,7 @@ enum { #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 #define CP_TRIMMED 0x00000020 +#define CP_PAUSE 0x00000040 #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ @@ -187,6 +189,7 @@ enum { #define DEF_DISCARD_URGENT_UTIL 80 /* do more discard over 80% */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -1092,6 +1095,7 @@ enum { SBI_NEED_CP, /* need to checkpoint */ SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ + SBI_CP_DISABLED, /* CP was disabled last mount */ }; enum { @@ -1099,6 +1103,7 @@ enum { REQ_TIME, DISCARD_TIME, GC_TIME, + DISABLE_TIME, MAX_TIME, }; @@ -1225,6 +1230,9 @@ struct f2fs_sb_info { block_t reserved_blocks; /* configurable reserved blocks */ block_t current_reserved_blocks; /* current reserved blocks */ + /* Additional tracking for no checkpoint mode */ + block_t unusable_block_count; /* # of blocks saved by last cp */ + unsigned int nquota_files; /* # of quota sysfile */ u32 s_next_generation; /* for NFS support */ @@ -1735,7 +1743,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; - + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + avail_user_block_count -= sbi->unusable_block_count; if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { diff = sbi->total_valid_block_count - avail_user_block_count; if (diff > *count) @@ -1942,6 +1951,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + valid_block_count += sbi->unusable_block_count; if (unlikely(valid_block_count > sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); @@ -2945,6 +2956,8 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); @@ -3532,6 +3545,9 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (test_opt(sbi, LFS) && (rw == WRITE) && block_unaligned_IO(inode, iocb, iter)) return true; + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) + return true; + return false; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e29715ea736f..b1aaa73e1eeb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -210,7 +210,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb))) + if (unlikely(f2fs_readonly(inode->i_sb) || + is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; trace_f2fs_sync_file_enter(inode); @@ -2157,6 +2158,12 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + f2fs_msg(sbi->sb, KERN_INFO, + "Skipping Checkpoint. Checkpoints currently disabled."); + return -EINVAL; + } + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -2528,6 +2535,9 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return -EINVAL; + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, sizeof(range))) return -EFAULT; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 99ed8a5d9249..78288c54b68c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -370,6 +370,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && + get_ckpt_valid_blocks(sbi, segno))) + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; @@ -1189,7 +1193,8 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - if (prefree_segments(sbi)) { + if (prefree_segments(sbi) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { ret = f2fs_write_checkpoint(sbi, &cpc); if (ret) goto stop; @@ -1241,7 +1246,7 @@ gc_more: segno = NULL_SEGNO; goto gc_more; } - if (gc_type == FG_GC) + if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) ret = f2fs_write_checkpoint(sbi, &cpc); } stop: diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 86e7333d60c1..4ee9d6c4b719 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -607,6 +607,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + if (f2fs_is_checkpoint_ready(sbi)) + return -ENOSPC; + /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. @@ -688,7 +691,8 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))) + if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED))) f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); else f2fs_inode_synced(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ded185baa9ae..a14632744a6a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include "xattr.h" #include "acl.h" #include @@ -269,6 +270,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -315,6 +319,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = fscrypt_prepare_link(old_dentry, dir, dentry); if (err) @@ -561,6 +568,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -690,6 +700,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; err = dquot_initialize(dir); if (err) @@ -824,6 +837,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, @@ -1014,6 +1030,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(f2fs_cp_error(sbi))) return -EIO; + err = f2fs_is_checkpoint_ready(sbi); + if (err) + return err; if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e16dae0f0a5b..195dc8142bff 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -176,6 +176,8 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) return false; if (sbi->gc_mode == GC_URGENT) return true; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); @@ -480,6 +482,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); + if (f2fs_is_checkpoint_ready(sbi)) + return; + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -796,7 +801,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned short valid_blocks; + unsigned short valid_blocks, ckpt_valid_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; @@ -804,8 +809,10 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); - if (valid_blocks == 0) { + if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || + ckpt_valid_blocks == sbi->blocks_per_seg)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < sbi->blocks_per_seg) { @@ -818,6 +825,66 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +/* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ +void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (IS_CURSEG(sbi, segno)) + continue; + __locate_dirty_segment(sbi, segno, PRE); + __remove_dirty_segment(sbi, segno, DIRTY); + } + mutex_unlock(&dirty_i->seglist_lock); +} + +int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + block_t ovp = overprovision_segments(sbi) << sbi->log_blocks_per_seg; + block_t holes[2] = {0, 0}; /* DATA and NODE */ + struct seg_entry *se; + unsigned int segno; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + se = get_seg_entry(sbi, segno); + if (IS_NODESEG(se->type)) + holes[NODE] += sbi->blocks_per_seg - se->valid_blocks; + else + holes[DATA] += sbi->blocks_per_seg - se->valid_blocks; + } + mutex_unlock(&dirty_i->seglist_lock); + + if (holes[DATA] > ovp || holes[NODE] > ovp) + return -EAGAIN; + return 0; +} + +/* This is only used by SBI_CP_DISABLED */ +static unsigned int get_free_segment(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int segno = 0; + + mutex_lock(&dirty_i->seglist_lock); + for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { + if (get_valid_blocks(sbi, segno, false)) + continue; + if (get_ckpt_valid_blocks(sbi, segno)) + continue; + mutex_unlock(&dirty_i->seglist_lock); + return segno; + } + mutex_unlock(&dirty_i->seglist_lock); + return NULL_SEGNO; +} + static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) @@ -2028,7 +2095,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ - if (IS_NODESEG(se->type)) { + if (IS_NODESEG(se->type) && + !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks++; } @@ -2050,6 +2118,15 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) f2fs_bug_on(sbi, 1); se->valid_blocks++; del = 0; + } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + /* + * If checkpoints are off, we must not reuse data that + * was used in the previous checkpoint. If it was used + * before, we must track that to know how much space we + * really have. + */ + if (f2fs_test_bit(offset, se->ckpt_valid_map)) + sbi->unusable_block_count++; } if (f2fs_test_and_clear_bit(offset, se->discard_map)) @@ -2332,6 +2409,9 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) if (sbi->segs_per_sec != 1) return CURSEG_I(sbi, type)->segno; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) return 0; @@ -2476,6 +2556,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) return 1; } } + + /* find valid_blocks=0 in dirty list */ + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + segno = get_free_segment(sbi); + if (segno != NULL_SEGNO) { + curseg->next_segno = segno; + return 1; + } + } return 0; } @@ -2493,7 +2582,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); - else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type) && + likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) new_curseg(sbi, type, false); else if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 086150028c6d..ab3465faddf1 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -339,6 +339,12 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, return get_seg_entry(sbi, segno)->valid_blocks; } +static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; +} + static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { @@ -576,6 +582,15 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, reserved_sections(sbi) + needed); } +static inline int f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) +{ + if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; + if (likely(!has_not_enough_free_secs(sbi, 0, 0))) + return 0; + return -ENOSPC; +} + static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) { return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a44913224e3b..19933d839008 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -136,6 +136,7 @@ enum { Opt_alloc, Opt_fsync, Opt_test_dummy_encryption, + Opt_checkpoint, Opt_err, }; @@ -194,6 +195,7 @@ static match_table_t f2fs_tokens = { {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, + {Opt_checkpoint, "checkpoint=%s"}, {Opt_err, NULL}, }; @@ -769,6 +771,23 @@ static int parse_options(struct super_block *sb, char *options) "Test dummy encryption mount option ignored"); #endif break; + case Opt_checkpoint: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + + if (strlen(name) == 6 && + !strncmp(name, "enable", 6)) { + clear_opt(sbi, DISABLE_CHECKPOINT); + } else if (strlen(name) == 7 && + !strncmp(name, "disable", 7)) { + set_opt(sbi, DISABLE_CHECKPOINT); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -827,6 +846,12 @@ static int parse_options(struct super_block *sb, char *options) } } + if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "LFS not compatible with checkpoint=disable\n"); + return -EINVAL; + } + /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_TYPE. */ @@ -1014,8 +1039,8 @@ static void f2fs_put_super(struct super_block *sb) * But, the previous checkpoint was not done by umount, it needs to do * clean checkpoint again. */ - if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + if ((is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG))) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -1087,6 +1112,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (unlikely(f2fs_cp_error(sbi))) return 0; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + return 0; trace_f2fs_sync_fs(sb, sync); @@ -1186,6 +1213,11 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) - sbi->current_reserved_blocks; + if (unlikely(buf->f_bfree <= sbi->unusable_block_count)) + buf->f_bfree = 0; + else + buf->f_bfree -= sbi->unusable_block_count; + if (buf->f_bfree > F2FS_OPTION(sbi).root_reserved_blocks) buf->f_bavail = buf->f_bfree - F2FS_OPTION(sbi).root_reserved_blocks; @@ -1365,6 +1397,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) seq_printf(seq, ",alloc_mode=%s", "reuse"); + if (test_opt(sbi, DISABLE_CHECKPOINT)) + seq_puts(seq, ",checkpoint=disable"); + if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_POSIX) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) @@ -1392,6 +1427,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); + clear_opt(sbi, DISABLE_CHECKPOINT); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); @@ -1413,6 +1449,57 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_QUOTA static int f2fs_enable_quotas(struct super_block *sb); #endif + +static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) +{ + struct cp_control cpc; + int err; + + sbi->sb->s_flags |= SB_ACTIVE; + + mutex_lock(&sbi->gc_mutex); + f2fs_update_time(sbi, DISABLE_TIME); + + while (!f2fs_time_over(sbi, DISABLE_TIME)) { + err = f2fs_gc(sbi, true, false, NULL_SEGNO); + if (err == -ENODATA) + break; + if (err && err != -EAGAIN) { + mutex_unlock(&sbi->gc_mutex); + return err; + } + } + mutex_unlock(&sbi->gc_mutex); + + err = sync_filesystem(sbi->sb); + if (err) + return err; + + if (f2fs_disable_cp_again(sbi)) + return -EAGAIN; + + mutex_lock(&sbi->gc_mutex); + cpc.reason = CP_PAUSE; + set_sbi_flag(sbi, SBI_CP_DISABLED); + f2fs_write_checkpoint(sbi, &cpc); + + sbi->unusable_block_count = 0; + mutex_unlock(&sbi->gc_mutex); + return 0; +} + +static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) +{ + mutex_lock(&sbi->gc_mutex); + f2fs_dirty_to_prefree(sbi); + + clear_sbi_flag(sbi, SBI_CP_DISABLED); + set_sbi_flag(sbi, SBI_IS_DIRTY); + mutex_unlock(&sbi->gc_mutex); + + f2fs_sync_fs(sbi->sb, 1); +} + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -1422,6 +1509,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); + bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool checkpoint_changed; #ifdef CONFIG_QUOTA int i, j; #endif @@ -1466,6 +1555,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = parse_options(sb, data); if (err) goto restore_opts; + checkpoint_changed = + disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -1479,7 +1570,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; - } else if (f2fs_readonly(sb) && !(*flags & MS_RDONLY)) { + } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) { /* dquot_resume needs RW */ sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { @@ -1499,6 +1590,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { + err = -EINVAL; + f2fs_msg(sbi->sb, KERN_WARNING, + "disabling checkpoint not compatible with read-only"); + goto restore_opts; + } + /* * We stop the GC thread if FS is mounted as RO * or if background_gc = off is passed in mount @@ -1527,6 +1625,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_gc; + } else { + f2fs_enable_checkpoint(sbi); + } + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -2485,6 +2593,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) @@ -3093,6 +3202,9 @@ try_onemore: if (err) goto free_meta; + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + goto skip_recovery; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -3132,6 +3244,14 @@ skip_recovery: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto free_meta; + } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { + f2fs_enable_checkpoint(sbi); + } + /* * If filesystem is not mounted as read-only then * do start the gc_thread. diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 1db13ff9a3f4..8b9c7dc0260c 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -116,6 +116,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_DISABLED_FLAG 0x00001000 #define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 #define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 From 48018b4cfd07dd2df9a067fb3a6a3221c19eed11 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 13 Sep 2018 07:40:53 +0800 Subject: [PATCH 42/62] f2fs: submit cached bio to avoid endless PageWriteback When migrating encrypted block from background GC thread, we only add them into f2fs inner bio cache, but forget to submit the cached bio, it may cause potential deadlock when we are waiting page writebacked, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 71 +++++++++++++++++++++++++++++++++++--------------- fs/f2fs/node.c | 13 ++++++--- 3 files changed, 61 insertions(+), 25 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff540f523a3b..bf81c60e12b8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2911,7 +2911,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); -void f2fs_move_node_page(struct page *node_page, int gc_type); +int f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 78288c54b68c..a07241fb8537 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -473,7 +473,7 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static void gc_node_segment(struct f2fs_sb_info *sbi, +static int gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { struct f2fs_summary *entry; @@ -481,6 +481,7 @@ static void gc_node_segment(struct f2fs_sb_info *sbi, int off; int phase = 0; bool fggc = (gc_type == FG_GC); + int submitted = 0; start_addr = START_BLOCK(sbi, segno); @@ -494,10 +495,11 @@ next_step: nid_t nid = le32_to_cpu(entry->nid); struct page *node_page; struct node_info ni; + int err; /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) - return; + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -534,7 +536,9 @@ next_step: continue; } - f2fs_move_node_page(node_page, gc_type); + err = f2fs_move_node_page(node_page, gc_type); + if (!err && gc_type == FG_GC) + submitted++; stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -543,6 +547,7 @@ next_step: if (fggc) atomic_dec(&sbi->wb_sync_req[NODE]); + return submitted; } /* @@ -678,7 +683,7 @@ put_page: * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. */ -static void move_data_block(struct inode *inode, block_t bidx, +static int move_data_block(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct f2fs_io_info fio = { @@ -697,25 +702,29 @@ static void move_data_block(struct inode *inode, block_t bidx, struct node_info ni; struct page *page, *mpage; block_t newaddr; - int err; + int err = 0; bool lfs_mode = test_opt(fio.sbi, LFS); /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); if (!page) - return; + return -ENOMEM; - if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; goto out; + } if (f2fs_is_atomic_file(inode)) { F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + err = -EAGAIN; goto out; } if (f2fs_is_pinned_file(inode)) { f2fs_pin_file_control(inode, true); + err = -EAGAIN; goto out; } @@ -726,6 +735,7 @@ static void move_data_block(struct inode *inode, block_t bidx, if (unlikely(dn.data_blkaddr == NULL_ADDR)) { ClearPageUptodate(page); + err = -ENOENT; goto put_out; } @@ -808,6 +818,7 @@ write_page: fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); if (fio.retry) { + err = -EAGAIN; if (PageWriteback(fio.encrypted_page)) end_page_writeback(fio.encrypted_page); goto put_page_out; @@ -831,34 +842,42 @@ put_out: f2fs_put_dnode(&dn); out: f2fs_put_page(page, 1); + return err; } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type, +static int move_data_page(struct inode *inode, block_t bidx, int gc_type, unsigned int segno, int off) { struct page *page; + int err = 0; page = f2fs_get_lock_data_page(inode, bidx, true); if (IS_ERR(page)) - return; + return PTR_ERR(page); - if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { + err = -ENOENT; goto out; + } if (f2fs_is_atomic_file(inode)) { F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++; F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++; + err = -EAGAIN; goto out; } if (f2fs_is_pinned_file(inode)) { if (gc_type == FG_GC) f2fs_pin_file_control(inode, true); + err = -EAGAIN; goto out; } if (gc_type == BG_GC) { - if (PageWriteback(page)) + if (PageWriteback(page)) { + err = -EAGAIN; goto out; + } set_page_dirty(page); set_cold_data(page); } else { @@ -876,7 +895,6 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); - int err; retry: set_page_dirty(page); @@ -901,6 +919,7 @@ retry: } out: f2fs_put_page(page, 1); + return err; } /* @@ -910,7 +929,7 @@ out: * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -918,6 +937,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t start_addr; int off; int phase = 0; + int submitted = 0; start_addr = START_BLOCK(sbi, segno); @@ -934,7 +954,7 @@ next_step: /* stop BG_GC if there is not enough free sections. */ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) - return; + return submitted; if (check_valid_map(sbi, segno, off) == 0) continue; @@ -1006,6 +1026,7 @@ next_step: if (inode) { struct f2fs_inode_info *fi = F2FS_I(inode); bool locked = false; + int err; if (S_ISREG(inode->i_mode)) { if (!down_write_trylock(&fi->i_gc_rwsem[READ])) @@ -1025,12 +1046,16 @@ next_step: start_bidx = f2fs_start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_post_read_required(inode)) - move_data_block(inode, start_bidx, gc_type, - segno, off); + err = move_data_block(inode, start_bidx, + gc_type, segno, off); else - move_data_page(inode, start_bidx, gc_type, + err = move_data_page(inode, start_bidx, gc_type, segno, off); + if (!err && (gc_type == FG_GC || + f2fs_post_read_required(inode))) + submitted++; + if (locked) { up_write(&fi->i_gc_rwsem[WRITE]); up_write(&fi->i_gc_rwsem[READ]); @@ -1042,6 +1067,8 @@ next_step: if (++phase < 5) goto next_step; + + return submitted; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -1069,6 +1096,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; + int submitted = 0; /* readahead multi ssa blocks those have contiguous address */ if (sbi->segs_per_sec > 1) @@ -1124,10 +1152,11 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - lock_page(sum_page) */ if (type == SUM_TYPE_NODE) - gc_node_segment(sbi, sum->entries, segno, gc_type); - else - gc_data_segment(sbi, sum->entries, gc_list, segno, + submitted += gc_node_segment(sbi, sum->entries, segno, gc_type); + else + submitted += gc_data_segment(sbi, sum->entries, gc_list, + segno, gc_type); stat_inc_seg_count(sbi, type, gc_type); @@ -1138,7 +1167,7 @@ next: f2fs_put_page(sum_page, 0); } - if (gc_type == FG_GC) + if (submitted) f2fs_submit_merged_write(sbi, (type == SUM_TYPE_NODE) ? NODE : DATA); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e3ff8147f6..8f5719767178 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1587,8 +1587,10 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } -void f2fs_move_node_page(struct page *node_page, int gc_type) +int f2fs_move_node_page(struct page *node_page, int gc_type) { + int err = 0; + if (gc_type == FG_GC) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1600,12 +1602,16 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) f2fs_wait_on_page_writeback(node_page, NODE, true); f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page)); - if (!clear_page_dirty_for_io(node_page)) + if (!clear_page_dirty_for_io(node_page)) { + err = -EAGAIN; goto out_page; + } if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO, NULL)) + &wbc, false, FS_GC_NODE_IO, NULL)) { + err = -EAGAIN; unlock_page(node_page); + } goto release_page; } else { /* set page dirty and write it */ @@ -1616,6 +1622,7 @@ out_page: unlock_page(node_page); release_page: f2fs_put_page(node_page, 0); + return err; } static int f2fs_write_node_page(struct page *page, From ef2a007134b4eaa39264c885999f296577bc87d2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 3 Oct 2018 22:32:44 +0800 Subject: [PATCH 43/62] f2fs: fix to recover cold bit of inode block during POR Testcase to reproduce this bug: 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. touch /mnt/f2fs/file 4. sync 5. chattr +A /mnt/f2fs/file 6. xfs_io -f /mnt/f2fs/file -c "fsync" 7. godown /mnt/f2fs 8. umount /mnt/f2fs 9. mount -t f2fs /dev/sdd /mnt/f2fs 10. chattr -A /mnt/f2fs/file 11. xfs_io -f /mnt/f2fs/file -c "fsync" 12. umount /mnt/f2fs 13. mount -t f2fs /dev/sdd /mnt/f2fs 14. lsattr /mnt/f2fs/file -----------------N- /mnt/f2fs/file But actually, we expect the corrct result is: -------A---------N- /mnt/f2fs/file The reason is in step 9) we missed to recover cold bit flag in inode block, so later, in fsync, we will skip write inode block due to below condition check, result in lossing data in another SPOR. f2fs_fsync_node_pages() if (!IS_DNODE(page) || !is_cold_node(page)) continue; Note that, I guess that some non-dir inode has already lost cold bit during POR, so in order to reenable recovery for those inode, let's try to recover cold bit in f2fs_iget() to save more fsynced data. Fixes: c56675750d7c ("f2fs: remove unneeded set_cold_node()") Cc: 4.17+ Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 6 ++++++ fs/f2fs/node.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4ee9d6c4b719..57a7a15239d6 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -365,6 +365,12 @@ static int do_read_inode(struct inode *inode) if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); + /* try to recover cold bit for non-dir inode */ + if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { + set_cold_node(node_page, false); + set_page_dirty(node_page); + } + /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8f5719767178..8c0367cfb1bb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2549,7 +2549,7 @@ retry: if (!PageUptodate(ipage)) SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); - set_cold_node(page, false); + set_cold_node(ipage, false); src = F2FS_INODE(page); dst = F2FS_INODE(ipage); From 4dada3fd7025e9dbc56c93d9996cba6e47915c62 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 4 Oct 2018 11:18:30 +0800 Subject: [PATCH 44/62] f2fs: use rb_*_cached friends As rbtree supports caching leftmost node natively, update f2fs codes to use rb_*_cached helpers to speed up leftmost node visiting. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 78 +++++++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 17 ++++----- fs/f2fs/segment.c | 22 +++++++----- 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 904ad7ba5a45..1cb0fcc67d2d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -27,10 +27,10 @@ static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, return NULL; } -static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root_cached *root, unsigned int ofs) { - struct rb_node *node = root->rb_node; + struct rb_node *node = root->rb_root.rb_node; struct rb_entry *re; while (node) { @@ -46,7 +46,7 @@ static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, return NULL; } -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs) { struct rb_entry *re; @@ -59,22 +59,25 @@ struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, } struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root *root, struct rb_node **parent, - unsigned int ofs) + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost) { - struct rb_node **p = &root->rb_node; + struct rb_node **p = &root->rb_root.rb_node; struct rb_entry *re; while (*p) { *parent = *p; re = rb_entry(*parent, struct rb_entry, rb_node); - if (ofs < re->ofs) + if (ofs < re->ofs) { p = &(*p)->rb_left; - else if (ofs >= re->ofs + re->len) + } else if (ofs >= re->ofs + re->len) { p = &(*p)->rb_right; - else + *leftmost = false; + } else { f2fs_bug_on(sbi, 1); + } } return p; @@ -89,16 +92,16 @@ struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, * in order to simpfy the insertion after. * tree must stay unchanged between lookup and insertion. */ -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force) + bool force, bool *leftmost) { - struct rb_node **pnode = &root->rb_node; + struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct rb_entry *re = cached_re; @@ -107,7 +110,7 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, *prev_entry = NULL; *next_entry = NULL; - if (RB_EMPTY_ROOT(root)) + if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; if (re) { @@ -115,16 +118,22 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, goto lookup_neighbors; } + if (leftmost) + *leftmost = true; + while (*pnode) { parent = *pnode; re = rb_entry(*pnode, struct rb_entry, rb_node); - if (ofs < re->ofs) + if (ofs < re->ofs) { pnode = &(*pnode)->rb_left; - else if (ofs >= re->ofs + re->len) + } else if (ofs >= re->ofs + re->len) { pnode = &(*pnode)->rb_right; - else + if (leftmost) + *leftmost = false; + } else { goto lookup_neighbors; + } } *insert_p = pnode; @@ -157,10 +166,10 @@ lookup_neighbors: } bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root *root) + struct rb_root_cached *root) { #ifdef CONFIG_F2FS_CHECK_FS - struct rb_node *cur = rb_first(root), *next; + struct rb_node *cur = rb_first_cached(root), *next; struct rb_entry *cur_re, *next_re; if (!cur) @@ -193,7 +202,8 @@ static struct kmem_cache *extent_node_slab; static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, - struct rb_node *parent, struct rb_node **p) + struct rb_node *parent, struct rb_node **p, + bool leftmost) { struct extent_node *en; @@ -206,7 +216,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->et = et; rb_link_node(&en->rb_node, parent, p); - rb_insert_color(&en->rb_node, &et->root); + rb_insert_color_cached(&en->rb_node, &et->root, leftmost); atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; @@ -215,7 +225,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { - rb_erase(&en->rb_node, &et->root); + rb_erase_cached(&en->rb_node, &et->root); atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); @@ -254,7 +264,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; - et->root = RB_ROOT; + et->root = RB_ROOT_CACHED; et->cached_en = NULL; rwlock_init(&et->lock); INIT_LIST_HEAD(&et->list); @@ -275,10 +285,10 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { - struct rb_node **p = &et->root.rb_node; + struct rb_node **p = &et->root.rb_root.rb_node; struct extent_node *en; - en = __attach_extent_node(sbi, et, ei, NULL, p); + en = __attach_extent_node(sbi, et, ei, NULL, p, true); if (!en) return NULL; @@ -294,7 +304,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, struct extent_node *en; unsigned int count = atomic_read(&et->node_cnt); - node = rb_first(&et->root); + node = rb_first_cached(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); @@ -452,7 +462,8 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, - struct rb_node *insert_parent) + struct rb_node *insert_parent, + bool leftmost) { struct rb_node **p; struct rb_node *parent = NULL; @@ -464,9 +475,12 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); + leftmost = true; + + p = f2fs_lookup_rb_tree_for_insert(sbi, &et->root, &parent, + ei->fofs, &leftmost); do_insert: - en = __attach_extent_node(sbi, et, ei, parent, p); + en = __attach_extent_node(sbi, et, ei, parent, p, leftmost); if (!en) return NULL; @@ -492,6 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; + bool leftmost; if (!et) return; @@ -519,7 +534,8 @@ static void f2fs_update_extent_tree_range(struct inode *inode, (struct rb_entry *)et->cached_en, fofs, (struct rb_entry **)&prev_en, (struct rb_entry **)&next_en, - &insert_p, &insert_parent, false); + &insert_p, &insert_parent, false, + &leftmost); if (!en) en = next_en; @@ -546,7 +562,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, end - dei.fofs + dei.blk, org_end - end); en1 = __insert_extent_tree(sbi, et, &ei, - NULL, NULL); + NULL, NULL, true); next_en = en1; } else { en->ei.fofs = end; @@ -587,7 +603,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, fofs, blkaddr, len); if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) __insert_extent_tree(sbi, et, &ei, - insert_p, insert_parent); + insert_p, insert_parent, leftmost); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bf81c60e12b8..829355dc469b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -328,7 +328,7 @@ struct discard_cmd_control { atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ - struct rb_root root; /* root of discard rb-tree */ + struct rb_root_cached root; /* root of discard rb-tree */ bool rbtree_check; /* config for consistence check */ }; @@ -570,7 +570,7 @@ struct extent_node { struct extent_tree { nid_t ino; /* inode number */ - struct rb_root root; /* root of extent info rb-tree */ + struct rb_root_cached root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ struct list_head list; /* to be used by sbi->zombie_list */ @@ -3368,18 +3368,19 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -struct rb_entry *f2fs_lookup_rb_tree(struct rb_root *root, +struct rb_entry *f2fs_lookup_rb_tree(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs); struct rb_node **f2fs_lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, - struct rb_root *root, struct rb_node **parent, - unsigned int ofs); -struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root *root, + struct rb_root_cached *root, + struct rb_node **parent, + unsigned int ofs, bool *leftmost); +struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root, struct rb_entry *cached_re, unsigned int ofs, struct rb_entry **prev_entry, struct rb_entry **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent, - bool force); + bool force, bool *leftmost); bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi, - struct rb_root *root); + struct rb_root_cached *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 195dc8142bff..805c8310d7b0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -920,7 +920,8 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len, - struct rb_node *parent, struct rb_node **p) + struct rb_node *parent, struct rb_node **p, + bool leftmost) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; @@ -928,7 +929,7 @@ static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); - rb_insert_color(&dc->rb_node, &dcc->root); + rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); return dc; } @@ -940,7 +941,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, atomic_sub(dc->issuing, &dcc->issing_discard); list_del(&dc->list); - rb_erase(&dc->rb_node, &dcc->root); + rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->len; kmem_cache_free(discard_cmd_slab, dc); @@ -1177,6 +1178,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, struct rb_node **p; struct rb_node *parent = NULL; struct discard_cmd *dc = NULL; + bool leftmost = true; if (insert_p && insert_parent) { parent = insert_parent; @@ -1184,9 +1186,11 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, goto do_insert; } - p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); + p = f2fs_lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, + lstart, &leftmost); do_insert: - dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, + p, leftmost); if (!dc) return NULL; @@ -1254,7 +1258,7 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, NULL, lstart, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (dc) prev_dc = dc; @@ -1362,7 +1366,7 @@ static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, NULL, pos, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (!dc) dc = next_dc; @@ -1994,7 +1998,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; dcc->next_pos = 0; - dcc->root = RB_ROOT; + dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); @@ -2658,7 +2662,7 @@ next: NULL, start, (struct rb_entry **)&prev_dc, (struct rb_entry **)&next_dc, - &insert_p, &insert_parent, true); + &insert_p, &insert_parent, true, NULL); if (!dc) dc = next_dc; From ed15ba14155962c328ceb69712164aea8c444a82 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 4 Oct 2018 11:15:18 +0800 Subject: [PATCH 45/62] f2fs: shrink sbi->sb_lock coverage in set_file_temperature() file_set_{cold,hot} doesn't need holding sbi->sb_lock, so moving them out of the lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a14632744a6a..d5de8a99532d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -182,16 +182,19 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode * hot_count = sbi->raw_super->hot_ext_count; for (i = 0; i < cold_count + hot_count; i++) { - if (!is_extension_exist(name, extlist[i])) - continue; - if (i < cold_count) - file_set_cold(inode); - else - file_set_hot(inode); - break; + if (is_extension_exist(name, extlist[i])) + break; } up_read(&sbi->sb_lock); + + if (i == cold_count + hot_count) + return; + + if (i < cold_count) + file_set_cold(inode); + else + file_set_hot(inode); } int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, From 850971b23f0cabb5103a2de9f4b9fe7177c96a9d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 4 Oct 2018 11:15:19 +0800 Subject: [PATCH 46/62] f2fs: remove unused sbi->trigger_ssr_threshold Commit a2a12b679f36 ("f2fs: export SSR allocation threshold") introduced two threshold .min_ssr_sections and .trigger_ssr_threshold, but only .min_ssr_sections is used, so just remove redundant one for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 829355dc469b..cfc7e1b6c850 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1220,7 +1220,6 @@ struct f2fs_sb_info { unsigned int total_valid_node_count; /* valid node block count */ loff_t max_file_blocks; /* max block index of file */ int dir_level; /* directory level */ - unsigned int trigger_ssr_threshold; /* threshold to trigger ssr */ int readdir_ra; /* readahead inode in readdir */ block_t user_block_count; /* # of user blocks */ From 3b30eb19dcf1f79d1c3283da8c5f064be6175fbe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 4 Oct 2018 11:15:20 +0800 Subject: [PATCH 47/62] f2fs: remove unneeded disable_nat_bits() Commit 7735730d39d7 ("f2fs: fix to propagate error from __get_meta_page()") added disable_nat_bits() in error path of __get_nat_bitmaps(), but it's unneeded, beause we will fail mount, we won't have chance to change nid usage status w/o nat full/empty bitmaps. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8c0367cfb1bb..2b34206486d8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2879,10 +2879,8 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) struct page *page; page = f2fs_get_meta_page(sbi, nat_bits_addr++); - if (IS_ERR(page)) { - disable_nat_bits(sbi, true); + if (IS_ERR(page)) return PTR_ERR(page); - } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); From c75f2feb80eb8cbb579a3677dce4c165f3102a04 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 5 Oct 2018 10:47:39 +0530 Subject: [PATCH 48/62] f2fs: do not update REQ_TIME in case of error conditions The REQ_TIME should be updated only in case of success cases as followed at all other places in the file system. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/file.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c77a58038709..e02db5db62dc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -655,9 +655,9 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) f2fs_put_page(page, 1); clear_inode_flag(inode, FI_NEW_INODE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); fail: up_write(&F2FS_I(inode)->i_sem); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b1aaa73e1eeb..543c742f8bd7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -106,6 +106,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) SetPageUptodate(page); f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: @@ -114,7 +115,6 @@ out_sem: f2fs_balance_fs(sbi, dn.node_changed); sb_end_pagefault(inode->i_sb); - f2fs_update_time(sbi, REQ_TIME); err: return block_page_mkwrite_return(err); } From 6390398ec78fa527dd8346f126eca06129155668 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 5 Oct 2018 10:47:40 +0530 Subject: [PATCH 49/62] f2fs: update REQ_TIME in f2fs_cross_rename() Update REQ_TIME in the missing path - f2fs_cross_rename(). Signed-off-by: Sahitya Tummala [Jaegeuk Kim: add it in f2fs_rename()] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index d5de8a99532d..a54577e417b1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1000,6 +1000,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); return 0; put_out_dir: @@ -1157,6 +1159,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_update_time(sbi, REQ_TIME); return 0; out_new_dir: if (new_dir_entry) { From 730746ce88da943088fe9aeb845a5daf6e315d98 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 2 Oct 2018 17:20:58 -0700 Subject: [PATCH 50/62] f2fs: allow to mount, if quota is failed Since we can use the filesystem without quotas till next boot. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19933d839008..262a27744eb1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3190,11 +3190,9 @@ try_onemore: /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); - if (err) { + if (err) f2fs_msg(sb, KERN_ERR, "Cannot turn on quotas: error %d", err); - goto free_sysfs; - } } #endif /* if there are nt orphan nodes free them */ @@ -3295,9 +3293,6 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); -#ifdef CONFIG_QUOTA -free_sysfs: -#endif f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); From b93f654d73fa09088041efa77972b1d616a75fcb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Oct 2018 08:34:50 -0600 Subject: [PATCH 51/62] f2fs: remove request_list check in is_idle() This doesn't work on stacked devices, and it doesn't work on blk-mq devices. The request_list is only used on legacy, which we don't have much of anymore, and soon won't have any of. Kill the check. Cc: Jaegeuk Kim Cc: linux-f2fs-devel@lists.sourceforge.net Signed-off-by: Jens Axboe Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cfc7e1b6c850..f865c5ed965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1394,13 +1394,6 @@ static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, static inline bool is_idle(struct f2fs_sb_info *sbi, int type) { - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return false; - return f2fs_time_over(sbi, type); } From 4c58ed076875f36dae0f240da1e25e99e5d4afb8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Oct 2018 09:12:51 +0800 Subject: [PATCH 52/62] f2fs: fix to account IO correctly Below race can cause reversed reference on dirty count, fix it by relocating __submit_bio() and inc_page_count(). Thread A Thread B - f2fs_inplace_write_data - f2fs_submit_page_bio - __submit_bio - f2fs_write_end_io - dec_page_count - inc_page_count Cc: Fixes: d1b3e72d5490 ("f2fs: submit bio of in-place-update pages") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ef6f1f01eda..83b1983094bd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -462,10 +462,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, bio, fio->type); - if (!is_read_io(fio->op)) inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); + + __submit_bio(fio->sbi, bio, fio->type); return 0; } From 78efac537de33faab9a4302cc05a70bb4a8b3b63 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Oct 2018 23:24:28 +0800 Subject: [PATCH 53/62] f2fs: fix to account IO correctly for cgroup writeback Now, we have supported cgroup writeback, it depends on correctly IO account of specified filesystem. But in commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we split write paths from f2fs_submit_page_mbio() to two: - f2fs_submit_page_bio() for IPU path - f2fs_submit_page_bio() for OPU path But still we account write IO only in f2fs_submit_page_mbio(), result in incorrect IO account, fix it by adding missing IO account in IPU path. Fixes: d1b3e72d5490 ("f2fs: submit bio of in-place-update pages") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 83b1983094bd..4b312b73d8a8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -460,6 +460,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } + + if (fio->io_wbc && !is_read_io(fio->op)) + wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + bio_set_op_attrs(bio, fio->op, fio->op_flags); if (!is_read_io(fio->op)) From 5f9abab42b60e67846cd13dafc6a61d70d7a2682 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 Oct 2018 10:20:53 -0700 Subject: [PATCH 54/62] f2fs: account read IOs and use IO counts for is_idle This patch adds issued read IO counts which is under block layer. Chao modified a bit, since: Below race can cause reversed reference on F2FS_RD_DATA, there is the same issue in f2fs_submit_page_bio(), fix them by relocate __submit_bio() and inc_page_count. Thread A Thread B - f2fs_write_begin - f2fs_submit_page_read - __submit_bio - f2fs_read_end_io - __read_end_io - dec_page_count(, F2FS_RD_DATA) - inc_page_count(, F2FS_RD_DATA) Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 24 ++++++++++++++++++++++-- fs/f2fs/debug.c | 7 ++++++- fs/f2fs/f2fs.h | 22 ++++++++++++++++------ 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4b312b73d8a8..b5de10102775 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -52,6 +52,23 @@ static bool __is_cp_guaranteed(struct page *page) return false; } +static enum count_type __read_io_type(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi)) + return F2FS_RD_META; + + if (inode->i_ino == F2FS_NODE_INO(sbi)) + return F2FS_RD_NODE; + } + return F2FS_RD_DATA; +} + /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, @@ -82,6 +99,7 @@ static void __read_end_io(struct bio *bio) } else { SetPageUptodate(page); } + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); unlock_page(page); } if (bio->bi_private) @@ -466,8 +484,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); - if (!is_read_io(fio->op)) - inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); + inc_page_count(fio->sbi, is_read_io(fio->op) ? + __read_io_type(page): WB_DATA_TYPE(fio->page)); __submit_bio(fio->sbi, bio, fio->type); return 0; @@ -598,6 +616,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, return -EFAULT; } ClearPageError(page); + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1586,6 +1605,7 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA); ClearPageError(page); last_block_in_bio = block_nr; goto next_page; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 026e10f30889..139b4d5c83d5 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -55,6 +55,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + si->nr_rd_data = get_pages(sbi, F2FS_RD_DATA); + si->nr_rd_node = get_pages(sbi, F2FS_RD_NODE); + si->nr_rd_meta = get_pages(sbi, F2FS_RD_META); if (SM_I(sbi) && SM_I(sbi)->fcc_info) { si->nr_flushed = atomic_read(&SM_I(sbi)->fcc_info->issued_flush); @@ -371,7 +374,9 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " + seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n", + si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta); + seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), " "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, si->nr_flushing, si->nr_flushed, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f865c5ed965d..7f9cb0bba104 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -950,6 +950,9 @@ enum count_type { F2FS_DIRTY_IMETA, F2FS_WB_CP_DATA, F2FS_WB_DATA, + F2FS_RD_DATA, + F2FS_RD_NODE, + F2FS_RD_META, NR_COUNT_TYPE, }; @@ -1392,11 +1395,6 @@ static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, return wait_ms; } -static inline bool is_idle(struct f2fs_sb_info *sbi, int type) -{ - return f2fs_time_over(sbi, type); -} - /* * Inline functions */ @@ -1787,7 +1785,9 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || + count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || + count_type == F2FS_RD_META) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2124,6 +2124,15 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, return bio_alloc(GFP_KERNEL, npages); } +static inline bool is_idle(struct f2fs_sb_info *sbi, int type) +{ + if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || + get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || + get_pages(sbi, F2FS_WB_CP_DATA)) + return false; + return f2fs_time_over(sbi, type); +} + static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { @@ -3114,6 +3123,7 @@ struct f2fs_stat_info { int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_rd_data, nr_rd_node, nr_rd_meta; unsigned int io_skip_bggc, other_skip_bggc; int nr_flushing, nr_flushed, flush_list_empty; int nr_discarding, nr_discarded; From 164a63fa6b384e30ceb96ed80bc7dc3379bc0960 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 16 Oct 2018 19:30:13 -0700 Subject: [PATCH 55/62] Revert "f2fs: fix to clear PG_checked flag in set_page_dirty()" This reverts commit 66110abc4c931f879d70e83e1281f891699364bf. If we clear the cold data flag out of the writeback flow, we can miscount -1 by end_io, which incurs a deadlock caused by all I/Os being blocked during heavy GC. Balancing F2FS Async: - IO (CP: 1, Data: -1, Flush: ( 0 0 1), Discard: ( ... GC thread: IRQ - move_data_page() - set_page_dirty() - clear_cold_data() - f2fs_write_end_io() - type = WB_DATA_TYPE(page); here, we get wrong type - dec_page_count(sbi, type); - f2fs_wait_on_page_writeback() Cc: Reported-and-Tested-by: Park Ju Hyung Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b5de10102775..bd6434ad3aa0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2654,10 +2654,6 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - /* don't remain PG_checked flag which was set during GC */ - if (is_cold_data(page)) - clear_cold_data(page); - if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); From 2baf07818549c8bb8d7b3437e889b86eab56d38e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 27 Jul 2018 18:15:16 +0800 Subject: [PATCH 56/62] f2fs: fix to spread clear_cold_data() We need to drop PG_checked flag on page as well when we clear PG_uptodate flag, in order to avoid treating the page as GCing one later. Signed-off-by: Weichao Guo Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++++- fs/f2fs/dir.c | 1 + fs/f2fs/segment.c | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bd6434ad3aa0..8a831f17503e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1816,6 +1816,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) /* This page is already truncated */ if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); + clear_cold_data(page); goto out_writepage; } got_it: @@ -1991,8 +1992,10 @@ done: out: inode_dec_dirty_pages(inode); - if (err) + if (err) { ClearPageUptodate(page); + clear_cold_data(page); + } if (wbc->for_reclaim) { f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); @@ -2621,6 +2624,8 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, } } + clear_cold_data(page); + /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); @@ -2639,6 +2644,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + clear_cold_data(page); set_page_private(page, 0); ClearPagePrivate(page); return 1; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e02db5db62dc..2ef84b4590ea 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -730,6 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); + clear_cold_data(page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 805c8310d7b0..e38b2f69b68e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -266,8 +266,10 @@ retry: } next: /* we don't need to invalidate this in the sccessful status */ - if (drop || recover) + if (drop || recover) { ClearPageUptodate(page); + clear_cold_data(page); + } set_page_private(page, 0); ClearPagePrivate(page); f2fs_put_page(page, 1); From 9149a5eb606152df158eb7d7da5a34e84b574189 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 7 Oct 2018 19:06:15 +0800 Subject: [PATCH 57/62] f2fs: spread f2fs_set_inode_flags() This patch changes codes as below: - use f2fs_set_inode_flags() to update i_flags atomically to avoid potential race. - synchronize F2FS_I(inode)->i_flags to inode->i_flags in f2fs_new_inode(). - use f2fs_set_inode_flags() to simply codes in f2fs_quota_{on,off}. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/namei.c | 2 ++ fs/f2fs/super.c | 5 ++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7f9cb0bba104..c25c67868b98 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3422,7 +3422,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION file_set_encrypt(inode); - inode->i_flags |= S_ENCRYPTED; + f2fs_set_inode_flags(inode); #endif } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a54577e417b1..54295b5c1822 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -122,6 +122,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) set_inode_flag(inode, FI_PROJ_INHERIT); + f2fs_set_inode_flags(inode); + trace_f2fs_new_inode(inode, 0); return inode; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 262a27744eb1..bb88ae62ae6f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1939,8 +1939,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL; - inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, - S_NOATIME | S_IMMUTABLE); + f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1965,7 +1964,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) inode_lock(inode); F2FS_I(inode)->i_flags &= ~(F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL); - inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); out_put: From 0c093b590efb5c1ccdc835868dc2ae94bd2e14dc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 7 Oct 2018 03:03:38 +0800 Subject: [PATCH 58/62] f2fs: fix to recover inode->i_flags of inode block during POR Testcase to reproduce this bug: 1. mkfs.f2fs /dev/sdd 2. mount -t f2fs /dev/sdd /mnt/f2fs 3. touch /mnt/f2fs/file 4. sync 5. chattr +a /mnt/f2fs/file 6. xfs_io -a /mnt/f2fs/file -c "fsync" 7. godown /mnt/f2fs 8. umount /mnt/f2fs 9. mount -t f2fs /dev/sdd /mnt/f2fs 10. xfs_io /mnt/f2fs/file There is no error when opening this file w/o O_APPEND, but actually, we expect the correct result should be: /mnt/f2fs/file: Operation not permitted The root cause is, in recover_inode(), we recover inode->i_flags more than F2FS_I(inode)->i_flags, so fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 93e3b6c46aae..382779f7b2ed 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -234,6 +234,7 @@ static void recover_inode(struct inode *inode, struct page *page) F2FS_I(inode)->i_advise = raw->i_advise; F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); + f2fs_set_inode_flags(inode); F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = le16_to_cpu(raw->i_gc_failures); From 1e78e8bd9d107c351930cdb1e11202caec01b311 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 10 Oct 2018 10:56:22 +0530 Subject: [PATCH 59/62] f2fs: fix data corruption issue with hardware encryption Direct IO can be used in case of hardware encryption. The following scenario results into data corruption issue in this path - Thread A - Thread B- -> write file#1 in direct IO -> GC gets kicked in -> GC submitted bio on meta mapping for file#1, but pending completion -> write file#1 again with new data in direct IO -> GC bio gets completed now -> GC writes old data to the new location and thus file#1 is corrupted. Fix this by submitting and waiting for pending io on meta mapping for direct IO case in f2fs_map_blocks(). Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ fs/f2fs/f2fs.h | 2 ++ fs/f2fs/segment.c | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8a831f17503e..00b37a1bd15c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1053,6 +1053,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, map->m_flags = F2FS_MAP_MAPPED; if (map->m_next_extent) *map->m_next_extent = pgofs + map->m_len; + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); goto out; } @@ -1211,6 +1216,12 @@ skip: goto next_dnode; sync_out: + + /* for hardware encryption, but to avoid potential issue in future */ + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + f2fs_wait_on_block_writeback_range(inode, + map->m_pblk, map->m_len); + if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c25c67868b98..5c80eca194b5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2988,6 +2988,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len); void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e38b2f69b68e..6edcf8391dd3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3303,6 +3303,15 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) } } +void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, + block_t len) +{ + block_t i; + + for (i = 0; i < len; i++) + f2fs_wait_on_block_writeback(inode, blkaddr + i); +} + static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); From 26b5a079197c8cb6725565968b7fd3299bd1877b Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 12 Oct 2018 18:49:26 +0800 Subject: [PATCH 60/62] f2fs: cleanup dirty pages if recover failed During recover, we will try to create new dentries for inodes with dentry_mark. But if the parent is missing (e.g. killed by fsck), recover will break. But those recovered dirty pages are not cleanup. This will hit f2fs_bug_on: [ 53.519566] F2FS-fs (loop0): Found nat_bits in checkpoint [ 53.539354] F2FS-fs (loop0): recover_inode: ino = 5, name = file, inline = 3 [ 53.539402] F2FS-fs (loop0): recover_dentry: ino = 5, name = file, dir = 0, err = -2 [ 53.545760] F2FS-fs (loop0): Cannot recover all fsync data errno=-2 [ 53.546105] F2FS-fs (loop0): access invalid blkaddr:4294967295 [ 53.546171] WARNING: CPU: 1 PID: 1798 at fs/f2fs/checkpoint.c:163 f2fs_is_valid_blkaddr+0x26c/0x320 [ 53.546174] Modules linked in: [ 53.546183] CPU: 1 PID: 1798 Comm: mount Not tainted 4.19.0-rc2+ #1 [ 53.546186] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 53.546191] RIP: 0010:f2fs_is_valid_blkaddr+0x26c/0x320 [ 53.546195] Code: 85 bb 00 00 00 48 89 df 88 44 24 07 e8 ad a8 db ff 48 8b 3b 44 89 e1 48 c7 c2 40 03 72 a9 48 c7 c6 e0 01 72 a9 e8 84 3c ff ff <0f> 0b 0f b6 44 24 07 e9 8a 00 00 00 48 8d bf 38 01 00 00 e8 7c a8 [ 53.546201] RSP: 0018:ffff88006c067768 EFLAGS: 00010282 [ 53.546208] RAX: 0000000000000000 RBX: ffff880068844200 RCX: ffffffffa83e1a33 [ 53.546211] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88006d51e590 [ 53.546215] RBP: 0000000000000005 R08: ffffed000daa3cb3 R09: ffffed000daa3cb3 [ 53.546218] R10: 0000000000000001 R11: ffffed000daa3cb2 R12: 00000000ffffffff [ 53.546221] R13: ffff88006a1f8000 R14: 0000000000000200 R15: 0000000000000009 [ 53.546226] FS: 00007fb2f3646840(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000 [ 53.546229] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 53.546234] CR2: 00007f0fd77f0008 CR3: 00000000687e6002 CR4: 00000000000206e0 [ 53.546237] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 53.546240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 53.546242] Call Trace: [ 53.546248] f2fs_submit_page_bio+0x95/0x740 [ 53.546253] read_node_page+0x161/0x1e0 [ 53.546271] ? truncate_node+0x650/0x650 [ 53.546283] ? add_to_page_cache_lru+0x12c/0x170 [ 53.546288] ? pagecache_get_page+0x262/0x2d0 [ 53.546292] __get_node_page+0x200/0x660 [ 53.546302] f2fs_update_inode_page+0x4a/0x160 [ 53.546306] f2fs_write_inode+0x86/0xb0 [ 53.546317] __writeback_single_inode+0x49c/0x620 [ 53.546322] writeback_single_inode+0xe4/0x1e0 [ 53.546326] sync_inode_metadata+0x93/0xd0 [ 53.546330] ? sync_inode+0x10/0x10 [ 53.546342] ? do_raw_spin_unlock+0xed/0x100 [ 53.546347] f2fs_sync_inode_meta+0xe0/0x130 [ 53.546351] f2fs_fill_super+0x287d/0x2d10 [ 53.546367] ? vsnprintf+0x742/0x7a0 [ 53.546372] ? f2fs_commit_super+0x180/0x180 [ 53.546379] ? up_write+0x20/0x40 [ 53.546385] ? set_blocksize+0x5f/0x140 [ 53.546391] ? f2fs_commit_super+0x180/0x180 [ 53.546402] mount_bdev+0x181/0x200 [ 53.546406] mount_fs+0x94/0x180 [ 53.546411] vfs_kern_mount+0x6c/0x1e0 [ 53.546415] do_mount+0xe5e/0x1510 [ 53.546420] ? fs_reclaim_release+0x9/0x30 [ 53.546424] ? copy_mount_string+0x20/0x20 [ 53.546428] ? fs_reclaim_acquire+0xd/0x30 [ 53.546435] ? __might_sleep+0x2c/0xc0 [ 53.546440] ? ___might_sleep+0x53/0x170 [ 53.546453] ? __might_fault+0x4c/0x60 [ 53.546468] ? _copy_from_user+0x95/0xa0 [ 53.546474] ? memdup_user+0x39/0x60 [ 53.546478] ksys_mount+0x88/0xb0 [ 53.546482] __x64_sys_mount+0x5d/0x70 [ 53.546495] do_syscall_64+0x65/0x130 [ 53.546503] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 53.547639] ---[ end trace b804d1ea2fec893e ]--- So if recover fails, we need to drop all recovered data. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 32 +++++++++++++++++++++----------- fs/f2fs/super.c | 15 ++++++++++++++- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 382779f7b2ed..875d2e205791 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -96,8 +96,12 @@ err_out: return ERR_PTR(err); } -static void del_fsync_inode(struct fsync_inode_entry *entry) +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) { + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -338,12 +342,12 @@ next: return err; } -static void destroy_fsync_dnodes(struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, list) - del_fsync_inode(entry); + del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -580,7 +584,7 @@ out: } static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, - struct list_head *dir_list) + struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; struct page *page = NULL; @@ -634,7 +638,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, } if (entry->blkaddr == blkaddr) - del_fsync_inode(entry); + list_move_tail(&entry->list, tmp_inode_list); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -647,7 +651,7 @@ next: int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list; + struct list_head inode_list, tmp_inode_list; struct list_head dir_list; int err; int ret = 0; @@ -678,6 +682,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ @@ -696,11 +701,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, &dir_list); + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } skip: - destroy_fsync_dnodes(&inode_list); + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -709,13 +719,13 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); + } else { + clear_sbi_flag(sbi, SBI_POR_DOING); } - - clear_sbi_flag(sbi, SBI_POR_DOING); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ - destroy_fsync_dnodes(&dir_list); + destroy_fsync_dnodes(&dir_list, err); if (need_writecp) { set_sbi_flag(sbi, SBI_IS_RECOVERED); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bb88ae62ae6f..f7814bb26a13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1991,6 +1991,19 @@ void f2fs_quota_off_umount(struct super_block *sb) } } +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + + static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; @@ -3281,10 +3294,10 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif - f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() From af033b2aa8a874fd5737fafe90d159136527b5b4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 20 Sep 2018 20:05:00 +0800 Subject: [PATCH 61/62] f2fs: guarantee journalled quota data by checkpoint For journalled quota mode, let checkpoint to flush dquot dirty data and quota file data to guarntee persistence of all quota sysfile in last checkpoint, by this way, we can avoid corrupting quota sysfile when encountering SPO. The implementation is as below: 1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is cached dquot metadata changes in quota subsystem, and later checkpoint should: a) flush dquot metadata into quota file. b) flush quota file to storage to keep file usage be consistent. 2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota operation failed due to -EIO or -ENOSPC, so later, a) checkpoint will skip syncing dquot metadata. b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a hint for fsck repairing. 3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota data updating is very heavy, it may cause hungtask in block_operation(). To avoid this, if our retry time exceed threshold, let's just skip flushing and retry in next checkpoint(). Signed-off-by: Weichao Guo Signed-off-by: Chao Yu [Jaegeuk Kim: avoid warnings and set fsck flag] Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 62 ++++++++++++++++++-- fs/f2fs/data.c | 16 ++++-- fs/f2fs/f2fs.h | 49 +++++++++++++--- fs/f2fs/file.c | 31 +++++++--- fs/f2fs/inline.c | 4 +- fs/f2fs/inode.c | 11 +++- fs/f2fs/namei.c | 4 -- fs/f2fs/recovery.c | 43 +++++++++++++- fs/f2fs/super.c | 121 ++++++++++++++++++++++++++++++++++++---- include/linux/f2fs_fs.h | 1 + 10 files changed, 294 insertions(+), 48 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index eb6ac79640f8..9c28ea439e0b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1079,6 +1079,21 @@ static void __prepare_cp_block(struct f2fs_sb_info *sbi) ckpt->next_free_nid = cpu_to_le32(last_nid); } +static bool __need_flush_quota(struct f2fs_sb_info *sbi) +{ + if (!is_journalled_quota(sbi)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + return false; + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) + return true; + if (get_pages(sbi, F2FS_DIRTY_QDATA)) + return true; + return false; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -1090,12 +1105,36 @@ static int block_operations(struct f2fs_sb_info *sbi) .for_reclaim = 0, }; struct blk_plug plug; - int err = 0; + int err = 0, cnt = 0; blk_start_plug(&plug); -retry_flush_dents: +retry_flush_quotas: + if (__need_flush_quota(sbi)) { + int locked; + + if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { + set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); + f2fs_lock_all(sbi); + goto retry_flush_dents; + } + clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + /* only failed during mount/umount/freeze/quotactl */ + locked = down_read_trylock(&sbi->sb->s_umount); + f2fs_quota_sync(sbi->sb, -1); + if (locked) + up_read(&sbi->sb->s_umount); + } + f2fs_lock_all(sbi); + if (__need_flush_quota(sbi)) { + f2fs_unlock_all(sbi); + cond_resched(); + goto retry_flush_quotas; + } + +retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); @@ -1103,7 +1142,7 @@ retry_flush_dents: if (err) goto out; cond_resched(); - goto retry_flush_dents; + goto retry_flush_quotas; } /* @@ -1112,6 +1151,12 @@ retry_flush_dents: */ down_write(&sbi->node_change); + if (__need_flush_quota(sbi)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + goto retry_flush_quotas; + } + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1119,7 +1164,7 @@ retry_flush_dents: if (err) goto out; cond_resched(); - goto retry_flush_dents; + goto retry_flush_quotas; } retry_flush_nodes: @@ -1215,6 +1260,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + + if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) + __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); @@ -1422,6 +1475,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); sbi->unusable_block_count = 0; __set_cp_next_pack(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 00b37a1bd15c..106f116466bf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -46,7 +46,7 @@ static bool __is_cp_guaranteed(struct page *page) inode->i_ino == F2FS_NODE_INO(sbi) || S_ISDIR(inode->i_mode) || (S_ISREG(inode->i_mode) && - is_inode_flag_set(inode, FI_ATOMIC_FILE)) || + (f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) || is_cold_data(page)) return true; return false; @@ -1766,6 +1766,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (S_ISDIR(inode->i_mode)) return true; + if (IS_NOQUOTA(inode)) + return true; if (f2fs_is_atomic_file(inode)) return true; if (fio) { @@ -2016,7 +2018,7 @@ out: } unlock_page(page); - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode)) f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { @@ -2207,6 +2209,8 @@ static inline bool __should_serialize_io(struct inode *inode, { if (!S_ISREG(inode->i_mode)) return false; + if (IS_NOQUOTA(inode)) + return false; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@ -2236,7 +2240,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && + if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) && + wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && f2fs_available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; @@ -2301,7 +2306,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true); + f2fs_truncate_blocks(inode, i_size, true, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2440,7 +2445,8 @@ repeat: if (err) goto fail; - if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + if (need_balance && !IS_NOQUOTA(inode) && + has_not_enough_free_secs(sbi, 0, 0)) { unlock_page(page); f2fs_balance_fs(sbi, true); lock_page(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5c80eca194b5..f447cbc2295f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -531,6 +531,9 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +/* maximum retry quota flush count */ +#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ @@ -1099,6 +1102,9 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ + SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ + SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ }; enum { @@ -1923,12 +1929,18 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, { block_t valid_block_count; unsigned int valid_node_count; - bool quota = inode && !is_inode; + int err; - if (quota) { - int ret = dquot_reserve_block(inode, 1); - if (ret) - return ret; + if (is_inode) { + if (inode) { + err = dquot_alloc_inode(inode); + if (err) + return err; + } + } else { + err = dquot_reserve_block(inode, 1); + if (err) + return err; } if (time_to_inject(sbi, FAULT_BLOCK)) { @@ -1972,8 +1984,12 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return 0; enospc: - if (quota) + if (is_inode) { + if (inode) + dquot_free_inode(inode); + } else { dquot_release_reservation_block(inode, 1); + } return -ENOSPC; } @@ -1994,7 +2010,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, spin_unlock(&sbi->stat_lock); - if (!is_inode) + if (is_inode) + dquot_free_inode(inode); + else f2fs_i_blocks_write(inode, 1, false, true); } @@ -2782,7 +2800,8 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, + bool buf_write); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -2870,6 +2889,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); +int f2fs_quota_sync(struct super_block *sb, int type); void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); @@ -3564,3 +3584,16 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #endif #endif + +static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) +{ +#ifdef CONFIG_QUOTA + if (f2fs_sb_has_quota_ino(sbi->sb)) + return true; + if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] || + F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] || + F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) + return true; +#endif + return false; +} diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 543c742f8bd7..971463e0589e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -586,7 +586,8 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, + bool buf_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -594,6 +595,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) int count = 0, err = 0; struct page *ipage; bool truncate_page = false; + int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -603,7 +605,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) goto free_partial; if (lock) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -641,7 +643,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); free_partial: /* lastly zero out the first data page */ if (!err) @@ -676,7 +678,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); if (err) return err; @@ -785,9 +787,24 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) !uid_eq(attr->ia_uid, inode->i_uid)) || (attr->ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) { + f2fs_lock_op(F2FS_I_SB(inode)); err = dquot_transfer(inode, attr); - if (err) + if (err) { + set_sbi_flag(F2FS_I_SB(inode), + SBI_QUOTA_NEED_REPAIR); + f2fs_unlock_op(F2FS_I_SB(inode)); return err; + } + /* + * update uid/gid under lock_op(), so that dquot and inode can + * be updated atomically. + */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + f2fs_mark_inode_dirty_sync(inode, true); + f2fs_unlock_op(F2FS_I_SB(inode)); } if (attr->ia_valid & ATTR_SIZE) { @@ -1242,7 +1259,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true); + ret = f2fs_truncate_blocks(inode, new_size, true, false); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1427,7 +1444,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 425d740f87fd..cb31a719b048 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false)) + if (f2fs_truncate_blocks(inode, 0, false, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false); + f2fs_truncate_blocks(dir, 0, false, false); f2fs_remove_dirty_inode(dir); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 57a7a15239d6..91ceee0ed4c4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -654,7 +654,11 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - dquot_initialize(inode); + err = dquot_initialize(inode); + if (err) { + err = 0; + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } f2fs_remove_ino_entry(sbi, inode->i_ino, APPEND_INO); f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -686,9 +690,10 @@ retry: goto retry; } - if (err) + if (err) { f2fs_update_inode_page(inode); - dquot_free_inode(inode); + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + } sb_end_intwrite(inode->i_sb); no_delete: dquot_drop(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 54295b5c1822..99299ede7429 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -72,10 +72,6 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; - err = dquot_alloc_inode(inode); - if (err) - goto fail_drop; - set_inode_flag(inode, FI_NEW_INODE); /* If the directory encrypted, then we should encrypt the inode. */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 875d2e205791..df2123759ac7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -195,6 +195,33 @@ out: return err; } +static int recover_quota_data(struct inode *inode, struct page *page) +{ + struct f2fs_inode *raw = F2FS_INODE(page); + struct iattr attr; + uid_t i_uid = le32_to_cpu(raw->i_uid); + gid_t i_gid = le32_to_cpu(raw->i_gid); + int err; + + memset(&attr, 0, sizeof(attr)); + + attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid); + attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid); + + if (!uid_eq(attr.ia_uid, inode->i_uid)) + attr.ia_valid |= ATTR_UID; + if (!gid_eq(attr.ia_gid, inode->i_gid)) + attr.ia_valid |= ATTR_GID; + + if (!attr.ia_valid) + return 0; + + err = dquot_transfer(inode, &attr); + if (err) + set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR); + return err; +} + static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) { if (ri->i_inline & F2FS_PIN_FILE) @@ -207,12 +234,18 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri) clear_inode_flag(inode, FI_DATA_EXIST); } -static void recover_inode(struct inode *inode, struct page *page) +static int recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); char *name; + int err; inode->i_mode = le16_to_cpu(raw->i_mode); + + err = recover_quota_data(inode, page); + if (err) + return err; + i_uid_write(inode, le32_to_cpu(raw->i_uid)); i_gid_write(inode, le32_to_cpu(raw->i_gid)); @@ -254,6 +287,7 @@ static void recover_inode(struct inode *inode, struct page *page) f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s, inline = %x", ino_of_node(page), name, raw->i_inline); + return 0; } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, @@ -622,8 +656,11 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(page)) - recover_inode(entry->inode, page); + if (IS_INODE(page)) { + err = recover_inode(entry->inode, page); + if (err) + break; + } if (entry->last_dentry == blkaddr) { err = recover_dentry(entry->inode, page, dir_list); if (err) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f7814bb26a13..af58b2cc21b8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1711,6 +1711,7 @@ repeat: congestion_wait(BLK_RW_ASYNC, HZ/50); goto repeat; } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return PTR_ERR(page); } @@ -1722,6 +1723,7 @@ repeat: } if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); return -EIO; } @@ -1763,6 +1765,7 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); break; } @@ -1799,6 +1802,12 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) { + if (is_set_ckpt_flags(sbi, CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_msg(sbi->sb, KERN_ERR, + "quota sysfile may be corrupted, skip loading it"); + return 0; + } + return dquot_quota_on_mount(sbi->sb, F2FS_OPTION(sbi).s_qf_names[type], F2FS_OPTION(sbi).s_jquota_fmt, type); } @@ -1869,7 +1878,14 @@ static int f2fs_enable_quotas(struct super_block *sb) test_opt(F2FS_SB(sb), PRJQUOTA), }; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + if (is_set_ckpt_flags(F2FS_SB(sb), CP_QUOTA_NEED_FSCK_FLAG)) { + f2fs_msg(sb, KERN_ERR, + "quota file may be corrupted, skip loading it"); + return 0; + } + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + for (type = 0; type < MAXQUOTAS; type++) { qf_inum = f2fs_qf_ino(sb, type); if (qf_inum) { @@ -1883,6 +1899,8 @@ static int f2fs_enable_quotas(struct super_block *sb) "fsck to fix.", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); + set_sbi_flag(F2FS_SB(sb), + SBI_QUOTA_NEED_REPAIR); return err; } } @@ -1890,35 +1908,51 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } -static int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_quota_sync(struct super_block *sb, int type) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) - return ret; + goto out; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct address_space *mapping; + if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + mapping = dqopt->files[cnt]->i_mapping; + + ret = filemap_fdatawrite(mapping); if (ret) - return ret; + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + continue; + + ret = filemap_fdatawait(mapping); + if (ret) + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } - return 0; +out: + if (ret) + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return ret; } static int f2fs_quota_on(struct super_block *sb, int type, int format_id, @@ -1986,7 +2020,7 @@ void f2fs_quota_off_umount(struct super_block *sb) "Fail to turn off disk quota " "(type: %d, err: %d, ret:%d), Please " "run fsck to fix it.", type, err, ret); - set_sbi_flag(F2FS_SB(sb), SBI_NEED_FSCK); + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } } @@ -2003,6 +2037,61 @@ static void f2fs_truncate_quota_inode_pages(struct super_block *sb) } } +static int f2fs_dquot_commit(struct dquot *dquot) +{ + int ret; + + ret = dquot_commit(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_acquire(struct dquot *dquot) +{ + int ret; + + ret = dquot_acquire(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + + return ret; +} + +static int f2fs_dquot_release(struct dquot *dquot) +{ + int ret; + + ret = dquot_release(dquot); + if (ret < 0) + set_sbi_flag(F2FS_SB(dquot->dq_sb), SBI_QUOTA_NEED_REPAIR); + return ret; +} + +static int f2fs_dquot_mark_dquot_dirty(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + int ret; + + ret = dquot_mark_dquot_dirty(dquot); + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); + + return ret; +} + +static int f2fs_dquot_commit_info(struct super_block *sb, int type) +{ + int ret; + + ret = dquot_commit_info(sb, type); + if (ret < 0) + set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + return ret; +} static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { @@ -2012,11 +2101,11 @@ static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, - .write_dquot = dquot_commit, - .acquire_dquot = dquot_acquire, - .release_dquot = dquot_release, - .mark_dirty = dquot_mark_dquot_dirty, - .write_info = dquot_commit_info, + .write_dquot = f2fs_dquot_commit, + .acquire_dquot = f2fs_dquot_acquire, + .release_dquot = f2fs_dquot_release, + .mark_dirty = f2fs_dquot_mark_dquot_dirty, + .write_info = f2fs_dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = f2fs_get_projid, @@ -2034,6 +2123,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_quota_sync(struct super_block *sb, int type) +{ + return 0; +} + void f2fs_quota_off_umount(struct super_block *sb) { } @@ -3104,6 +3198,9 @@ try_onemore: goto free_meta_inode; } + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + /* Initialize device list */ err = f2fs_scan_devices(sbi); if (err) { diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 8b9c7dc0260c..d7711048ef93 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -117,6 +117,7 @@ struct f2fs_super_block { * For checkpoint */ #define CP_DISABLED_FLAG 0x00001000 +#define CP_QUOTA_NEED_FSCK_FLAG 0x00000800 #define CP_LARGE_NAT_BITMAP_FLAG 0x00000400 #define CP_NOCRC_RECOVERY_FLAG 0x00000200 #define CP_TRIMMED_FLAG 0x00000100 From 78130819695f17f5c042d8ba097802639478faf5 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 25 Sep 2018 15:36:02 +0800 Subject: [PATCH 62/62] f2fs: fix to keep project quota consistent This patch does below changes to keep consistence of project quota data in sudden power-cut case: - update inode.i_projid and project quota atomically under lock_op() in f2fs_ioc_setproject() - recover inode.i_projid and project quota in recover_inode() Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/file.c | 37 ++++++++++++++++++++++++++++--------- fs/f2fs/recovery.c | 12 ++++++++++-- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f447cbc2295f..56204a8f8a12 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2811,6 +2811,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid); int f2fs_pin_file_control(struct inode *inode, bool inc); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 971463e0589e..88b124677189 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2613,13 +2613,29 @@ static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) } #ifdef CONFIG_QUOTA +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + int err = 0; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + if (err) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + dqput(transfer_to[PRJQUOTA]); + } + return err; +} + static int f2fs_ioc_setproject(struct file *filp, __u32 projid) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; - struct dquot *transfer_to[MAXQUOTAS] = {}; struct page *ipage; kprojid_t kprojid; int err; @@ -2660,21 +2676,24 @@ static int f2fs_ioc_setproject(struct file *filp, __u32 projid) if (err) return err; - transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (!IS_ERR(transfer_to[PRJQUOTA])) { - err = __dquot_transfer(inode, transfer_to); - dqput(transfer_to[PRJQUOTA]); - if (err) - goto out_dirty; - } + f2fs_lock_op(sbi); + err = f2fs_transfer_project_quota(inode, kprojid); + if (err) + goto out_unlock; F2FS_I(inode)->i_projid = kprojid; inode->i_ctime = current_time(inode); -out_dirty: f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + f2fs_unlock_op(sbi); return err; } #else +int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid) +{ + return 0; +} + static int f2fs_ioc_setproject(struct file *filp, __u32 projid) { if (projid != F2FS_DEF_PROJID) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index df2123759ac7..1dfb17f9f9ff 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -254,10 +254,18 @@ static int recover_inode(struct inode *inode, struct page *page) F2FS_FITS_IN_INODE(raw, le16_to_cpu(raw->i_extra_isize), i_projid)) { projid_t i_projid; + kprojid_t kprojid; i_projid = (projid_t)le32_to_cpu(raw->i_projid); - F2FS_I(inode)->i_projid = - make_kprojid(&init_user_ns, i_projid); + kprojid = make_kprojid(&init_user_ns, i_projid); + + if (!projid_eq(kprojid, F2FS_I(inode)->i_projid)) { + err = f2fs_transfer_project_quota(inode, + kprojid); + if (err) + return err; + F2FS_I(inode)->i_projid = kprojid; + } } }