From 43bbddc067883d94de7a43d5756a295439fbe37d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:57 +0800 Subject: [PATCH 01/60] ext4: add two helper functions extent_logical_end() and pa_logical_end() When we use lstart + len to calculate the end of free extent or prealloc space, it may exceed the maximum value of 4294967295(0xffffffff) supported by ext4_lblk_t and cause overflow, which may lead to various problems. Therefore, we add two helper functions, extent_logical_end() and pa_logical_end(), to limit the type of end to loff_t, and also convert lstart to loff_t for calculation to avoid overflow. Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 9 +++------ fs/ext4/mballoc.h | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e..4cb13b3e41b3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4432,7 +4432,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4766,7 +4766,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4861,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -5769,7 +5766,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c2274..d7aeb5da7d86 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, From bc056e7163ac7db945366de219745cf94f32a3e6 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:58 +0800 Subject: [PATCH 02/60] ext4: fix BUG in ext4_mb_new_inode_pa() due to overflow When we calculate the end position of ext4_free_extent, this position may be exactly where ext4_lblk_t (i.e. uint) overflows. For example, if ac_g_ex.fe_logical is 4294965248 and ac_orig_goal_len is 2048, then the computed end is 0x100000000, which is 0. If ac->ac_o_ex.fe_logical is not the first case of adjusting the best extent, that is, new_bex_end > 0, the following BUG_ON will be triggered: ========================================================= kernel BUG at fs/ext4/mballoc.c:5116! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 673 Comm: xfs_io Tainted: G E 6.5.0-rc1+ #279 RIP: 0010:ext4_mb_new_inode_pa+0xc5/0x430 Call Trace: ext4_mb_use_best_found+0x203/0x2f0 ext4_mb_try_best_found+0x163/0x240 ext4_mb_regular_allocator+0x158/0x1550 ext4_mb_new_blocks+0x86a/0xe10 ext4_ext_map_blocks+0xb0c/0x13a0 ext4_map_blocks+0x2cd/0x8f0 ext4_iomap_begin+0x27b/0x400 iomap_iter+0x222/0x3d0 __iomap_dio_rw+0x243/0xcb0 iomap_dio_rw+0x16/0x80 ========================================================= A simple reproducer demonstrating the problem: mkfs.ext4 -F /dev/sda -b 4096 100M mount /dev/sda /tmp/test fallocate -l1M /tmp/test/tmp fallocate -l10M /tmp/test/file fallocate -i -o 1M -l16777203M /tmp/test/file fsstress -d /tmp/test -l 0 -n 100000 -p 8 & sleep 10 && killall -9 fsstress rm -f /tmp/test/tmp xfs_io -c "open -ad /tmp/test/file" -c "pwrite -S 0xff 0 8192" We simply refactor the logic for adjusting the best extent by adding a temporary ext4_free_extent ex and use extent_logical_end() to avoid overflow, which also simplifies the code. Cc: stable@kernel.org # 6.4 Fixes: 93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()") Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4cb13b3e41b3..86bce870dc5a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5177,8 +5177,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -5197,29 +5200,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * still cover original start * 3. Else, keep the best ex at start of original request. */ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) + ex.fe_len = ac->ac_b_ex.fe_len; + + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; From bedc5d34632c21b5adb8ca7143d4c1f794507e4c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:59 +0800 Subject: [PATCH 03/60] ext4: avoid overlapping preallocations due to overflow Let's say we want to allocate 2 blocks starting from 4294966386, after predicting the file size, start is aligned to 4294965248, len is changed to 2048, then end = start + size = 0x100000000. Since end is of type ext4_lblk_t, i.e. uint, end is truncated to 0. This causes (pa->pa_lstart >= end) to always hold when checking if the current extent to be allocated crosses already preallocated blocks, so the resulting ac_g_ex may cross already preallocated blocks. Hence we convert the end type to loff_t and use pa_logical_end() to avoid overflow. Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 86bce870dc5a..78a4a24e2f57 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4222,12 +4222,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4237,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4259,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; @@ -4284,7 +4285,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4365,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4404,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; From 1d40165047456923fa4343d519353d9440cd68df Mon Sep 17 00:00:00 2001 From: Guoqing Cai Date: Thu, 13 Apr 2023 17:57:39 +0800 Subject: [PATCH 04/60] fs: jbd2: fix an incorrect warn log In jbd2_journal_load(), when journal_reset fails, it prints an incorrect warn log. Fix this by changing the goto statement to return statement. Also, return actual error code from jbd2_journal_recover() and journal_reset(). Signed-off-by: Guoqing Cai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230413095740.2222066-1-u202112087@hust.edu.cn Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fbce16fedaa4..5c223032f77a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2089,8 +2089,11 @@ int jbd2_journal_load(journal_t *journal) /* Let the recovery code check whether it needs to recover any * data from the journal. */ - if (jbd2_journal_recover(journal)) - goto recovery_error; + err = jbd2_journal_recover(journal); + if (err) { + pr_warn("JBD2: journal recovery failed\n"); + return err; + } if (journal->j_failed_commit) { printk(KERN_ERR "JBD2: journal transaction %u on %s " @@ -2107,15 +2110,14 @@ int jbd2_journal_load(journal_t *journal) /* OK, we've finished with the dynamic journal bits: * reinitialise the dynamic contents of the superblock in memory * and reset them on disk. */ - if (journal_reset(journal)) - goto recovery_error; + err = journal_reset(journal); + if (err) { + pr_warn("JBD2: journal reset failed\n"); + return err; + } journal->j_flags |= JBD2_LOADED; return 0; - -recovery_error: - printk(KERN_WARNING "JBD2: recovery failed\n"); - return -EIO; } /** From 98175720c9ed3bac857b0364321517cc2d695a3f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:47 +0200 Subject: [PATCH 05/60] ext4: remove pointless sb_rdonly() checks from freezing code ext4_freeze() and ext4_unfreeze() checks for sb_rdonly(). However this check is pointless as VFS already checks for read-only filesystem before calling filesystem specific methods. Remove the pointless checks. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..ffc4fb73f133 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6347,12 +6347,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -6386,7 +6381,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(EXT4_SB(sb))) return 0; if (EXT4_SB(sb)->s_journal) { From d5d020b3294b69eaf3b8985e7a37ba237849c390 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:48 +0200 Subject: [PATCH 06/60] ext4: use sb_rdonly() helper for checking read-only flag sb_rdonly() helper instead of directly checking sb->s_flags. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ffc4fb73f133..19514f98e2fe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6084,7 +6084,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6675,7 +6675,7 @@ restore_opts: * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); sb->s_flags = old_sb_flags; From eb8ab4443aec5ffe923a471b337568a8158cd32b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:49 +0200 Subject: [PATCH 07/60] ext4: make ext4_forced_shutdown() take struct super_block Currently ext4_forced_shutdown() takes struct ext4_sb_info but most callers need to get it from struct super_block anyway. So just pass in struct super_block to save all callers from some boilerplate code. No functional changes. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/ext4_jbd2.c | 2 +- fs/ext4/file.c | 13 ++++++------- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inline.c | 2 +- fs/ext4/inode.c | 24 ++++++++++++------------ fs/ext4/ioctl.c | 2 +- fs/ext4/namei.c | 8 ++++---- fs/ext4/page-io.c | 2 +- fs/ext4/super.c | 14 +++++++------- fs/ext4/xattr.c | 2 +- 12 files changed, 38 insertions(+), 39 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..feb38c9fe129 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2222,9 +2222,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_forced_shutdown(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..b72a22a57d20 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -67,7 +67,7 @@ static int ext4_journal_check_start(struct super_block *sb) might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (sb_rdonly(sb)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..2071b1e4322c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -709,7 +709,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX @@ -807,10 +807,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* @@ -886,7 +885,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1..bffc1d0994f5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -133,7 +133,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..060630c0b0ca 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..3623dfcc8fc7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..c6fa59e57f1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -2213,7 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || + if (ext4_forced_shutdown(sb) || ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* @@ -2540,7 +2540,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; @@ -2759,7 +2759,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2798,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2857,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -5135,7 +5135,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) sb_rdonly(inode->i_sb)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5255,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5676,7 +5676,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5702,7 +5702,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..0d3aef1118cb 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -801,7 +801,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..6298cfaaa0bd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3142,7 +3142,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in @@ -3301,7 +3301,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); @@ -3369,7 +3369,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, @@ -4189,7 +4189,7 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec671..dfdd7e5cf038 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 19514f98e2fe..0038233eafa8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -758,7 +758,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -783,7 +783,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -818,7 +818,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -898,7 +898,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -992,7 +992,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -6298,7 +6298,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); @@ -6381,7 +6381,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 05151d61b00b..7cc502c06246 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) From 22b8d707b07e6e06f50fe1d9ca8756e1f894eb0d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:50 +0200 Subject: [PATCH 08/60] ext4: make 'abort' mount option handling standard 'abort' mount option is the only mount option that has special handling and sets a bit in sbi->s_mount_flags. There is not strong reason for that so just simplify the code and make 'abort' set a bit in sbi->s_mount_opt2 as any other mount option. This simplifies the code and will allow us to drop EXT4_MF_FS_ABORTED completely in the following patch. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 16 ++-------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index feb38c9fe129..907829007f3f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1235,6 +1235,7 @@ struct ext4_inode_info { #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0038233eafa8..f84142907cd5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1897,6 +1897,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1965,8 +1966,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -2117,12 +2116,6 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; @@ -2186,9 +2179,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2842,8 +2832,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -6497,7 +6485,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | From 95257987a6387f02970eda707e55a06cce734e18 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:51 +0200 Subject: [PATCH 09/60] ext4: drop EXT4_MF_FS_ABORTED flag EXT4_MF_FS_ABORTED flag has practically the same intent as EXT4_FLAGS_SHUTDOWN flag. The shutdown flag is checked in many more places than the aborted flag which is mostly the historical artifact where we were relying on SB_RDONLY checks instead of the aborted flag checks. There are only three places - ext4_sync_file(), __ext4_remount(), and mballoc debug code - which check aborted flag and not shutdown flag and this is arguably a bug. Avoid these inconsistencies by removing EXT4_MF_FS_ABORTED flag and using EXT4_FLAGS_SHUTDOWN everywhere. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/fsync.c | 7 +++---- fs/ext4/inode.c | 8 +++----- fs/ext4/mballoc.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 907829007f3f..89d76d50af4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index bffc1d0994f5..b40d3b29f7e5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,7 +131,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6fa59e57f1e..100c3ec6da6c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 78a4a24e2f57..1dc63e329e64 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5664,7 +5664,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5698,7 +5698,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f84142907cd5..20a8e64da4ac 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -657,7 +657,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -6502,7 +6502,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_error_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } From e0e985f3f8941438a66ab8abb94cb011b9fb39a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:52 +0200 Subject: [PATCH 10/60] ext4: avoid starting transaction on read-only fs in ext4_quota_off() When the filesystem gets first remounted read-only and then unmounted, ext4_quota_off() will try to start a transaction (and fail) on read-only filesystem to cleanup inode flags for legacy quota files. Just bail before trying to start a transaction instead since that is going to issue a warning. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20a8e64da4ac..a9a7c38c7442 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7072,6 +7072,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. + */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* From e7fc2b31e04c46c9e2098bba710c9951c6b968af Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:53 +0200 Subject: [PATCH 11/60] ext4: warn on read-only filesystem in ext4_journal_check_start() Now that filesystem abort marks the filesystem as shutdown, we shouldn't be ever hitting the sb_rdonly() check in ext4_journal_check_start(). Since this is a suitable place for catching all sorts of programming errors, convert the check to WARN_ON instead of dropping it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b72a22a57d20..ca0eaf2147b0 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -70,8 +70,9 @@ static int ext4_journal_check_start(struct super_block *sb) if (unlikely(ext4_forced_shutdown(sb))) return -EIO; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* From ffb6844e28ef6b9d76bee378774d7afbc3db6da9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:54 +0200 Subject: [PATCH 12/60] ext4: drop read-only check in ext4_init_inode_table() We better should not be initializing inode tables on read-only filesystem. The following transaction start will warn us and make the function bail anyway so drop the pointless check. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 060630c0b0ca..e0698f54e17a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; From f1128084b40e520bea8bb32b3ff4d03745ab7e64 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:55 +0200 Subject: [PATCH 13/60] ext4: drop read-only check in ext4_write_inode() We should not have dirty inodes on read-only filesystem. Also silently bailing without writing anything would be a problem when we enable quotas during remount while the filesystem is read-only. So drop the read-only check. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 100c3ec6da6c..1b9003840bc1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5129,8 +5129,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) From 889860e452d7436ca72018b8a03cbd89c38d6384 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:56 +0200 Subject: [PATCH 14/60] ext4: drop read-only check from ext4_force_commit() JBD2 code will quickly return without doing anything when there's nothing to commit so there's no point in the read-only check in ext4_force_commit(). Just drop it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-10-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9a7c38c7442..4613264344b0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6270,13 +6270,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) From 1e1566b9c85fbd6150657ea17f50fd42b9166d31 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:57 +0200 Subject: [PATCH 15/60] ext4: replace read-only check for shutdown check in mmp code The multi-mount protection kthread checks for read-only filesystem and aborts in that case. The remount code actually handles stopping of the kthread on remount so the only purpose of the check is in case of emergency remount read-only. Replace the check for read-only filesystem with a check for shutdown filesystem as running MMP on such is risky anyway and it makes ordering of things during remount simpler. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-11-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6e..bd946d0c71b7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); From 304749c0d5e216479ea4d553ad04ba1390d5c707 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 30 Jun 2023 14:29:27 +0530 Subject: [PATCH 16/60] ext4: replace CR_FAST macro with inline function for readability Replace CR_FAST with ext4_mb_cr_expensive() inline function for better readability. This function returns true if the criteria is one of the expensive/slower ones where lots of disk IO/prefetching is acceptable. No functional changes are intended in this patch. Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230630085927.140137-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ++++--- fs/ext4/mballoc.c | 13 +++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89d76d50af4c..532b70f613e9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -176,9 +176,6 @@ enum criteria { EXT4_MB_NUM_CRS }; -/* criteria below which we use fast block scanning and avoid unnecessary IO */ -#define CR_FAST CR_GOAL_LEN_SLOW - /* * Flags used in mballoc's allocation_context flags field. * @@ -2924,6 +2921,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1dc63e329e64..a2d791953da5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2450,7 +2450,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2634,7 +2634,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criterias except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2658,7 +2663,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2852,7 +2857,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr >= CR_FAST || + (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { From a9ce5993a0f5c0887c8a1b4ffa3b8046fbcfdc93 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:55 +0800 Subject: [PATCH 17/60] ext4: correct grp validation in ext4_mb_good_group Group corruption check will access memory of grp and will trigger kernel crash if grp is NULL. So do NULL check before corruption check. Fixes: 5354b2af3406 ("ext4: allow ext4_get_group_info() to fail") Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a2d791953da5..e07d2a4fbcd1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2553,7 +2553,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; From 60c672b7f2d1e5dd1774f2399b355c9314e709f8 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:56 +0800 Subject: [PATCH 18/60] ext4: avoid potential data overflow in next_linear_group ngroups is ext4_group_t (unsigned int) while next_linear_group treat it in int. If ngroups is bigger than max number described by int, it will be treat as a negative number. Then "return group + 1 >= ngroups ? 0 : group + 1;" may keep returning 0. Switch int to ext4_group_t in next_linear_group to fix the overflow. Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e07d2a4fbcd1..bf041932c599 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1080,8 +1080,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) * Return next linear group for allocation. If linear traversal should not be * performed, this function just returns the same group */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static ext4_group_t +next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, + ext4_group_t ngroups) { if (!should_optimize_scan(ac)) goto inc_and_return; From 919eb90cec4049cecf4a9f996afb0f14e3864fca Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:57 +0800 Subject: [PATCH 19/60] ext4: return found group directly in ext4_mb_choose_next_group_p2_aligned Return good group when it's found in loop to remove unnecessary NULL initialization of grp and futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bf041932c599..880b9731edaa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -874,7 +874,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; + struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) @@ -883,7 +883,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; @@ -892,28 +891,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } - grp = NULL; list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; + *group = iter->bb_group; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; } - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - } + /* Increment cr and search again if no group is found */ + *new_cr = CR_GOAL_LEN_FAST; } /* From bb60caa2db6697c20a0842b5b3c192aa1800da1a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:58 +0800 Subject: [PATCH 20/60] ext4: use is_power_of_2 helper in ext4_mb_regular_allocator Use intuitive is_power_of_2 helper in ext4_mb_regular_allocator. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 880b9731edaa..3dd2609ea133 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2799,10 +2799,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } From ad635507b5b22d59457b6db6d8a0e4ddf7ad2b4c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:59 +0800 Subject: [PATCH 21/60] ext4: remove unnecessary return for void function The return at end of void function is unnecessary, just remove it. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3dd2609ea133..154ae1fca10c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4976,7 +4976,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } - return; } /* @@ -5727,12 +5726,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5973,12 +5970,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* @@ -6632,7 +6626,6 @@ do_more: error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; } /** @@ -6735,7 +6728,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** From de8bf0e5ee7482585450357c6d4eddec8efc5cb7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:00 +0800 Subject: [PATCH 22/60] ext4: replace the traditional ternary conditional operator with with max()/min() Replace the traditional ternary conditional operator with with max()/min() Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 154ae1fca10c..f0deb5f2f81d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6917,8 +6917,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -7135,8 +7134,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; From f6c72fef1272e65eff8d5ecef8c744686f6b7745 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:01 +0800 Subject: [PATCH 23/60] ext4: remove unused ext4_{set}/{clear}_bit_atomic Remove ext4_set_bit_atomic and ext4_clear_bit_atomic which are defined but not used. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 532b70f613e9..fb4d914ea888 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1250,10 +1250,8 @@ struct ext4_inode_info { #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le From b50675a4a6a69110c0c2baadebd2075d3b31b25c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:02 +0800 Subject: [PATCH 24/60] ext4: return found group directly in ext4_mb_choose_next_group_goal_fast Return good group when it's found in loop to remove futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f0deb5f2f81d..3b1f90dfb119 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -959,16 +959,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { - *new_cr = CR_BEST_AVAIL_LEN; - } + *new_cr = CR_BEST_AVAIL_LEN; } /* From bcb123ac9b9887478da4185b55dfbf1a72550848 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:03 +0800 Subject: [PATCH 25/60] ext4: return found group directly in ext4_mb_choose_next_group_best_avail Return good group when it's found in loop to remove futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-10-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3b1f90dfb119..f9189f7566fb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1042,18 +1042,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; - } + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) From 4eea9fbed950f240bf6e627e1c784b8d54c54988 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:04 +0800 Subject: [PATCH 26/60] ext4: correct some stale comment of criteria We named criteria with CR_XXX, correct stale comment to criteria with raw number. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-11-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f9189f7566fb..b89b5f0816e7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2782,8 +2782,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2840,8 +2840,8 @@ repeat: /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups + * prefetching at inexpensive CR, otherwise mballoc + * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && (ext4_mb_cr_expensive(cr) || From 373ac521799d9e97061515aca6ec6621789036bb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 14 Jul 2023 10:55:26 +0800 Subject: [PATCH 27/60] jbd2: fix checkpoint cleanup performance regression journal_clean_one_cp_list() has been merged into journal_shrink_one_cp_list(), but do chekpoint buffer cleanup from the committing process is just a best effort, it should stop scan once it meet a busy buffer, or else it will cause a lot of invalid buffer scan and checks. We catch a performance regression when doing fs_mark tests below. Test cmd: ./fs_mark -d scratch -s 1024 -n 10000 -t 1 -D 100 -N 100 Before merging checkpoint buffer cleanup: FSUse% Count Size Files/sec App Overhead 95 10000 1024 8304.9 49033 After merging checkpoint buffer cleanup: FSUse% Count Size Files/sec App Overhead 95 10000 1024 7649.0 50012 FSUse% Count Size Files/sec App Overhead 95 10000 1024 2107.1 50871 After merging checkpoint buffer cleanup, the total loop count in journal_shrink_one_cp_list() could be up to 6,261,600+ (50,000+ ~ 100,000+ in general), most of them are invalid. This patch fix it through passing 'shrink_type' into journal_shrink_one_cp_list() and add a new 'SHRINK_BUSY_STOP' to indicate it should stop once meet a busy buffer. After fix, the loop count descending back to 10,000+. After this fix: FSUse% Count Size Files/sec App Overhead 95 10000 1024 8558.4 49109 Cc: stable@kernel.org Fixes: b98dba273a0e ("jbd2: remove journal_clean_one_cp_list()") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230714025528.564988-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9ec91017a7f3..936c6d758a65 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -349,6 +349,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ +enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; + /* * journal_shrink_one_cp_list * @@ -360,7 +362,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - bool destroy, bool *released) + enum shrink_type type, + bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; @@ -376,12 +379,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (destroy) { + if (type == SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); - if (ret < 0) - continue; + if (ret < 0) { + if (type == SHRINK_BUSY_SKIP) + continue; + break; + } } nr_freed++; @@ -445,7 +451,7 @@ again: tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - false, &released); + SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -485,19 +491,21 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; + enum shrink_type type; bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) return; + type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; journal_shrink_one_cp_list(transaction->t_checkpoint_list, - destroy, &released); + type, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if From 590a809ff743e7bd890ba5fb36bc38e20a36de53 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 14 Jul 2023 10:55:27 +0800 Subject: [PATCH 28/60] jbd2: check 'jh->b_transaction' before removing it from checkpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following process will corrupt ext4 image: Step 1: jbd2_journal_commit_transaction __jbd2_journal_insert_checkpoint(jh, commit_transaction) // Put jh into trans1->t_checkpoint_list journal->j_checkpoint_transactions = commit_transaction // Put trans1 into journal->j_checkpoint_transactions Step 2: do_get_write_access test_clear_buffer_dirty(bh) // clear buffer dirty,set jbd dirty __jbd2_journal_file_buffer(jh, transaction) // jh belongs to trans2 Step 3: drop_cache journal_shrink_one_cp_list jbd2_journal_try_remove_checkpoint if (!trylock_buffer(bh)) // lock bh, true if (buffer_dirty(bh)) // buffer is not dirty __jbd2_journal_remove_checkpoint(jh) // remove jh from trans1->t_checkpoint_list Step 4: jbd2_log_do_checkpoint trans1 = journal->j_checkpoint_transactions // jh is not in trans1->t_checkpoint_list jbd2_cleanup_journal_tail(journal) // trans1 is done Step 5: Power cut, trans2 is not committed, jh is lost in next mounting. Fix it by checking 'jh->b_transaction' before remove it from checkpoint. Cc: stable@kernel.org Fixes: 46f881b5b175 ("jbd2: fix a race when checking checkpoint buffer busy") Signed-off-by: Zhihao Cheng Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230714025528.564988-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 936c6d758a65..f033ac807013 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -639,6 +639,8 @@ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + if (jh->b_transaction) + return -EBUSY; if (!trylock_buffer(bh)) return -EBUSY; if (buffer_dirty(bh)) { From 5f02a30eac5cc1c081cbdb42d19fd0ded00b0618 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 14 Jul 2023 10:55:28 +0800 Subject: [PATCH 29/60] jbd2: remove unused function '__cp_buffer_busy' The code calling function '__cp_buffer_busy' has been removed, so the function should also be removed. silence the warning: fs/jbd2/checkpoint.c:48:20: warning: unused function '__cp_buffer_busy' Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5518 Signed-off-by: Yang Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230714025528.564988-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index f033ac807013..118699fff2f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -40,18 +40,6 @@ static inline void __buffer_unlink(struct journal_head *jh) } } -/* - * Check a checkpoint buffer could be release or not. - * - * Requires j_list_lock - */ -static inline bool __cp_buffer_busy(struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); -} - /* * __jbd2_log_wait_for_space: wait until there is space in the journal. * From 7ca4b085f430f3774c3838b3da569ceccd6a0177 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= Date: Thu, 3 Aug 2023 10:17:13 +0100 Subject: [PATCH 30/60] ext4: fix memory leaks in ext4_fname_{setup_filename,prepare_lookup} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the filename casefolding fails, we'll be leaking memory from the fscrypt_name struct, namely from the 'crypto_buf.name' member. Make sure we free it in the error path on both ext4_fname_setup_filename() and ext4_fname_prepare_lookup() functions. Cc: stable@kernel.org Fixes: 1ae98e295fa2 ("ext4: optimize match for casefolded encrypted dirs") Signed-off-by: Luís Henriques Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20230803091713.13239-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o --- fs/ext4/crypto.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e20ac0654b3f..453d4da5de52 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } From e15e117bbbe18258a5ad506bbf6c58ff129c9576 Mon Sep 17 00:00:00 2001 From: Wang Jianjian Date: Wed, 2 Aug 2023 22:45:34 +0800 Subject: [PATCH 31/60] jbd2: remove unused t_handle_lock Since commit f7f497cb7024 ("jbd2: kill t_handle_lock transaction spinlock"), this lock has been no use. Fixes: f7f497cb7024 ("jbd2: kill t_handle_lock transaction spinlock") Signed-off-by: Wang Jianjian Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/tencent_8477CBE568348A1862C64E393D587B342008@qq.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 44c298aa58d4..52772c826c86 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -630,11 +630,6 @@ struct transaction_s */ struct list_head t_inode_list; - /* - * Protects info related to handles - */ - spinlock_t t_handle_lock; - /* * Longest time some handle had to wait for running transaction */ From 772c9f691dcf3a487f29ddb90a5a15c78d7328e1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 16 Jul 2023 19:33:34 +0530 Subject: [PATCH 32/60] ext4: don't use CR_BEST_AVAIL_LEN for non-regular files Using CR_BEST_AVAIL_LEN only make sense for regular files, as for non-regular files we never normalize the allocation request length i.e. goal len is same as original length (ac_g_ex.fe_len == ac_o_ex.fe_len). Hence there is no scope of trimming the goal length to make it satisfy original request len. Thus this patch avoids using CR_BEST_AVAIL_LEN criteria for non-regular files request. Cc: stable@kernel.org Fixes: 33122aa930f1 ("ext4: Add allocation criteria 1.5 (CR1_5)") Reported-by: Eric Whitney Signed-off-by: Ritesh Harjani (IBM) Tested-by: Eric Whitney Link: https://lore.kernel.org/r/2a694c748ff8b8c4b416995a24f06f07b55047a8.1689516047.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b89b5f0816e7..3d5b0b71d7f5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -966,7 +966,18 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * } } - *new_cr = CR_BEST_AVAIL_LEN; + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) + *new_cr = CR_BEST_AVAIL_LEN; + else + *new_cr = CR_GOAL_LEN_SLOW; } /* From 29a511e49f33426c8d24700db4842234a84678b2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:35:59 +0800 Subject: [PATCH 33/60] jbd2: move load_superblock() dependent functions Move load_superblock() declaration and the functions it calls before journal_init_common(). This is a preparation for moving a call to load_superblock() from jbd2_journal_load() and jbd2_journal_wipe() to journal_init_common(). No functional changes. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 337 +++++++++++++++++++++++----------------------- 1 file changed, 168 insertions(+), 169 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5c223032f77a..c3f968909618 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1336,6 +1336,174 @@ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, return count; } +/* + * If the journal init or create aborts, we need to mark the journal + * superblock as being NULL to prevent the journal destroy from writing + * back a bogus superblock. + */ +static void journal_fail_superblock(journal_t *journal) +{ + struct buffer_head *bh = journal->j_sb_buffer; + brelse(bh); + journal->j_sb_buffer = NULL; +} + +/* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ +static int journal_get_superblock(journal_t *journal) +{ + struct buffer_head *bh; + journal_superblock_t *sb; + int err; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (buffer_verified(bh)) + return 0; + + err = bh_read(bh, 0); + if (err < 0) { + printk(KERN_ERR + "JBD2: IO error reading journal superblock\n"); + goto out; + } + + sb = journal->j_superblock; + + err = -EINVAL; + + if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || + sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { + printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); + goto out; + } + + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { + printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); + goto out; + } + + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + printk(KERN_WARNING "JBD2: journal file too short\n"); + goto out; + } + + if (be32_to_cpu(sb->s_first) == 0 || + be32_to_cpu(sb->s_first) >= journal->j_total_len) { + printk(KERN_WARNING + "JBD2: Invalid start block of journal: %u\n", + be32_to_cpu(sb->s_first)); + goto out; + } + + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + goto out; + } + + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + goto out; + } + + if (!jbd2_verify_csum_type(journal, sb)) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + goto out; + } + + /* Load the checksum driver */ + if (jbd2_journal_has_csum_v2or3_feature(journal)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + err = PTR_ERR(journal->j_chksum_driver); + journal->j_chksum_driver = NULL; + goto out; + } + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + goto out; + } + } + set_buffer_verified(bh); + return 0; + +out: + journal_fail_superblock(journal); + return err; +} + +static int journal_revoke_records_per_block(journal_t *journal) +{ + int record_size; + int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); + + if (jbd2_has_feature_64bit(journal)) + record_size = 8; + else + record_size = 4; + + if (jbd2_journal_has_csum_v2or3(journal)) + space -= sizeof(struct jbd2_journal_block_tail); + return space / record_size; +} + +/* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ +static int load_superblock(journal_t *journal) +{ + int err; + journal_superblock_t *sb; + int num_fc_blocks; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); + journal->j_tail = be32_to_cpu(sb->s_start); + journal->j_first = be32_to_cpu(sb->s_first); + journal->j_errno = be32_to_cpu(sb->s_errno); + journal->j_last = be32_to_cpu(sb->s_maxlen); + + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + + if (jbd2_has_feature_fast_commit(journal)) { + journal->j_fc_last = be32_to_cpu(sb->s_maxlen); + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); + if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) + journal->j_last = journal->j_fc_last - num_fc_blocks; + journal->j_fc_first = journal->j_last + 1; + journal->j_fc_off = 0; + } + + return 0; +} + + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1521,18 +1689,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return journal; } -/* - * If the journal init or create aborts, we need to mark the journal - * superblock as being NULL to prevent the journal destroy from writing - * back a bogus superblock. - */ -static void journal_fail_superblock(journal_t *journal) -{ - struct buffer_head *bh = journal->j_sb_buffer; - brelse(bh); - journal->j_sb_buffer = NULL; -} - /* * Given a journal_t structure, initialise the various fields for * startup of a new journaling session. We use this both when creating @@ -1889,163 +2045,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal) } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); -static int journal_revoke_records_per_block(journal_t *journal) -{ - int record_size; - int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t); - - if (jbd2_has_feature_64bit(journal)) - record_size = 8; - else - record_size = 4; - - if (jbd2_journal_has_csum_v2or3(journal)) - space -= sizeof(struct jbd2_journal_block_tail); - return space / record_size; -} - -/* - * Read the superblock for a given journal, performing initial - * validation of the format. - */ -static int journal_get_superblock(journal_t *journal) -{ - struct buffer_head *bh; - journal_superblock_t *sb; - int err; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - if (buffer_verified(bh)) - return 0; - - err = bh_read(bh, 0); - if (err < 0) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } - - sb = journal->j_superblock; - - err = -EINVAL; - - if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || - sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { - printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); - goto out; - } - - if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && - be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { - printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); - goto out; - } - - if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { - printk(KERN_WARNING "JBD2: journal file too short\n"); - goto out; - } - - if (be32_to_cpu(sb->s_first) == 0 || - be32_to_cpu(sb->s_first) >= journal->j_total_len) { - printk(KERN_WARNING - "JBD2: Invalid start block of journal: %u\n", - be32_to_cpu(sb->s_first)); - goto out; - } - - if (jbd2_has_feature_csum2(journal) && - jbd2_has_feature_csum3(journal)) { - /* Can't have checksum v2 and v3 at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " - "at the same time!\n"); - goto out; - } - - if (jbd2_journal_has_csum_v2or3_feature(journal) && - jbd2_has_feature_checksum(journal)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " - "at the same time!\n"); - goto out; - } - - if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; - } - - /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3_feature(journal)) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); - err = PTR_ERR(journal->j_chksum_driver); - journal->j_chksum_driver = NULL; - goto out; - } - /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } - } - set_buffer_verified(bh); - return 0; - -out: - journal_fail_superblock(journal); - return err; -} - -/* - * Load the on-disk journal superblock and read the key fields into the - * journal_t. - */ - -static int load_superblock(journal_t *journal) -{ - int err; - journal_superblock_t *sb; - int num_fc_blocks; - - err = journal_get_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; - - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); - journal->j_tail = be32_to_cpu(sb->s_start); - journal->j_first = be32_to_cpu(sb->s_first); - journal->j_errno = be32_to_cpu(sb->s_errno); - journal->j_last = be32_to_cpu(sb->s_maxlen); - - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); - - if (jbd2_has_feature_fast_commit(journal)) { - journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); - if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) - journal->j_last = journal->j_fc_last - num_fc_blocks; - journal->j_fc_first = journal->j_last + 1; - journal->j_fc_off = 0; - } - - return 0; -} - - /** * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. From c30713084ba5b6fa343129613ec349ea91f0c458 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:00 +0800 Subject: [PATCH 34/60] jbd2: move load_superblock() into journal_init_common() Move the call to load_superblock() from jbd2_journal_load() and jbd2_journal_wipe() early into journal_init_common(), the journal superblock gets read and the in-memory journal_t structure gets initialised after calling jbd2_journal_init_{dev,inode}, it's safe to do following initialization according to it. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c3f968909618..98b43a9dcabe 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1582,6 +1582,10 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; + err = load_superblock(journal); + if (err) + goto err_cleanup; + journal->j_shrink_transaction = NULL; journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; journal->j_shrinker.count_objects = jbd2_journal_shrink_count; @@ -2056,13 +2060,7 @@ EXPORT_SYMBOL(jbd2_journal_update_sb_errno); int jbd2_journal_load(journal_t *journal) { int err; - journal_superblock_t *sb; - - err = load_superblock(journal); - if (err) - return err; - - sb = journal->j_superblock; + journal_superblock_t *sb = journal->j_superblock; /* * If this is a V2 superblock, then we have to check the @@ -2523,10 +2521,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) J_ASSERT (!(journal->j_flags & JBD2_LOADED)); - err = load_superblock(journal); - if (err) - return err; - if (!journal->j_tail) goto no_recovery; From 9600f3e5cfd0360b10c271149032c77917baedc5 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:01 +0800 Subject: [PATCH 35/60] jbd2: don't load superblock in jbd2_journal_check_used_features() Since load_superblock() has been moved to journal_init_common(), the in-memory superblock structure is initialized and contains valid data once the file system has a journal_t object, so it's safe to access it, let's drop the call to journal_get_superblock() from jbd2_journal_check_used_features() and also drop the setting/clearing of the veirfy bit of the superblock buffer. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 98b43a9dcabe..95499b3184aa 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1361,8 +1361,6 @@ static int journal_get_superblock(journal_t *journal) bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); - if (buffer_verified(bh)) - return 0; err = bh_read(bh, 0); if (err < 0) { @@ -1437,7 +1435,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } } - set_buffer_verified(bh); return 0; out: @@ -2226,8 +2223,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - if (journal_get_superblock(journal)) - return 0; if (!jbd2_format_support_feature(journal)) return 0; From e4adf8b837087b5bb57fff6827e10ec877a50f64 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:02 +0800 Subject: [PATCH 36/60] jbd2: checking valid features early in journal_get_superblock() journal_get_superblock() is used to check validity of the jounal supberblock, so move the features checks from jbd2_journal_load() to journal_get_superblock(). Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-5-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 95499b3184aa..4d4494b42b39 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1398,6 +1398,21 @@ static int journal_get_superblock(journal_t *journal) goto out; } + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (!jbd2_format_support_feature(journal)) + return 0; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { + printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); + goto out; + } + if (jbd2_has_feature_csum2(journal) && jbd2_has_feature_csum3(journal)) { /* Can't have checksum v2 and v3 at the same time! */ @@ -2059,21 +2074,6 @@ int jbd2_journal_load(journal_t *journal) int err; journal_superblock_t *sb = journal->j_superblock; - /* - * If this is a V2 superblock, then we have to check the - * features flags on it. - */ - if (jbd2_format_support_feature(journal)) { - if ((sb->s_feature_ro_compat & - ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || - (sb->s_feature_incompat & - ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { - printk(KERN_WARNING - "JBD2: Unrecognised features on journal\n"); - return -EINVAL; - } - } - /* * Create a slab for this blocksize */ From 18dad509e7bd3189ac1e7f7904faf1561a908871 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:03 +0800 Subject: [PATCH 37/60] jbd2: open code jbd2_verify_csum_type() helper jbd2_verify_csum_type() helper check checksum type in the superblock for v2 or v3 checksum feature, it always return true if these features are not enabled, and it has only one user, so open code it is more clear. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 4d4494b42b39..5d4744203e39 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -115,14 +115,6 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3_feature(j)) - return 1; - - return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; -} - static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) { __u32 csum; @@ -1429,13 +1421,13 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (!jbd2_verify_csum_type(journal, sb)) { - printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; - } - /* Load the checksum driver */ if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { + printk(KERN_ERR "JBD2: Unknown checksum type\n"); + goto out; + } + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); From 054d9c8fef14d476f1a9c6434de86813c5990052 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:04 +0800 Subject: [PATCH 38/60] jbd2: cleanup load_superblock() Rename load_superblock() to journal_load_superblock(), move getting and reading superblock from journal_init_common() and journal_get_superblock() to this function, and also rename journal_get_superblock() to journal_check_superblock(), make it a pure check helper to check superblock validity from disk. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-7-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 85 +++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5d4744203e39..89f9eb35323d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1341,45 +1341,29 @@ static void journal_fail_superblock(journal_t *journal) } /* - * Read the superblock for a given journal, performing initial + * Check the superblock for a given journal, performing initial * validation of the format. */ -static int journal_get_superblock(journal_t *journal) +static int journal_check_superblock(journal_t *journal) { - struct buffer_head *bh; - journal_superblock_t *sb; - int err; - - bh = journal->j_sb_buffer; - - J_ASSERT(bh != NULL); - - err = bh_read(bh, 0); - if (err < 0) { - printk(KERN_ERR - "JBD2: IO error reading journal superblock\n"); - goto out; - } - - sb = journal->j_superblock; - - err = -EINVAL; + journal_superblock_t *sb = journal->j_superblock; + int err = -EINVAL; if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) { printk(KERN_WARNING "JBD2: no valid journal superblock found\n"); - goto out; + return err; } if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); - goto out; + return err; } if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); - goto out; + return err; } if (be32_to_cpu(sb->s_first) == 0 || @@ -1387,7 +1371,7 @@ static int journal_get_superblock(journal_t *journal) printk(KERN_WARNING "JBD2: Invalid start block of journal: %u\n", be32_to_cpu(sb->s_first)); - goto out; + return err; } /* @@ -1402,7 +1386,7 @@ static int journal_get_superblock(journal_t *journal) (sb->s_feature_incompat & ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) { printk(KERN_WARNING "JBD2: Unrecognised features on journal\n"); - goto out; + return err; } if (jbd2_has_feature_csum2(journal) && @@ -1410,7 +1394,7 @@ static int journal_get_superblock(journal_t *journal) /* Can't have checksum v2 and v3 at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " "at the same time!\n"); - goto out; + return err; } if (jbd2_journal_has_csum_v2or3_feature(journal) && @@ -1418,14 +1402,14 @@ static int journal_get_superblock(journal_t *journal) /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " "at the same time!\n"); - goto out; + return err; } /* Load the checksum driver */ if (jbd2_journal_has_csum_v2or3_feature(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); - goto out; + return err; } journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); @@ -1433,20 +1417,17 @@ static int journal_get_superblock(journal_t *journal) printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); err = PTR_ERR(journal->j_chksum_driver); journal->j_chksum_driver = NULL; - goto out; + return err; } /* Check superblock checksum */ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; - goto out; + return err; } } - return 0; -out: - journal_fail_superblock(journal); - return err; + return 0; } static int journal_revoke_records_per_block(journal_t *journal) @@ -1468,17 +1449,31 @@ static int journal_revoke_records_per_block(journal_t *journal) * Load the on-disk journal superblock and read the key fields into the * journal_t. */ -static int load_superblock(journal_t *journal) +static int journal_load_superblock(journal_t *journal) { int err; + struct buffer_head *bh; journal_superblock_t *sb; int num_fc_blocks; - err = journal_get_superblock(journal); - if (err) - return err; + bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, + journal->j_blocksize); + if (bh) + err = bh_read(bh, 0); + if (!bh || err < 0) { + pr_err("%s: Cannot read journal superblock\n", __func__); + brelse(bh); + return -EIO; + } - sb = journal->j_superblock; + journal->j_sb_buffer = bh; + sb = (journal_superblock_t *)bh->b_data; + journal->j_superblock = sb; + err = journal_check_superblock(journal); + if (err) { + journal_fail_superblock(journal); + return err; + } journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); journal->j_tail = be32_to_cpu(sb->s_start); @@ -1524,7 +1519,6 @@ static journal_t *journal_init_common(struct block_device *bdev, static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; - struct buffer_head *bh; int n; journal = kzalloc(sizeof(*journal), GFP_KERNEL); @@ -1577,16 +1571,7 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; - bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize); - if (!bh) { - pr_err("%s: Cannot get buffer for journal superblock\n", - __func__); - goto err_cleanup; - } - journal->j_sb_buffer = bh; - journal->j_superblock = (journal_superblock_t *)bh->b_data; - - err = load_superblock(journal); + err = journal_load_superblock(journal); if (err) goto err_cleanup; From 0dbc759ae9971568af24def1b01d5b1aa87bd546 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:05 +0800 Subject: [PATCH 39/60] jbd2: add fast_commit space check If JBD2_FEATURE_INCOMPAT_FAST_COMMIT bit is set, it means the journal have fast commit records need to recover, so the fast commit size should not be too large, and the leftover normal journal size should never less than JBD2_MIN_JOURNAL_BLOCKS. If it happens, the journal->j_last is likely to be wrong and will probably lead to incorrect journal recovery. So add a check into the journal_check_superblock(), and drop the pointless check when initializing the fastcommit parameters. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-8-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 89f9eb35323d..ef9d75cca362 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1347,6 +1347,7 @@ static void journal_fail_superblock(journal_t *journal) static int journal_check_superblock(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + int num_fc_blks; int err = -EINVAL; if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) || @@ -1389,6 +1390,15 @@ static int journal_check_superblock(journal_t *journal) return err; } + num_fc_blks = jbd2_has_feature_fast_commit(journal) ? + jbd2_journal_get_num_fc_blks(sb) : 0; + if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS || + be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) { + printk(KERN_ERR "JBD2: journal file too short %u,%d\n", + be32_to_cpu(sb->s_maxlen), num_fc_blks); + return err; + } + if (jbd2_has_feature_csum2(journal) && jbd2_has_feature_csum3(journal)) { /* Can't have checksum v2 and v3 at the same time! */ @@ -1454,7 +1464,6 @@ static int journal_load_superblock(journal_t *journal) int err; struct buffer_head *bh; journal_superblock_t *sb; - int num_fc_blocks; bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset, journal->j_blocksize); @@ -1492,9 +1501,8 @@ static int journal_load_superblock(journal_t *journal) if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); - if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) - journal->j_last = journal->j_fc_last - num_fc_blocks; + journal->j_last = journal->j_fc_last - + jbd2_journal_get_num_fc_blks(sb); journal->j_fc_first = journal->j_last + 1; journal->j_fc_off = 0; } From 49887e47a5262cc7b87d547de57a21a072c6ea5e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:06 +0800 Subject: [PATCH 40/60] jbd2: cleanup journal_init_common() Adjust the initialization sequence and error handle of journal_t, moving load superblock to the begin, and classify others initialization. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-9-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ef9d75cca362..04b67844118c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1533,6 +1533,16 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal) return NULL; + journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_total_len = len; + + err = journal_load_superblock(journal); + if (err) + goto err_cleanup; + init_waitqueue_head(&journal->j_wait_transaction_locked); init_waitqueue_head(&journal->j_wait_done_commit); init_waitqueue_head(&journal->j_wait_commit); @@ -1544,12 +1554,15 @@ static journal_t *journal_init_common(struct block_device *bdev, mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); + spin_lock_init(&journal->j_history_lock); rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; journal->j_max_batch_time = 15000; /* 15ms */ atomic_set(&journal->j_reserved_credits, 0); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1559,18 +1572,10 @@ static journal_t *journal_init_common(struct block_device *bdev, if (err) goto err_cleanup; - spin_lock_init(&journal->j_history_lock); - - lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", - &jbd2_trans_commit_key, 0); - - /* journal descriptor can store up to n blocks -bzzz */ - journal->j_blocksize = blocksize; - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_total_len = len; - /* We need enough buffers to write out full descriptor block. */ + /* + * journal descriptor can store up to n blocks, we need enough + * buffers to write out full descriptor block. + */ n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1579,7 +1584,8 @@ static journal_t *journal_init_common(struct block_device *bdev, if (!journal->j_wbuf) goto err_cleanup; - err = journal_load_superblock(journal); + err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0, + GFP_KERNEL); if (err) goto err_cleanup; @@ -1588,21 +1594,18 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_shrinker.count_objects = jbd2_journal_shrink_count; journal->j_shrinker.seeks = DEFAULT_SEEKS; journal->j_shrinker.batch = journal->j_max_transaction_buffers; - - if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (err) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { - percpu_counter_destroy(&journal->j_checkpoint_jh_count); - goto err_cleanup; - } return journal; err_cleanup: - brelse(journal->j_sb_buffer); + percpu_counter_destroy(&journal->j_checkpoint_jh_count); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); + journal_fail_superblock(journal); kfree(journal); return NULL; } From d9a45496019a73c240bd22912ae18a04b8496364 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:07 +0800 Subject: [PATCH 41/60] jbd2: drop useless error tag in jbd2_journal_wipe() no_recovery is redundant, just drop it. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-10-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 04b67844118c..6482fcca3dc6 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2500,12 +2500,12 @@ out: int jbd2_journal_wipe(journal_t *journal, int write) { - int err = 0; + int err; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); if (!journal->j_tail) - goto no_recovery; + return 0; printk(KERN_WARNING "JBD2: %s recovery information on journal\n", write ? "Clearing" : "Ignoring"); @@ -2518,7 +2518,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) mutex_unlock(&journal->j_checkpoint_mutex); } - no_recovery: return err; } From 8e6cf5fbb7b47d337998faab3fcacdceaa547ead Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:08 +0800 Subject: [PATCH 42/60] jbd2: jbd2_journal_init_{dev,inode} return proper error return value Current jbd2_journal_init_{dev,inode} return NULL if some error happens, make them to pass out proper error return value. [ Fix from Yang Yingliang folded in. ] Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-11-yi.zhang@huaweicloud.com Link: https://lore.kernel.org/r/20230822030018.644419-1-yangyingliang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- fs/jbd2/journal.c | 19 +++++++++---------- fs/ocfs2/journal.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4613264344b0..279e37c3b275 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5815,7 +5815,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return NULL; @@ -5894,7 +5894,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, journal = jbd2_journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6482fcca3dc6..15e33c26c6cd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1531,7 +1531,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - return NULL; + return ERR_PTR(-ENOMEM); journal->j_blocksize = blocksize; journal->j_dev = bdev; @@ -1576,6 +1576,7 @@ static journal_t *journal_init_common(struct block_device *bdev, * journal descriptor can store up to n blocks, we need enough * buffers to write out full descriptor block. */ + err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1607,7 +1608,7 @@ err_cleanup: jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); kfree(journal); - return NULL; + return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1640,8 +1641,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); @@ -1667,11 +1668,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) blocknr = 0; err = bmap(inode, &blocknr); - if (err || !blocknr) { - pr_err("%s: Cannot locate journal superblock\n", - __func__); - return NULL; + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", @@ -1681,8 +1680,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 25d8072ccfce..1d2960e8ce74 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -911,9 +911,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -1687,9 +1687,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } From bc74e6a38d16d745a9bc28a7e343494019066492 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:09 +0800 Subject: [PATCH 43/60] ext4: cleanup ext4_get_dev_journal() and ext4_get_journal() Factor out a new helper form ext4_get_dev_journal() to get external journal bdev and check validation of this device, drop ext4_blkdev_get() helper, and also remove duplicate check of journal feature. It makes ext4_get_dev_journal() more clear than before. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-12-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 109 ++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 60 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 279e37c3b275..0eee238b290e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1105,26 +1105,6 @@ static const struct blk_holder_ops ext4_holder_ops = { .mark_dead = ext4_bdev_mark_dead, }; -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - /* * Release the journal device */ @@ -5768,14 +5748,14 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return NULL; } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5807,9 +5787,6 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); if (!journal_inode) return NULL; @@ -5826,25 +5803,25 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - - bdev = ext4_blkdev_get(j_dev, sb); - if (bdev == NULL) + bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, + &ext4_holder_ops); + if (IS_ERR(bdev)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); return NULL; + } blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); @@ -5857,7 +5834,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); goto out_bdev; @@ -5867,56 +5845,67 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev; - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); +out_bh: + brelse(bh); +out_bdev: + blkdev_put(bdev, sb); + return NULL; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct block_device *journal_bdev; + + journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (!journal_bdev) + return NULL; + + journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev = journal_bdev; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(bdev, sb); + blkdev_put(journal_bdev, sb); return NULL; } From ee5c807137ce283acebd83297f8855428cdd839a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:10 +0800 Subject: [PATCH 44/60] ext4: ext4_get_{dev}_journal return proper error value ext4_get_journal() and ext4_get_dev_journal() return NULL if they failed to init journal, making them return proper error value instead, also rename them to ext4_open_{inode,dev}_journal(). [ Folded fix to ext4_calculate_overhead() to check for an ERR_PTR instead of NULL. ] Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-13-yi.zhang@huaweicloud.com Reported-by: syzbot+b3123e6d9842e526de39@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20230826011029.2023140-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 53 +++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0eee238b290e..6edf7deeb2db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4200,7 +4200,7 @@ int ext4_calculate_overhead(struct super_block *sb) else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -5740,18 +5740,18 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", @@ -5781,21 +5781,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5813,6 +5813,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; + int errno; bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, &ext4_holder_ops); @@ -5820,7 +5821,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); - return NULL; + return ERR_CAST(bdev); } blocksize = sb->s_blocksize; @@ -5828,6 +5829,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } @@ -5838,6 +5840,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5846,6 +5849,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; goto out_bh; } @@ -5853,11 +5857,13 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + errno = -EFSCORRUPTED; goto out_bh; } @@ -5870,31 +5876,34 @@ out_bh: brelse(bh); out_bdev: blkdev_put(bdev, sb); - return NULL; + return ERR_PTR(errno); } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct block_device *journal_bdev; + int errno = 0; journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (!journal_bdev) - return NULL; + if (IS_ERR(journal_bdev)) + return ERR_CAST(journal_bdev); journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } journal->j_private = sb; @@ -5906,7 +5915,7 @@ out_journal: jbd2_journal_destroy(journal); out_bdev: blkdev_put(journal_bdev, sb); - return NULL; + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5938,13 +5947,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); From 2dfba3bb40ad8536b9fa802364f2d40da31aa88e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 26 Jun 2023 15:33:22 +0800 Subject: [PATCH 45/60] jbd2: correct the end of the journal recovery scan range We got a filesystem inconsistency issue below while running generic/475 I/O failure pressure test with fast_commit feature enabled. Symlink /p3/d3/d1c/d6c/dd6/dce/l101 (inode #132605) is invalid. If fast_commit feature is enabled, a special fast_commit journal area is appended to the end of the normal journal area. The journal->j_last point to the first unused block behind the normal journal area instead of the whole log area, and the journal->j_fc_last point to the first unused block behind the fast_commit journal area. While doing journal recovery, do_one_pass(PASS_SCAN) should first scan the normal journal area and turn around to the first block once it meet journal->j_last, but the wrap() macro misuse the journal->j_fc_last, so the recovering could not read the next magic block (commit block perhaps) and would end early mistakenly and missing tN and every transaction after it in the following example. Finally, it could lead to filesystem inconsistency. | normal journal area | fast commit area | +-------------------------------------------------+------------------+ | tN(rere) | tN+1 |~| tN-x |...| tN-1 | tN(front) | .... | +-------------------------------------------------+------------------+ / / / start journal->j_last journal->j_fc_last This patch fix it by use the correct ending journal->j_last. Fixes: 5b849b5f96b4 ("jbd2: fast commit recovery path") Cc: stable@kernel.org Reported-by: Theodore Ts'o Link: https://lore.kernel.org/linux-ext4/20230613043120.GB1584772@mit.edu/ Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230626073322.3956567-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0184931d47f7..c269a7d29a46 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -230,12 +230,8 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) /* Make sure we wrap around the log correctly! */ #define wrap(journal, var) \ do { \ - unsigned long _wrap_last = \ - jbd2_has_feature_fast_commit(journal) ? \ - (journal)->j_fc_last : (journal)->j_last; \ - \ - if (var >= _wrap_last) \ - var -= (_wrap_last - (journal)->j_first); \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ } while (0) static int fc_do_one_pass(journal_t *journal, @@ -524,9 +520,7 @@ static int do_one_pass(journal_t *journal, break; jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", - next_commit_ID, next_log_block, - jbd2_has_feature_fast_commit(journal) ? - journal->j_fc_last : journal->j_last); + next_commit_ID, next_log_block, journal->j_last); /* Skip over each chunk of the transaction looking * either the next descriptor block or the final commit From 1524773425ae8113b0b782886366e68656b34e53 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 28 Jun 2023 21:20:11 +0800 Subject: [PATCH 46/60] ext4: fix unttached inode after power cut with orphan file feature enabled Running generic/475(filesystem consistent tests after power cut) could easily trigger unattached inode error while doing fsck: Unattached zero-length inode 39405. Clear? no Unattached inode 39405 Connect to /lost+found? no Above inconsistence is caused by following process: P1 P2 ext4_create inode = ext4_new_inode_start_handle // itable records nlink=1 ext4_add_nondir err = ext4_add_entry // ENOSPC ext4_append ext4_bread ext4_getblk ext4_map_blocks // returns ENOSPC drop_nlink(inode) // won't be updated into disk inode ext4_orphan_add(handle, inode) ext4_orphan_file_add ext4_journal_stop(handle) jbd2_journal_commit_transaction // commit success >> power cut << ext4_fill_super ext4_load_and_init_journal // itable records nlink=1 ext4_orphan_cleanup ext4_process_orphan if (inode->i_nlink) // true, inode won't be deleted Then, allocated inode will be reserved on disk and corresponds to no dentries, so e2fsck reports 'unattached inode' problem. The problem won't happen if orphan file feature is disabled, because ext4_orphan_add() will update disk inode in orphan list mode. There are several places not updating disk inode while putting inode into orphan area, such as ext4_add_nondir(), ext4_symlink() and whiteout in ext4_rename(). Fix it by updating inode into disk in all error branches of these places. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217605 Fixes: 02f310fcf47f ("ext4: Speedup ext4 orphan inode handling") Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230628132011.650383-1-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6298cfaaa0bd..9a13431656cd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2799,6 +2799,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -3436,6 +3437,7 @@ retry: err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -4021,6 +4023,7 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); From 89cadf6e22a958014d09c901caf0cd2105780dbe Mon Sep 17 00:00:00 2001 From: Lu Hongfei Date: Fri, 7 Jul 2023 18:55:16 +0800 Subject: [PATCH 47/60] ext4: change the type of blocksize in ext4_mb_init_cache() The return value type of i_blocksize() is 'unsigned int', so the type of blocksize has been modified from 'int' to 'unsigned int' to ensure data type consistency. Signed-off-by: Lu Hongfei Link: https://lore.kernel.org/r/20230707105516.9156-1-luhongfei@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3d5b0b71d7f5..96068d687d9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1256,7 +1256,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; + unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; From 79ebf48c44b5ba05a98af23f8830883daf36f4d3 Mon Sep 17 00:00:00 2001 From: Lu Hongfei Date: Fri, 7 Jul 2023 19:59:07 +0800 Subject: [PATCH 48/60] ext4: use sbi instead of EXT4_SB(sb) in ext4_mb_new_blocks_simple() Signed-off-by: Lu Hongfei Link: https://lore.kernel.org/r/20230707115907.26637-1-luhongfei@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 96068d687d9d..a807e8bf8664 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6092,7 +6092,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || From a50bda147421e24c1a5d47ddcc0675360b7cb3ac Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 25 Jul 2023 12:33:11 +0800 Subject: [PATCH 49/60] ext4: mballoc: avoid garbage value from err clang's static analysis warning: fs/ext4/mballoc.c line 4178, column 6, Branch condition evaluates to a garbage value. err is uninitialized and will be judged when 'len <= 0' or it first enters the loop while the condition "!ext4_sb_block_valid()" is true. Although this can't make problems now, it's better to correct it. Signed-off-by: Su Hui Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20230725043310.1227621-1-suhui@nfschina.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a807e8bf8664..1e4c667812a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4087,7 +4087,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; + int i, err = 0; int already; unsigned int clen, clen_changed, thisgrp_len; From b6c7d6dc8aebc04cefd342d6cccd24932be37d12 Mon Sep 17 00:00:00 2001 From: Cai Xinchen Date: Wed, 2 Aug 2023 03:00:25 +0000 Subject: [PATCH 50/60] ext4: remove unused function declaration These functions do not have its function implementation. So those function declaration is useless. Remove these Signed-off-by: Cai Xinchen Link: https://lore.kernel.org/r/20230802030025.173148-1-caixinchen1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb4d914ea888..ae458cde55d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2697,7 +2697,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2853,7 +2852,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2896,7 +2894,6 @@ extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); @@ -2976,7 +2973,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -3524,8 +3520,6 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); From 68228da51c9a436872a4ef4b5a7692e29f7e5bc7 Mon Sep 17 00:00:00 2001 From: Wang Jianjian Date: Thu, 3 Aug 2023 00:28:39 +0800 Subject: [PATCH 51/60] ext4: add correct group descriptors and reserved GDT blocks to system zone When setup_system_zone, flex_bg is not initialized so it is always 1. Use a new helper function, ext4_num_base_meta_blocks() which does not depend on sbi->s_log_groups_per_flex being initialized. [ Squashed two patches in the Link URL's below together into a single commit, which is simpler to review/understand. Also fix checkpatch warnings. --TYT ] Cc: stable@kernel.org Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/tencent_21AF0D446A9916ED5C51492CC6C9A0A77B05@qq.com Link: https://lore.kernel.org/r/tencent_D744D1450CC169AEA77FCF0A64719909ED05@qq.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 15 +++++++++++---- fs/ext4/block_validity.c | 8 ++++---- fs/ext4/ext4.h | 2 ++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1f72f977c6db..79b20d6ae39e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata clusters at + * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; @@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } - return EXT4_NUM_B2C(sbi, num); + return num; } + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5504f72bbbbe..6fe3c941b565 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb) struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); @@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb) return -ENOMEM; for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + cond_resched(); - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) { + if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1, 0); + meta_blks, 0); if (ret) goto err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae458cde55d1..2c2c3191bf41 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3079,6 +3079,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, From 194505b55dd7899da114a4d47825204eefc0fff5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 10 Aug 2023 12:55:59 -0400 Subject: [PATCH 52/60] ext4: drop dio overwrite only flag and associated warning The commit referenced below opened up concurrent unaligned dio under shared locking for pure overwrites. In doing so, it enabled use of the IOMAP_DIO_OVERWRITE_ONLY flag and added a warning on unexpected -EAGAIN returns as an extra precaution, since ext4 does not retry writes in such cases. The flag itself is advisory in this case since ext4 checks for unaligned I/Os and uses appropriate locking up front, rather than on a retry in response to -EAGAIN. As it turns out, the warning check is susceptible to false positives because there are scenarios where -EAGAIN can be expected from lower layers without necessarily having IOCB_NOWAIT set on the iocb. For example, one instance of the warning has been seen where io_uring sets IOCB_HIPRI, which in turn results in REQ_POLLED|REQ_NOWAIT on the bio. This results in -EAGAIN if the block layer is unable to allocate a request, etc. [Note that there is an outstanding patch to untangle REQ_POLLED and REQ_NOWAIT such that the latter relies on IOCB_NOWAIT, which would also address this instance of the warning.] Another instance of the warning has been reproduced by syzbot. A dio write is interrupted down in __get_user_pages_locked() waiting on the mm lock and returns -EAGAIN up the stack. If the iomap dio iteration layer has made no progress on the write to this point, -EAGAIN returns up to the filesystem and triggers the warning. This use of the overwrite flag in ext4 is precautionary and half-baked. I.e., ext4 doesn't actually implement overwrite checking in the iomap callbacks when the flag is set, so the only extra verification it provides are i_size checks in the generic iomap dio layer. Combined with the tendency for false positives, the added verification is not worth the extra trouble. Remove the flag, associated warning, and update the comments to document when concurrent unaligned dio writes are allowed and why said flag is not used. Cc: stable@kernel.org Reported-by: syzbot+5050ad0fb47527b1808a@syzkaller.appspotmail.com Reported-by: Pengfei Xu Fixes: 310ee0902b8d ("ext4: allow concurrent unaligned dio overwrites") Signed-off-by: Brian Foster Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230810165559.946222-1-bfoster@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2071b1e4322c..e99cc17b6bd2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -476,6 +476,11 @@ restart: * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +497,12 @@ restart: /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. */ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; From ff0722de896eb278fca193888d22278c28f2782c Mon Sep 17 00:00:00 2001 From: Vitaliy Kuznetsov Date: Thu, 10 Aug 2023 18:38:52 +0400 Subject: [PATCH 53/60] ext4: add periodic superblock update check This patch introduces a mechanism to periodically check and update the superblock within the ext4 file system. The main purpose of this patch is to keep the disk superblock up to date. The update will be performed if more than one hour has passed since the last update, and if more than 16MB of data have been written to disk. This check and update is performed within the ext4_journal_commit_callback function, ensuring that the superblock is written while the disk is active, rather than based on a timer that may trigger during disk idle periods. Discussion https://www.spinics.net/lists/linux-ext4/msg85865.html Signed-off-by: Vitaliy Kuznetsov Link: https://lore.kernel.org/r/20230810143852.40228-1-vk.en.mail@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6edf7deeb2db..bf0cfdffa9d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -433,6 +433,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than an hour has passed since the last superblock update, and + * 2. More than 16MB have been written since the last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || + !journal || (journal->j_flags & JBD2_UNMOUNT)) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + schedule_work(&EXT4_SB(sb)->s_error_work); +} + /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -459,6 +510,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { @@ -715,6 +767,7 @@ static void flush_stashed_error_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -722,6 +775,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -735,7 +792,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: From bb15cea20f211e110150e528fca806f38d5789e0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 22 Aug 2023 23:43:38 -0400 Subject: [PATCH 54/60] ext4: rename s_error_work to s_sb_upd_work The most common use that s_error_work will get scheduled is now the periodic update of the superblock. So rename it to s_sb_upd_work. Also rename the function flush_stashed_error_work() to update_super_work(). Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 9 ++++++--- fs/ext4/super.c | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2c2c3191bf41..84618c46f239 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1698,10 +1698,13 @@ struct ext4_sb_info { const char *s_last_error_func; time64_t s_last_error_time; /* - * If we are in a context where we cannot update error information in - * the on-disk superblock, we queue this work to do it. + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. */ - struct work_struct s_error_work; + struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf0cfdffa9d0..91f20afa1d71 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -481,7 +481,7 @@ static void ext4_maybe_update_superblock(struct super_block *sb) diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* @@ -723,7 +723,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * defer superblock flushing to a workqueue. */ if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -750,10 +750,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_flags |= SB_RDONLY; } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -1078,7 +1078,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1318,10 +1318,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); @@ -1333,7 +1333,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); @@ -4998,8 +4998,8 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before destroying the journal. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; @@ -5322,7 +5322,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) @@ -5666,16 +5666,16 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before journal destroy. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); @@ -6551,7 +6551,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_forced_shutdown(sb)) { From 03de20bed203b0819225d4de98353c1f8755a1dd Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 10 Aug 2023 23:43:33 +0800 Subject: [PATCH 55/60] ext4: do not mark inode dirty every time when appending using delalloc In the delalloc append write scenario, if inode's i_size is extended due to buffer write, there are delalloc writes pending in the range up to i_size, and no need to touch i_disksize since writeback will push i_disksize up to i_size eventually. Offers significant performance improvement in high-frequency append write scenarios. I conducted tests in my 32-core environment by launching 32 concurrent threads to append write to the same file. Each write operation had a length of 1024 bytes and was repeated 100000 times. Without using this patch, the test was completed in 7705 ms. However, with this patch, the test was completed in 5066 ms, resulting in a performance improvement of 34%. Moreover, in test scenarios of Kafka version 2.6.2, using packet size of 2K, with this patch resulted in a 10% performance improvement. Signed-off-by: Liu Song Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230810154333.84921-1-liusong@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 88 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 26 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b9003840bc1..c5d8f8933c8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2935,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks and update i_disksize. + */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2961,30 +3020,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* From 0f6bc57971c63f7352c3564d19a5dc707fe8332a Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Sat, 12 Aug 2023 15:18:39 +0800 Subject: [PATCH 56/60] ext4: use LIST_HEAD() to initialize the list_head in mballoc.c Use LIST_HEAD() to initialize the list_head instead of open-coding it. Signed-off-by: Ruan Jinjie Link: https://lore.kernel.org/r/20230812071839.3481909-1-ruanjinjie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1e4c667812a9..c91db9f57524 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3504,11 +3504,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3882,12 +3881,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; + LIST_HEAD(freed_data_list); struct list_head *cut_pos = NULL; bool wake; - INIT_LIST_HEAD(&freed_data_list); - spin_lock(&sbi->s_md_lock); list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { if (entry->efd_tid != commit_tid) @@ -5414,7 +5411,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5443,7 +5440,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5524,7 +5520,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; @@ -5541,8 +5537,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active), needed); - INIT_LIST_HEAD(&list); - if (needed == 0) needed = UINT_MAX; @@ -5858,13 +5852,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, From 8216776ccff6fcd40e3fdaa109aa4150ebe760b3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:01 -0700 Subject: [PATCH 57/60] ext4: reject casefold inode flag without casefold feature It is invalid for the casefold inode flag to be set without the casefold superblock feature flag also being set. e2fsck already considers this case to be invalid and handles it by offering to clear the casefold flag on the inode. __ext4_iget() also already considered this to be invalid, sort of, but it only got so far as logging an error message; it didn't actually reject the inode. Make it reject the inode so that other code doesn't have to handle this case. This matches what f2fs does. Note: we could check 's_encoding != NULL' instead of ext4_has_feature_casefold(). This would make the check robust against the casefold feature being enabled by userspace writing to the page cache of the mounted block device. However, it's unsolvable in general for filesystems to be robust against concurrent writes to the page cache of the mounted block device. Though this very particular scenario involving the casefold feature is solvable, we should not pretend that we can support this model, so let's just check the casefold feature. tune2fs already forbids enabling casefold on a mounted filesystem. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5d8f8933c8c..6c490f05e2ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4974,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; From b81427939590450172716093dafdda8ef52e020f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:02 -0700 Subject: [PATCH 58/60] ext4: remove redundant checks of s_encoding Now that ext4 does not allow inodes with the casefold flag to be instantiated when unsupported, it's unnecessary to repeatedly check for support later on during random filesystem operations. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/hash.c | 2 +- fs/ext4/namei.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa1..deabe29da7fb 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9a13431656cd..c0f0b4e2413b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1445,7 +1445,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; @@ -1496,7 +1496,7 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, @@ -2393,7 +2393,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif From af494af38580a35b92f921639a60630a2307bcc2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:03 -0700 Subject: [PATCH 59/60] libfs: remove redundant checks of s_encoding Now that neither ext4 nor f2fs allows inodes with the casefold flag to be instantiated when unsupported, it's unnecessary to repeatedly check for support later on during random filesystem operations. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-4-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/libfs.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..5197ea8c66d3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1381,16 +1381,6 @@ bool is_empty_dir_inode(struct inode *inode) } #if IS_ENABLED(CONFIG_UNICODE) -/* - * Determine if the name of a dentry should be casefolded. - * - * Return: if names will need casefolding - */ -static bool needs_casefold(const struct inode *dir) -{ - return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding; -} - /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against @@ -1411,7 +1401,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, char strbuf[DNAME_INLINE_LEN]; int ret; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) goto fallback; /* * If the dentry name is stored in-line, then it may be concurrently @@ -1453,7 +1443,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct unicode_map *um = sb->s_encoding; int ret = 0; - if (!dir || !needs_casefold(dir)) + if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); From 768d612f79822d30a1e7d132a4d4b05337ce42ec Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 15 Aug 2023 15:08:08 +0800 Subject: [PATCH 60/60] ext4: fix slab-use-after-free in ext4_es_insert_extent() Yikebaer reported an issue: ================================================================== BUG: KASAN: slab-use-after-free in ext4_es_insert_extent+0xc68/0xcb0 fs/ext4/extents_status.c:894 Read of size 4 at addr ffff888112ecc1a4 by task syz-executor/8438 CPU: 1 PID: 8438 Comm: syz-executor Not tainted 6.5.0-rc5 #1 Call Trace: [...] kasan_report+0xba/0xf0 mm/kasan/report.c:588 ext4_es_insert_extent+0xc68/0xcb0 fs/ext4/extents_status.c:894 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] Allocated by task 8438: [...] kmem_cache_zalloc include/linux/slab.h:693 [inline] __es_alloc_extent fs/ext4/extents_status.c:469 [inline] ext4_es_insert_extent+0x672/0xcb0 fs/ext4/extents_status.c:873 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] Freed by task 8438: [...] kmem_cache_free+0xec/0x490 mm/slub.c:3823 ext4_es_try_to_merge_right fs/ext4/extents_status.c:593 [inline] __es_insert_extent+0x9f4/0x1440 fs/ext4/extents_status.c:802 ext4_es_insert_extent+0x2ca/0xcb0 fs/ext4/extents_status.c:882 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] ================================================================== The flow of issue triggering is as follows: 1. remove es raw es es removed es1 |-------------------| -> |----|.......|------| 2. insert es es insert es1 merge with es es1 merge with es and free es1 |----|.......|------| -> |------------|------| -> |-------------------| es merges with newes, then merges with es1, frees es1, then determines if es1->es_len is 0 and triggers a UAF. The code flow is as follows: ext4_es_insert_extent es1 = __es_alloc_extent(true); es2 = __es_alloc_extent(true); __es_remove_extent(inode, lblk, end, NULL, es1) __es_insert_extent(inode, &newes, es1) ---> insert es1 to es tree __es_insert_extent(inode, &newes, es2) ext4_es_try_to_merge_right ext4_es_free_extent(inode, es1) ---> es1 is freed if (es1 && !es1->es_len) // Trigger UAF by determining if es1 is used. We determine whether es1 or es2 is used immediately after calling __es_remove_extent() or __es_insert_extent() to avoid triggering a UAF if es1 or es2 is freed. Reported-by: Yikebaer Aizezi Closes: https://lore.kernel.org/lkml/CALcu4raD4h9coiyEBL4Bm0zjDwxC2CyPiTwsP3zFuhot6y9Beg@mail.gmail.com Fixes: 2a69c450083d ("ext4: using nofail preallocation in ext4_es_insert_extent()") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230815070808.3377171-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb4..6f7de14c0fa8 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -878,23 +878,29 @@ retry: err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) @@ -1491,8 +1497,12 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + /* Free preallocated extent if it didn't get used. */ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; @@ -2047,19 +2057,25 @@ retry: err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (allocated) __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2)