From dfc5d03f12e706c19ee37734184ea96582ef931d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: [PATCH 1/7] ext4: correct mount option parsing to detect when quota options can be changed We should not allow user to change quota mount options when quota is just suspended. It would make mount options and internal quota state inconsistent. Also we should not allow user to change quota format when quota is turned on. On the other hand we can just silently ignore when some option is set to the value it already has (mount does this on remount). Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52dd0679a4e2..686ebcc2e6c7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -979,7 +979,7 @@ static int parse_options (char *options, struct super_block *sb, int data_opt = 0; int option; #ifdef CONFIG_QUOTA - int qtype; + int qtype, qfmt; char *qname; #endif @@ -1162,7 +1162,9 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change journalled " "quota options when quota turned on.\n"); @@ -1200,7 +1202,9 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if (sb_any_quota_enabled(sb)) { + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " "journalled quota options when " "quota turned on.\n"); @@ -1213,10 +1217,20 @@ clear_qf_name: sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: - sbi->s_jquota_fmt = QFMT_VFS_OLD; - break; + qfmt = QFMT_VFS_OLD; + goto set_qf_format; case Opt_jqfmt_vfsv0: - sbi->s_jquota_fmt = QFMT_VFS_V0; + qfmt = QFMT_VFS_V0; +set_qf_format: + if ((sb_any_quota_enabled(sb) || + sb_any_quota_suspended(sb)) && + sbi->s_jquota_fmt != qfmt) { + printk(KERN_ERR "EXT4-fs: Cannot change " + "journaled quota options when " + "quota turned on.\n"); + return 0; + } + sbi->s_jquota_fmt = qfmt; break; case Opt_quota: case Opt_usrquota: From cd59e7b9781a35716b8a3e8c4aa2d48081d7daf7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: [PATCH 2/7] ext4: Fix mount messages when quota disabled When quota is disabled, we should not print 'journaled quota not supported' when user tried to mount non-journaled quota. Also fix typo in the message. Signed-off-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 686ebcc2e6c7..94a527261cae 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1255,6 +1255,9 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: + printk(KERN_ERR + "EXT4-fs: quota options not supported.\n"); + break; case Opt_usrjquota: case Opt_grpjquota: case Opt_offusrjquota: @@ -1262,7 +1265,7 @@ set_qf_format: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: printk(KERN_ERR - "EXT4-fs: journalled quota options not " + "EXT4-fs: journaled quota options not " "supported.\n"); break; case Opt_noquota: From 0623543b3335c8e439cacf21af99bbf45da42c5a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 19:11:51 -0400 Subject: [PATCH 3/7] ext4: fix synchronization of quota files in journal=data mode In journal=data mode, it is not enough to do write_inode_now as done in vfs_quota_on() to write all data to their final location (which is needed for quota_read to work correctly). Calling journal_flush() does its job. Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94a527261cae..cddf7f0e0fda 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3170,23 +3170,42 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota? */ - if ((!EXT4_SB(sb)->s_qf_names[USRQUOTA] && - !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + /* When remounting, no checks are needed and in fact, path is NULL */ + if (remount) return vfs_quota_on(sb, type, format_id, path, remount); + err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; + /* Quotafile not on the same filesystem? */ if (nd.path.mnt->mnt_sb != sb) { path_put(&nd.path); return -EXDEV; } - /* Quotafile not of fs root? */ - if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) - printk(KERN_WARNING - "EXT4-fs: Quota file not on filesystem root. " - "Journalled quota will not work.\n"); + /* Journaling quota? */ + if (EXT4_SB(sb)->s_qf_names[type]) { + /* Quotafile not of fs root? */ + if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) + printk(KERN_WARNING + "EXT4-fs: Quota file not on filesystem root. " + "Journaled quota will not work.\n"); + } + + /* + * When we journal data on quota file, we have to flush journal to see + * all updates to the file when we bypass pagecache... + */ + if (ext4_should_journal_data(nd.path.dentry->d_inode)) { + /* + * We don't need to lock updates but journal_flush() could + * otherwise be livelocked... + */ + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + path_put(&nd.path); return vfs_quota_on(sb, type, format_id, path, remount); } From 2c8be6b222f76c332d9faeb00c047996d340632c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 13 May 2008 21:27:55 -0400 Subject: [PATCH 4/7] ext4: fix typos in messages and comments (journalled -> journaled) Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cddf7f0e0fda..09d9359c8055 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1166,7 +1166,7 @@ set_qf_name: sb_any_quota_suspended(sb)) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT4-fs: Cannot change journalled " + "EXT4-fs: Cannot change journaled " "quota options when quota turned on.\n"); return 0; } @@ -1206,7 +1206,7 @@ clear_qf_name: sb_any_quota_suspended(sb)) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " - "journalled quota options when " + "journaled quota options when " "quota turned on.\n"); return 0; } @@ -1350,14 +1350,14 @@ set_qf_format: } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " + printk(KERN_ERR "EXT4-fs: journaled quota format " "not specified.\n"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journalled quota format " - "specified with no journalling " + printk(KERN_ERR "EXT4-fs: journaled quota format " + "specified with no journaling " "enabled.\n"); return 0; } @@ -1598,7 +1598,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, int ret = ext4_quota_on_mount(sb, i); if (ret < 0) printk(KERN_ERR - "EXT4-fs: Cannot turn on journalled " + "EXT4-fs: Cannot turn on journaled " "quota: error %d\n", ret); } } @@ -3123,7 +3123,7 @@ static int ext4_release_dquot(struct dquot *dquot) static int ext4_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ + /* Are we journaling quotas? */ if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); From 1930479c4b6bbcb6f164a5b3498e0d98329967f4 Mon Sep 17 00:00:00 2001 From: Valerie Clement Date: Tue, 13 May 2008 19:31:14 -0400 Subject: [PATCH 5/7] ext4: mballoc fix mb_normalize_request algorithm for 1KB block size filesystems In case of inode preallocation, the number of blocks to allocate depends on the file size and it is calculated in ext4_mb_normalize_request(). Each group in the filesystem is then checked to find one that can be used for allocation; this is done in ext4_mb_good_group(). When a file bigger than 4MB is created, the requested number of blocks to preallocate, calculated by ext4_mb_normalize_request is 4096. However for a filesystem with 1KB block size, the maximum size of the block buddies used by the multiblock allocator is 2048, so none of groups in the filesystem satisfies the search criteria in ext4_mb_good_group(). Scanning all the filesystem groups impacts performance. This was demonstrated by using a freshly created, 70GB, 1k block filesystem, with caches dropped write before the test via /proc/sys/vm/drop_caches, and with the filesystem mounted with nodelalloc and nodealloc,nomballoc. The time to write an 8 megabyte file using "dd if=/dev/zero of=/mnt/test/fo bs=8k count=1k conv=fsync" took 35.5091 seconds (236kB/s) with nodellaloc, and 0.233754 seconds (35.9 MB/s) with the nodelloc,nomballoc options. With a 1TB partition, it took several minutes to write 8MB! This patch modifies the algorithm in ext4_mb_normalize_group_request to calculate the number of blocks to allocate by taking into account the maximum size of free blocks chunks handled by the multiblock allocator. It has also been tested for filesystems with 2KB and 4KB block sizes to ensure that those cases don't regress. Reviewed-by: Aneesh Kumar K.V Signed-off-by: Valerie Clement Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b128bdc0f55c..1d7fde994521 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2880,12 +2880,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); - /* max available blocks in a free group */ - max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - - EXT4_SB(ac->ac_sb)->s_itb_per_group; + /* max size of free chunks */ + max = 2 << bsbits; -#define NRL_CHECK_SIZE(req, size, max,bits) \ - (req <= (size) || max <= ((size) >> bits)) +#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ + (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ @@ -2904,16 +2903,16 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { + } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> - (20 - bsbits)) << 20; - size = 1024 * 1024; - } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { + (21 - bsbits)) << 21; + size = 2 * 1024 * 1024; + } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, - (8<<20)>>bsbits, max, bsbits)) { + (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; From 519deca0496a4df07d15acf3181ca5d573bffdec Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 15 May 2008 14:43:20 -0400 Subject: [PATCH 6/7] ext4: Retry block allocation if new blocks are allocated from system zone. If the block allocator gets blocks out of system zone ext4 calls ext4_error. But if the file system is mounted with errors=continue retry block allocation. We need to mark the system zone blocks as in use to make sure retry don't pick them again System zone is the block range mapping block bitmap, inode bitmap and inode table. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 17 +++++++++++------ fs/ext4/mballoc.c | 47 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index da994374ec3b..30494c5da843 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -287,11 +287,11 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) (int)block_group, (unsigned long long)bitmap_blk); return NULL; } - if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) { - put_bh(bh); - return NULL; - } - + ext4_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, + * continue with corrupt bitmap + */ return bh; } /* @@ -1770,7 +1770,12 @@ allocated: "Allocating block in system zone - " "blocks from %llu, length %lu", ret_block, num); - goto out; + /* + * claim_block marked the blocks we allocated + * as in use. So we may want to selectively + * mark some of the blocks as free + */ + goto retry_alloc; } performed_allocation = 1; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1d7fde994521..873ad9b3418c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2736,7 +2736,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; - int err; + int err, len; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -2770,14 +2770,27 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + ac->ac_b_ex.fe_start + le32_to_cpu(es->s_first_data_block); - if (block == ext4_block_bitmap(sb, gdp) || - block == ext4_inode_bitmap(sb, gdp) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - + len = ac->ac_b_ex.fe_len; + if (in_range(ext4_block_bitmap(sb, gdp), block, len) || + in_range(ext4_inode_bitmap(sb, gdp), block, len) || + in_range(block, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group) || + in_range(block + len - 1, ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, __func__, "Allocating block in system zone - block = %llu", block); + /* File system mounted not to panic on error + * Fix the bitmap and repeat the block allocation + * We leak some of the blocks here. + */ + mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), + bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + err = ext4_journal_dirty_metadata(handle, bitmap_bh); + if (!err) + err = -EAGAIN; + goto out_err; } #ifdef AGGRESSIVE_CHECK { @@ -4032,7 +4045,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); - repeat: /* allocate space in core */ ext4_mb_regular_allocator(ac); @@ -4046,10 +4058,21 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - ext4_mb_mark_diskspace_used(ac, handle); - *errp = 0; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - ar->len = ac->ac_b_ex.fe_len; + *errp = ext4_mb_mark_diskspace_used(ac, handle); + if (*errp == -EAGAIN) { + ac->ac_b_ex.fe_group = 0; + ac->ac_b_ex.fe_start = 0; + ac->ac_b_ex.fe_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); + } else { + block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); + ar->len = ac->ac_b_ex.fe_len; + } } else { freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) @@ -4236,6 +4259,8 @@ do_more: ext4_error(sb, __func__, "Freeing blocks in system zone - " "Block = %lu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); From 02c471cb17203c748e9bc87003052c1f46e5df69 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Thu, 15 May 2008 14:46:17 -0400 Subject: [PATCH 7/7] jbd2: update transaction t_state to T_COMMIT fix Updating the current transaction's t_state is protected by j_state_lock. We need to do the same when updating the t_state to T_COMMIT. Acked-by: Jan Kara Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" Signed-off-by: Andrew Morton --- fs/jbd2/commit.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e0139786f717..4d99685fdce4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -560,7 +560,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ + spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; + spin_unlock(&journal->j_state_lock); stats.u.run.rs_logging = jiffies; stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,