Many cleanups and bug fixes in ext4, especially for the fast commit
feature. Also some performance improvements; in particular, improving
IOPS and throughput on fast devices running Async Direct I/O by up to
20% by optimizing jbd2_transaction_committed().

-----BEGIN PGP SIGNATURE-----

iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAmaYiqsACgkQ8vlZVpUN
gaOWpQf/d6Y9WGyjeC1jOc+vIBxLgL+X0kbzYkkjGTSIZ7mZJS9X4NMMEtqayJ4f
1zGobcGENc05l4LVxf3uMbDj1aGlHeI9X4GLGaP5s5NcaAl4HKjQ3aFs3MuiJHPj
Ol2CebXJx+NKt1lkD8PSPGgaTb5zg+SeZifI+OZ1RpkcKmGnkSNa5NkUNAaBh6dl
5LLXTc2p9NcCwAwDAQSiAJCV35bAZpcp6fwLLaPQ6Eok9HxGcJuYXW2Fict4rbtV
mXeogXVIo2bkMcfh6tDchDBrFvORYIA7uBVmaG1LgAMrtEnYxnxnEntD0h6j/bzF
Fl4jjQfd8o2uYto/4eo+iY6Z0haxyQ==
=rcOo
-----END PGP SIGNATURE-----

Merge tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Many cleanups and bug fixes in ext4, especially for the fast commit
  feature.

  Also some performance improvements; in particular, improving IOPS and
  throughput on fast devices running Async Direct I/O by up to 20% by
  optimizing jbd2_transaction_committed()"

* tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (40 commits)
  ext4: make sure the first directory block is not a hole
  ext4: check dot and dotdot of dx_root before making dir indexed
  ext4: sanity check for NULL pointer after ext4_force_shutdown
  jbd2: increase maximum transaction size
  jbd2: drop pointless shrinker batch initialization
  jbd2: avoid infinite transaction commit loop
  jbd2: precompute number of transaction descriptor blocks
  jbd2: make jbd2_journal_get_max_txn_bufs() internal
  jbd2: avoid mount failed when commit block is partial submitted
  ext4: avoid writing unitialized memory to disk in EA inodes
  ext4: don't track ranges in fast_commit if inode has inlined data
  ext4: fix possible tid_t sequence overflows
  ext4: use ext4_update_inode_fsync_trans() helper in inode creation
  ext4: add missing MODULE_DESCRIPTION()
  jbd2: add missing MODULE_DESCRIPTION()
  ext4: use memtostr_pad() for s_volume_name
  jbd2: speed up jbd2_transaction_committed()
  ext4: make ext4_da_map_blocks() buffer_head unaware
  ext4: make ext4_insert_delayed_block() insert multi-blocks
  ext4: factor out a helper to check the cluster allocation state
  ...
This commit is contained in:
commit
51ed42a8a1
@ -2184,6 +2184,8 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
|
||||
struct buffer_head *bh, *head;
|
||||
|
||||
bh = head = folio_buffers(folio);
|
||||
if (!bh)
|
||||
return;
|
||||
blocksize = bh->b_size;
|
||||
|
||||
block_start = 0;
|
||||
|
@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
|
||||
{
|
||||
struct ext4_system_zone *new_entry, *entry;
|
||||
struct rb_node **n = &system_blks->root.rb_node, *node;
|
||||
struct rb_node *parent = NULL, *new_node = NULL;
|
||||
struct rb_node *parent = NULL, *new_node;
|
||||
|
||||
while (*n) {
|
||||
parent = *n;
|
||||
|
@ -1347,7 +1347,7 @@ struct ext4_super_block {
|
||||
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
|
||||
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
|
||||
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
|
||||
/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
|
||||
/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
|
||||
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
|
||||
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
|
||||
/*
|
||||
|
@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode,
|
||||
ext4_lblk_t lblk, ext4_lblk_t end,
|
||||
struct extent_status *es)
|
||||
{
|
||||
es->es_lblk = es->es_len = es->es_pblk = 0;
|
||||
|
||||
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|
||||
return;
|
||||
|
||||
@ -2052,34 +2054,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
|
||||
}
|
||||
|
||||
/*
|
||||
* ext4_es_insert_delayed_block - adds a delayed block to the extents status
|
||||
* tree, adding a pending reservation where
|
||||
* needed
|
||||
* ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
|
||||
* status tree, adding a pending reservation
|
||||
* where needed
|
||||
*
|
||||
* @inode - file containing the newly added block
|
||||
* @lblk - logical block to be added
|
||||
* @allocated - indicates whether a physical cluster has been allocated for
|
||||
* the logical cluster that contains the block
|
||||
* @lblk - start logical block to be added
|
||||
* @len - length of blocks to be added
|
||||
* @lclu_allocated/end_allocated - indicates whether a physical cluster has
|
||||
* been allocated for the logical cluster
|
||||
* that contains the start/end block. Note that
|
||||
* end_allocated should always be set to false
|
||||
* if the start and the end block are in the
|
||||
* same cluster
|
||||
*/
|
||||
void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
|
||||
bool allocated)
|
||||
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
|
||||
ext4_lblk_t len, bool lclu_allocated,
|
||||
bool end_allocated)
|
||||
{
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
struct extent_status newes;
|
||||
ext4_lblk_t end = lblk + len - 1;
|
||||
int err1 = 0, err2 = 0, err3 = 0;
|
||||
struct extent_status *es1 = NULL;
|
||||
struct extent_status *es2 = NULL;
|
||||
struct pending_reservation *pr = NULL;
|
||||
struct pending_reservation *pr1 = NULL;
|
||||
struct pending_reservation *pr2 = NULL;
|
||||
|
||||
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|
||||
return;
|
||||
|
||||
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
|
||||
lblk, inode->i_ino);
|
||||
es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
|
||||
lblk, len, inode->i_ino);
|
||||
if (!len)
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
|
||||
end_allocated);
|
||||
|
||||
newes.es_lblk = lblk;
|
||||
newes.es_len = 1;
|
||||
newes.es_len = len;
|
||||
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
|
||||
trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
|
||||
trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
|
||||
end_allocated);
|
||||
|
||||
ext4_es_insert_extent_check(inode, &newes);
|
||||
|
||||
@ -2088,11 +2105,15 @@ retry:
|
||||
es1 = __es_alloc_extent(true);
|
||||
if ((err1 || err2) && !es2)
|
||||
es2 = __es_alloc_extent(true);
|
||||
if ((err1 || err2 || err3) && allocated && !pr)
|
||||
pr = __alloc_pending(true);
|
||||
if (err1 || err2 || err3) {
|
||||
if (lclu_allocated && !pr1)
|
||||
pr1 = __alloc_pending(true);
|
||||
if (end_allocated && !pr2)
|
||||
pr2 = __alloc_pending(true);
|
||||
}
|
||||
write_lock(&EXT4_I(inode)->i_es_lock);
|
||||
|
||||
err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
|
||||
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
|
||||
if (err1 != 0)
|
||||
goto error;
|
||||
/* Free preallocated extent if it didn't get used. */
|
||||
@ -2112,13 +2133,22 @@ retry:
|
||||
es2 = NULL;
|
||||
}
|
||||
|
||||
if (allocated) {
|
||||
err3 = __insert_pending(inode, lblk, &pr);
|
||||
if (lclu_allocated) {
|
||||
err3 = __insert_pending(inode, lblk, &pr1);
|
||||
if (err3 != 0)
|
||||
goto error;
|
||||
if (pr) {
|
||||
__free_pending(pr);
|
||||
pr = NULL;
|
||||
if (pr1) {
|
||||
__free_pending(pr1);
|
||||
pr1 = NULL;
|
||||
}
|
||||
}
|
||||
if (end_allocated) {
|
||||
err3 = __insert_pending(inode, end, &pr2);
|
||||
if (err3 != 0)
|
||||
goto error;
|
||||
if (pr2) {
|
||||
__free_pending(pr2);
|
||||
pr2 = NULL;
|
||||
}
|
||||
}
|
||||
error:
|
||||
|
@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
|
||||
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
|
||||
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
|
||||
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
|
||||
extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
|
||||
bool allocated);
|
||||
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
|
||||
ext4_lblk_t len, bool lclu_allocated,
|
||||
bool end_allocated);
|
||||
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
|
||||
ext4_lblk_t len);
|
||||
extern void ext4_clear_inode_es(struct inode *inode);
|
||||
|
@ -353,7 +353,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
|
||||
read_unlock(&sbi->s_journal->j_state_lock);
|
||||
}
|
||||
spin_lock(&sbi->s_fc_lock);
|
||||
if (sbi->s_fc_ineligible_tid < tid)
|
||||
if (tid_gt(tid, sbi->s_fc_ineligible_tid))
|
||||
sbi->s_fc_ineligible_tid = tid;
|
||||
spin_unlock(&sbi->s_fc_lock);
|
||||
WARN_ON(reason >= EXT4_FC_REASON_MAX);
|
||||
@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
|
||||
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
|
||||
return;
|
||||
|
||||
if (ext4_has_inline_data(inode)) {
|
||||
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
|
||||
handle);
|
||||
return;
|
||||
}
|
||||
|
||||
args.start = start;
|
||||
args.end = end;
|
||||
|
||||
@ -1207,7 +1213,7 @@ restart_fc:
|
||||
if (ret == -EALREADY) {
|
||||
/* There was an ongoing commit, check if we need to restart */
|
||||
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
|
||||
commit_tid > journal->j_commit_sequence)
|
||||
tid_gt(commit_tid, journal->j_commit_sequence))
|
||||
goto restart_fc;
|
||||
ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
|
||||
commit_tid);
|
||||
@ -1282,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
|
||||
list_del_init(&iter->i_fc_list);
|
||||
ext4_clear_inode_state(&iter->vfs_inode,
|
||||
EXT4_STATE_FC_COMMITTING);
|
||||
if (iter->i_sync_tid <= tid)
|
||||
if (tid_geq(tid, iter->i_sync_tid))
|
||||
ext4_fc_reset_inode(&iter->vfs_inode);
|
||||
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
|
||||
smp_mb();
|
||||
@ -1313,7 +1319,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
|
||||
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
|
||||
&sbi->s_fc_q[FC_Q_MAIN]);
|
||||
|
||||
if (tid >= sbi->s_fc_ineligible_tid) {
|
||||
if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
|
||||
sbi->s_fc_ineligible_tid = 0;
|
||||
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
|
||||
}
|
||||
|
@ -1336,10 +1336,7 @@ got:
|
||||
}
|
||||
}
|
||||
|
||||
if (ext4_handle_valid(handle)) {
|
||||
ei->i_sync_tid = handle->h_transaction->t_tid;
|
||||
ei->i_datasync_tid = handle->h_transaction->t_tid;
|
||||
}
|
||||
ext4_update_inode_fsync_trans(handle, inode, 1);
|
||||
|
||||
err = ext4_mark_inode_dirty(handle, inode);
|
||||
if (err) {
|
||||
|
@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
|
||||
hinfo->hash = EXT4_DIRENT_HASH(de);
|
||||
hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
|
||||
} else {
|
||||
ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
|
||||
err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
|
||||
if (err) {
|
||||
ret = err;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if ((hinfo->hash < start_hash) ||
|
||||
((hinfo->hash == start_hash) &&
|
||||
|
@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
|
||||
|
||||
kunit_test_suites(&ext4_inode_test_suite);
|
||||
|
||||
MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
259
fs/ext4/inode.c
259
fs/ext4/inode.c
@ -453,6 +453,35 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
|
||||
}
|
||||
#endif /* ES_AGGRESSIVE_TEST */
|
||||
|
||||
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
|
||||
struct ext4_map_blocks *map)
|
||||
{
|
||||
unsigned int status;
|
||||
int retval;
|
||||
|
||||
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
|
||||
retval = ext4_ext_map_blocks(handle, inode, map, 0);
|
||||
else
|
||||
retval = ext4_ind_map_blocks(handle, inode, map, 0);
|
||||
|
||||
if (retval <= 0)
|
||||
return retval;
|
||||
|
||||
if (unlikely(retval != map->m_len)) {
|
||||
ext4_warning(inode->i_sb,
|
||||
"ES len assertion failed for inode "
|
||||
"%lu: retval %d != map->m_len %d",
|
||||
inode->i_ino, retval, map->m_len);
|
||||
WARN_ON(1);
|
||||
}
|
||||
|
||||
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
|
||||
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
||||
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
|
||||
map->m_pblk, status);
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
|
||||
* The ext4_map_blocks() function tries to look up the requested blocks,
|
||||
* and returns if the blocks are already mapped.
|
||||
@ -1450,9 +1479,9 @@ static int ext4_journalled_write_end(struct file *file,
|
||||
}
|
||||
|
||||
/*
|
||||
* Reserve space for a single cluster
|
||||
* Reserve space for 'nr_resv' clusters
|
||||
*/
|
||||
static int ext4_da_reserve_space(struct inode *inode)
|
||||
static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
|
||||
{
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||
@ -1463,18 +1492,18 @@ static int ext4_da_reserve_space(struct inode *inode)
|
||||
* us from metadata over-estimation, though we may go over by
|
||||
* a small amount in the end. Here we just reserve for data.
|
||||
*/
|
||||
ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
|
||||
ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
spin_lock(&ei->i_block_reservation_lock);
|
||||
if (ext4_claim_free_clusters(sbi, 1, 0)) {
|
||||
if (ext4_claim_free_clusters(sbi, nr_resv, 0)) {
|
||||
spin_unlock(&ei->i_block_reservation_lock);
|
||||
dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
|
||||
dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
|
||||
return -ENOSPC;
|
||||
}
|
||||
ei->i_reserved_data_blocks++;
|
||||
trace_ext4_da_reserve_space(inode);
|
||||
ei->i_reserved_data_blocks += nr_resv;
|
||||
trace_ext4_da_reserve_space(inode, nr_resv);
|
||||
spin_unlock(&ei->i_block_reservation_lock);
|
||||
|
||||
return 0; /* success */
|
||||
@ -1621,24 +1650,58 @@ static void ext4_print_free_blocks(struct inode *inode)
|
||||
}
|
||||
|
||||
/*
|
||||
* ext4_insert_delayed_block - adds a delayed block to the extents status
|
||||
* tree, incrementing the reserved cluster/block
|
||||
* count or making a pending reservation
|
||||
* where needed
|
||||
* Check whether the cluster containing lblk has been allocated or has
|
||||
* delalloc reservation.
|
||||
*
|
||||
* @inode - file containing the newly added block
|
||||
* @lblk - logical block to be added
|
||||
*
|
||||
* Returns 0 on success, negative error code on failure.
|
||||
* Returns 0 if the cluster doesn't have either, 1 if it has delalloc
|
||||
* reservation, 2 if it's already been allocated, negative error code on
|
||||
* failure.
|
||||
*/
|
||||
static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
|
||||
static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
|
||||
{
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
int ret;
|
||||
bool allocated = false;
|
||||
|
||||
/* Has delalloc reservation? */
|
||||
if (ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk))
|
||||
return 1;
|
||||
|
||||
/* Already been allocated? */
|
||||
if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
|
||||
return 2;
|
||||
ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret > 0)
|
||||
return 2;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents
|
||||
* status tree, incrementing the reserved
|
||||
* cluster/block count or making pending
|
||||
* reservations where needed
|
||||
*
|
||||
* @inode - file containing the newly added block
|
||||
* @lblk - start logical block to be added
|
||||
* @len - length of blocks to be added
|
||||
*
|
||||
* Returns 0 on success, negative error code on failure.
|
||||
*/
|
||||
static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
|
||||
ext4_lblk_t len)
|
||||
{
|
||||
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
|
||||
int ret;
|
||||
bool lclu_allocated = false;
|
||||
bool end_allocated = false;
|
||||
ext4_lblk_t resv_clu;
|
||||
ext4_lblk_t end = lblk + len - 1;
|
||||
|
||||
/*
|
||||
* If the cluster containing lblk is shared with a delayed,
|
||||
* If the cluster containing lblk or end is shared with a delayed,
|
||||
* written, or unwritten extent in a bigalloc file system, it's
|
||||
* already been accounted for and does not need to be reserved.
|
||||
* A pending reservation must be made for the cluster if it's
|
||||
@ -1649,81 +1712,84 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
|
||||
* extents status tree doesn't get a match.
|
||||
*/
|
||||
if (sbi->s_cluster_ratio == 1) {
|
||||
ret = ext4_da_reserve_space(inode);
|
||||
ret = ext4_da_reserve_space(inode, len);
|
||||
if (ret != 0) /* ENOSPC */
|
||||
return ret;
|
||||
} else { /* bigalloc */
|
||||
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
|
||||
if (!ext4_es_scan_clu(inode,
|
||||
&ext4_es_is_mapped, lblk)) {
|
||||
ret = ext4_clu_mapped(inode,
|
||||
EXT4_B2C(sbi, lblk));
|
||||
resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1;
|
||||
|
||||
ret = ext4_clu_alloc_state(inode, lblk);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0) {
|
||||
ret = ext4_da_reserve_space(inode);
|
||||
if (ret != 0) /* ENOSPC */
|
||||
if (ret > 0) {
|
||||
resv_clu--;
|
||||
lclu_allocated = (ret == 2);
|
||||
}
|
||||
|
||||
if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) {
|
||||
ret = ext4_clu_alloc_state(inode, end);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
} else {
|
||||
allocated = true;
|
||||
}
|
||||
} else {
|
||||
allocated = true;
|
||||
}
|
||||
if (ret > 0) {
|
||||
resv_clu--;
|
||||
end_allocated = (ret == 2);
|
||||
}
|
||||
}
|
||||
|
||||
ext4_es_insert_delayed_block(inode, lblk, allocated);
|
||||
if (resv_clu) {
|
||||
ret = ext4_da_reserve_space(inode, resv_clu);
|
||||
if (ret != 0) /* ENOSPC */
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated,
|
||||
end_allocated);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is grabs code from the very beginning of
|
||||
* ext4_map_blocks, but assumes that the caller is from delayed write
|
||||
* time. This function looks up the requested blocks and sets the
|
||||
* buffer delay bit under the protection of i_data_sem.
|
||||
* Looks up the requested blocks and sets the delalloc extent map.
|
||||
* First try to look up for the extent entry that contains the requested
|
||||
* blocks in the extent status tree without i_data_sem, then try to look
|
||||
* up for the ondisk extent mapping with i_data_sem in read mode,
|
||||
* finally hold i_data_sem in write mode, looks up again and add a
|
||||
* delalloc extent entry if it still couldn't find any extent. Pass out
|
||||
* the mapped extent through @map and return 0 on success.
|
||||
*/
|
||||
static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
|
||||
struct ext4_map_blocks *map,
|
||||
struct buffer_head *bh)
|
||||
static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
|
||||
{
|
||||
struct extent_status es;
|
||||
int retval;
|
||||
sector_t invalid_block = ~((sector_t) 0xffff);
|
||||
#ifdef ES_AGGRESSIVE_TEST
|
||||
struct ext4_map_blocks orig_map;
|
||||
|
||||
memcpy(&orig_map, map, sizeof(*map));
|
||||
#endif
|
||||
|
||||
if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
|
||||
invalid_block = ~0;
|
||||
|
||||
map->m_flags = 0;
|
||||
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
|
||||
(unsigned long) map->m_lblk);
|
||||
|
||||
/* Lookup extent status tree firstly */
|
||||
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
|
||||
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
|
||||
map->m_len = min_t(unsigned int, map->m_len,
|
||||
es.es_len - (map->m_lblk - es.es_lblk));
|
||||
|
||||
if (ext4_es_is_hole(&es))
|
||||
goto add_delayed;
|
||||
|
||||
found:
|
||||
/*
|
||||
* Delayed extent could be allocated by fallocate.
|
||||
* So we need to check it.
|
||||
*/
|
||||
if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
|
||||
map_bh(bh, inode->i_sb, invalid_block);
|
||||
set_buffer_new(bh);
|
||||
set_buffer_delay(bh);
|
||||
if (ext4_es_is_delonly(&es)) {
|
||||
map->m_flags |= EXT4_MAP_DELAYED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
|
||||
retval = es.es_len - (iblock - es.es_lblk);
|
||||
if (retval > map->m_len)
|
||||
retval = map->m_len;
|
||||
map->m_len = retval;
|
||||
map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
|
||||
if (ext4_es_is_written(&es))
|
||||
map->m_flags |= EXT4_MAP_MAPPED;
|
||||
else if (ext4_es_is_unwritten(&es))
|
||||
@ -1734,7 +1800,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
|
||||
#ifdef ES_AGGRESSIVE_TEST
|
||||
ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
|
||||
#endif
|
||||
return retval;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1744,44 +1810,41 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
|
||||
down_read(&EXT4_I(inode)->i_data_sem);
|
||||
if (ext4_has_inline_data(inode))
|
||||
retval = 0;
|
||||
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
|
||||
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
|
||||
else
|
||||
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
|
||||
if (retval < 0) {
|
||||
up_read(&EXT4_I(inode)->i_data_sem);
|
||||
return retval;
|
||||
}
|
||||
if (retval > 0) {
|
||||
unsigned int status;
|
||||
|
||||
if (unlikely(retval != map->m_len)) {
|
||||
ext4_warning(inode->i_sb,
|
||||
"ES len assertion failed for inode "
|
||||
"%lu: retval %d != map->m_len %d",
|
||||
inode->i_ino, retval, map->m_len);
|
||||
WARN_ON(1);
|
||||
}
|
||||
|
||||
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
|
||||
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
|
||||
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
|
||||
map->m_pblk, status);
|
||||
up_read(&EXT4_I(inode)->i_data_sem);
|
||||
return retval;
|
||||
}
|
||||
retval = ext4_map_query_blocks(NULL, inode, map);
|
||||
up_read(&EXT4_I(inode)->i_data_sem);
|
||||
if (retval)
|
||||
return retval < 0 ? retval : 0;
|
||||
|
||||
add_delayed:
|
||||
down_write(&EXT4_I(inode)->i_data_sem);
|
||||
retval = ext4_insert_delayed_block(inode, map->m_lblk);
|
||||
up_write(&EXT4_I(inode)->i_data_sem);
|
||||
if (retval)
|
||||
return retval;
|
||||
/*
|
||||
* Page fault path (ext4_page_mkwrite does not take i_rwsem)
|
||||
* and fallocate path (no folio lock) can race. Make sure we
|
||||
* lookup the extent status tree here again while i_data_sem
|
||||
* is held in write mode, before inserting a new da entry in
|
||||
* the extent status tree.
|
||||
*/
|
||||
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
|
||||
map->m_len = min_t(unsigned int, map->m_len,
|
||||
es.es_len - (map->m_lblk - es.es_lblk));
|
||||
|
||||
if (!ext4_es_is_hole(&es)) {
|
||||
up_write(&EXT4_I(inode)->i_data_sem);
|
||||
goto found;
|
||||
}
|
||||
} else if (!ext4_has_inline_data(inode)) {
|
||||
retval = ext4_map_query_blocks(NULL, inode, map);
|
||||
if (retval) {
|
||||
up_write(&EXT4_I(inode)->i_data_sem);
|
||||
return retval < 0 ? retval : 0;
|
||||
}
|
||||
}
|
||||
|
||||
map->m_flags |= EXT4_MAP_DELAYED;
|
||||
retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
|
||||
up_write(&EXT4_I(inode)->i_data_sem);
|
||||
|
||||
map_bh(bh, inode->i_sb, invalid_block);
|
||||
set_buffer_new(bh);
|
||||
set_buffer_delay(bh);
|
||||
return retval;
|
||||
}
|
||||
|
||||
@ -1801,11 +1864,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh, int create)
|
||||
{
|
||||
struct ext4_map_blocks map;
|
||||
sector_t invalid_block = ~((sector_t) 0xffff);
|
||||
int ret = 0;
|
||||
|
||||
BUG_ON(create == 0);
|
||||
BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
|
||||
|
||||
if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
|
||||
invalid_block = ~0;
|
||||
|
||||
map.m_lblk = iblock;
|
||||
map.m_len = 1;
|
||||
|
||||
@ -1814,10 +1881,17 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||
* preallocated blocks are unmapped but should treated
|
||||
* the same as allocated blocks.
|
||||
*/
|
||||
ret = ext4_da_map_blocks(inode, iblock, &map, bh);
|
||||
if (ret <= 0)
|
||||
ret = ext4_da_map_blocks(inode, &map);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (map.m_flags & EXT4_MAP_DELAYED) {
|
||||
map_bh(bh, inode->i_sb, invalid_block);
|
||||
set_buffer_new(bh);
|
||||
set_buffer_delay(bh);
|
||||
return 0;
|
||||
}
|
||||
|
||||
map_bh(bh, inode->i_sb, map.m_pblk);
|
||||
ext4_update_bh_state(bh, map.m_flags);
|
||||
|
||||
@ -2945,6 +3019,11 @@ static int ext4_da_do_write_end(struct address_space *mapping,
|
||||
bool disksize_changed = false;
|
||||
loff_t new_i_size;
|
||||
|
||||
if (unlikely(!folio_buffers(folio))) {
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
return -EIO;
|
||||
}
|
||||
/*
|
||||
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
|
||||
* flag, which all that's needed to trigger page writeback.
|
||||
|
@ -1151,7 +1151,7 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
|
||||
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
|
||||
|
||||
lock_buffer(sbi->s_sbh);
|
||||
strscpy_pad(label, sbi->s_es->s_volume_name);
|
||||
memtostr_pad(label, sbi->s_es->s_volume_name);
|
||||
unlock_buffer(sbi->s_sbh);
|
||||
|
||||
if (copy_to_user(user_label, label, sizeof(label)))
|
||||
|
@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
|
||||
|
||||
return bh;
|
||||
}
|
||||
if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
|
||||
/* The first directory block must not be a hole. */
|
||||
if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
|
||||
ext4_error_inode(inode, func, line, block,
|
||||
"Directory hole found for htree %s block",
|
||||
(type == INDEX) ? "index" : "leaf");
|
||||
"Directory hole found for htree %s block %u",
|
||||
(type == INDEX) ? "index" : "leaf", block);
|
||||
return ERR_PTR(-EFSCORRUPTED);
|
||||
}
|
||||
if (!bh)
|
||||
@ -2172,6 +2173,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
|
||||
return err ? err : err2;
|
||||
}
|
||||
|
||||
static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
|
||||
{
|
||||
struct fake_dirent *fde;
|
||||
const char *error_msg;
|
||||
unsigned int rlen;
|
||||
unsigned int blocksize = dir->i_sb->s_blocksize;
|
||||
char *blockend = (char *)root + dir->i_sb->s_blocksize;
|
||||
|
||||
fde = &root->dot;
|
||||
if (unlikely(fde->name_len != 1)) {
|
||||
error_msg = "invalid name_len for '.'";
|
||||
goto corrupted;
|
||||
}
|
||||
if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
|
||||
error_msg = "invalid name for '.'";
|
||||
goto corrupted;
|
||||
}
|
||||
rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
|
||||
if (unlikely((char *)fde + rlen >= blockend)) {
|
||||
error_msg = "invalid rec_len for '.'";
|
||||
goto corrupted;
|
||||
}
|
||||
|
||||
fde = &root->dotdot;
|
||||
if (unlikely(fde->name_len != 2)) {
|
||||
error_msg = "invalid name_len for '..'";
|
||||
goto corrupted;
|
||||
}
|
||||
if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
|
||||
error_msg = "invalid name for '..'";
|
||||
goto corrupted;
|
||||
}
|
||||
rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
|
||||
if (unlikely((char *)fde + rlen >= blockend)) {
|
||||
error_msg = "invalid rec_len for '..'";
|
||||
goto corrupted;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
corrupted:
|
||||
EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
|
||||
error_msg);
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* This converts a one block unindexed directory to a 3 block indexed
|
||||
* directory, and adds the dentry to the indexed directory.
|
||||
@ -2206,17 +2253,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
|
||||
brelse(bh);
|
||||
return retval;
|
||||
}
|
||||
|
||||
root = (struct dx_root *) bh->b_data;
|
||||
if (!ext4_check_dx_root(dir, root)) {
|
||||
brelse(bh);
|
||||
return -EFSCORRUPTED;
|
||||
}
|
||||
|
||||
/* The 0th block becomes the root, move the dirents out */
|
||||
fde = &root->dotdot;
|
||||
de = (struct ext4_dir_entry_2 *)((char *)fde +
|
||||
ext4_rec_len_from_disk(fde->rec_len, blocksize));
|
||||
if ((char *) de >= (((char *) root) + blocksize)) {
|
||||
EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
|
||||
brelse(bh);
|
||||
return -EFSCORRUPTED;
|
||||
}
|
||||
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
|
||||
|
||||
/* Allocate new block for the 0th block's dirents */
|
||||
@ -3038,10 +3085,7 @@ bool ext4_empty_dir(struct inode *inode)
|
||||
EXT4_ERROR_INODE(inode, "invalid size");
|
||||
return false;
|
||||
}
|
||||
/* The first directory block must not be a hole,
|
||||
* so treat it as DIRENT_HTREE
|
||||
*/
|
||||
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
|
||||
bh = ext4_read_dirblock(inode, 0, EITHER);
|
||||
if (IS_ERR(bh))
|
||||
return false;
|
||||
|
||||
@ -3483,10 +3527,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
|
||||
struct ext4_dir_entry_2 *de;
|
||||
unsigned int offset;
|
||||
|
||||
/* The first directory block must not be a hole, so
|
||||
* treat it as DIRENT_HTREE
|
||||
*/
|
||||
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
|
||||
bh = ext4_read_dirblock(inode, 0, EITHER);
|
||||
if (IS_ERR(bh)) {
|
||||
*retval = PTR_ERR(bh);
|
||||
return NULL;
|
||||
|
@ -1327,6 +1327,9 @@ static void ext4_put_super(struct super_block *sb)
|
||||
|
||||
ext4_group_desc_free(sbi);
|
||||
ext4_flex_groups_free(sbi);
|
||||
|
||||
WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
|
||||
percpu_counter_sum(&sbi->s_dirtyclusters_counter));
|
||||
ext4_percpu_param_destroy(sbi);
|
||||
#ifdef CONFIG_QUOTA
|
||||
for (int i = 0; i < EXT4_MAXQUOTAS; i++)
|
||||
@ -1457,7 +1460,8 @@ static void ext4_destroy_inode(struct inode *inode)
|
||||
dump_stack();
|
||||
}
|
||||
|
||||
if (EXT4_I(inode)->i_reserved_data_blocks)
|
||||
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
|
||||
WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
|
||||
ext4_msg(inode->i_sb, KERN_ERR,
|
||||
"Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
|
||||
inode->i_ino, EXT4_I(inode),
|
||||
|
@ -1433,6 +1433,12 @@ retry:
|
||||
goto out;
|
||||
|
||||
memcpy(bh->b_data, buf, csize);
|
||||
/*
|
||||
* Zero out block tail to avoid writing uninitialized memory
|
||||
* to disk.
|
||||
*/
|
||||
if (csize < blocksize)
|
||||
memset(bh->b_data + csize, 0, blocksize - csize);
|
||||
set_buffer_uptodate(bh);
|
||||
ext4_handle_dirty_metadata(handle, ea_inode, bh);
|
||||
|
||||
|
@ -353,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
||||
struct buffer_head *descriptor;
|
||||
struct buffer_head **wbuf = journal->j_wbuf;
|
||||
int bufs;
|
||||
int flags;
|
||||
int escape;
|
||||
int err;
|
||||
unsigned long long blocknr;
|
||||
ktime_t start_time;
|
||||
@ -660,10 +660,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
||||
*/
|
||||
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
|
||||
JBUFFER_TRACE(jh, "ph3: write metadata");
|
||||
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
|
||||
escape = jbd2_journal_write_metadata_buffer(commit_transaction,
|
||||
jh, &wbuf[bufs], blocknr);
|
||||
if (flags < 0) {
|
||||
jbd2_journal_abort(journal, flags);
|
||||
if (escape < 0) {
|
||||
jbd2_journal_abort(journal, escape);
|
||||
continue;
|
||||
}
|
||||
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
|
||||
@ -672,7 +672,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
|
||||
buffer */
|
||||
|
||||
tag_flag = 0;
|
||||
if (flags & 1)
|
||||
if (escape)
|
||||
tag_flag |= JBD2_FLAG_ESCAPE;
|
||||
if (!first_tag)
|
||||
tag_flag |= JBD2_FLAG_SAME_UUID;
|
||||
@ -766,7 +766,7 @@ start_journal_io:
|
||||
if (first_block < journal->j_tail)
|
||||
freed += journal->j_last - journal->j_first;
|
||||
/* Update tail only if we free significant amount of space */
|
||||
if (freed < jbd2_journal_get_max_txn_bufs(journal))
|
||||
if (freed < journal->j_max_transaction_buffers)
|
||||
update_tail = 0;
|
||||
}
|
||||
J_ASSERT(commit_transaction->t_state == T_COMMIT);
|
||||
@ -1107,7 +1107,7 @@ restart_loop:
|
||||
|
||||
commit_transaction->t_state = T_COMMIT_CALLBACK;
|
||||
J_ASSERT(commit_transaction == journal->j_committing_transaction);
|
||||
journal->j_commit_sequence = commit_transaction->t_tid;
|
||||
WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
|
||||
journal->j_committing_transaction = NULL;
|
||||
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
|
||||
|
||||
|
@ -220,19 +220,12 @@ loop:
|
||||
* so we don't sleep
|
||||
*/
|
||||
DEFINE_WAIT(wait);
|
||||
int should_sleep = 1;
|
||||
|
||||
prepare_to_wait(&journal->j_wait_commit, &wait,
|
||||
TASK_INTERRUPTIBLE);
|
||||
if (journal->j_commit_sequence != journal->j_commit_request)
|
||||
should_sleep = 0;
|
||||
transaction = journal->j_running_transaction;
|
||||
if (transaction && time_after_eq(jiffies,
|
||||
transaction->t_expires))
|
||||
should_sleep = 0;
|
||||
if (journal->j_flags & JBD2_UNMOUNT)
|
||||
should_sleep = 0;
|
||||
if (should_sleep) {
|
||||
if (transaction == NULL ||
|
||||
time_before(jiffies, transaction->t_expires)) {
|
||||
write_unlock(&journal->j_state_lock);
|
||||
schedule();
|
||||
write_lock(&journal->j_state_lock);
|
||||
@ -316,11 +309,8 @@ static void journal_kill_thread(journal_t *journal)
|
||||
*
|
||||
* Return value:
|
||||
* <0: Error
|
||||
* >=0: Finished OK
|
||||
*
|
||||
* On success:
|
||||
* Bit 0 set == escape performed on the data
|
||||
* Bit 1 set == buffer copy-out performed (kfree the data after IO)
|
||||
* =0: Finished OK without escape
|
||||
* =1: Finished OK with escape
|
||||
*/
|
||||
|
||||
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
|
||||
@ -328,7 +318,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
|
||||
struct buffer_head **bh_out,
|
||||
sector_t blocknr)
|
||||
{
|
||||
int need_copy_out = 0;
|
||||
int done_copy_out = 0;
|
||||
int do_escape = 0;
|
||||
char *mapped_data;
|
||||
@ -355,7 +344,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
|
||||
atomic_set(&new_bh->b_count, 1);
|
||||
|
||||
spin_lock(&jh_in->b_state_lock);
|
||||
repeat:
|
||||
/*
|
||||
* If a new transaction has already done a buffer copy-out, then
|
||||
* we use that version of the data for the commit.
|
||||
@ -365,8 +353,8 @@ repeat:
|
||||
new_folio = virt_to_folio(jh_in->b_frozen_data);
|
||||
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
|
||||
} else {
|
||||
new_folio = jh2bh(jh_in)->b_folio;
|
||||
new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
|
||||
new_folio = bh_in->b_folio;
|
||||
new_offset = offset_in_folio(new_folio, bh_in->b_data);
|
||||
}
|
||||
|
||||
mapped_data = kmap_local_folio(new_folio, new_offset);
|
||||
@ -383,54 +371,52 @@ repeat:
|
||||
/*
|
||||
* Check for escaping
|
||||
*/
|
||||
if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
|
||||
need_copy_out = 1;
|
||||
if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
|
||||
do_escape = 1;
|
||||
}
|
||||
kunmap_local(mapped_data);
|
||||
|
||||
/*
|
||||
* Do we need to do a data copy?
|
||||
*/
|
||||
if (need_copy_out && !done_copy_out) {
|
||||
if (do_escape && !done_copy_out) {
|
||||
char *tmp;
|
||||
|
||||
spin_unlock(&jh_in->b_state_lock);
|
||||
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
|
||||
if (!tmp) {
|
||||
brelse(new_bh);
|
||||
free_buffer_head(new_bh);
|
||||
return -ENOMEM;
|
||||
}
|
||||
spin_lock(&jh_in->b_state_lock);
|
||||
if (jh_in->b_frozen_data) {
|
||||
jbd2_free(tmp, bh_in->b_size);
|
||||
goto repeat;
|
||||
goto copy_done;
|
||||
}
|
||||
|
||||
jh_in->b_frozen_data = tmp;
|
||||
memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
|
||||
|
||||
new_folio = virt_to_folio(tmp);
|
||||
new_offset = offset_in_folio(new_folio, tmp);
|
||||
done_copy_out = 1;
|
||||
|
||||
/*
|
||||
* This isn't strictly necessary, as we're using frozen
|
||||
* data for the escaping, but it keeps consistency with
|
||||
* b_frozen_data usage.
|
||||
*/
|
||||
jh_in->b_frozen_triggers = jh_in->b_triggers;
|
||||
|
||||
copy_done:
|
||||
new_folio = virt_to_folio(jh_in->b_frozen_data);
|
||||
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
|
||||
done_copy_out = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Did we need to do an escaping? Now we've done all the
|
||||
* copying, we can finally do so.
|
||||
* b_frozen_data is from jbd2_alloc() which always provides an
|
||||
* address from the kernel's direct mapping.
|
||||
*/
|
||||
if (do_escape) {
|
||||
mapped_data = kmap_local_folio(new_folio, new_offset);
|
||||
*((unsigned int *)mapped_data) = 0;
|
||||
kunmap_local(mapped_data);
|
||||
}
|
||||
if (do_escape)
|
||||
*((unsigned int *)jh_in->b_frozen_data) = 0;
|
||||
|
||||
folio_set_bh(new_bh, new_folio, new_offset);
|
||||
new_bh->b_size = bh_in->b_size;
|
||||
@ -454,7 +440,7 @@ repeat:
|
||||
set_buffer_shadow(bh_in);
|
||||
spin_unlock(&jh_in->b_state_lock);
|
||||
|
||||
return do_escape | (done_copy_out << 1);
|
||||
return do_escape;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -789,17 +775,7 @@ EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
|
||||
/* Return 1 when transaction with given tid has already committed. */
|
||||
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
|
||||
{
|
||||
int ret = 1;
|
||||
|
||||
read_lock(&journal->j_state_lock);
|
||||
if (journal->j_running_transaction &&
|
||||
journal->j_running_transaction->t_tid == tid)
|
||||
ret = 0;
|
||||
if (journal->j_committing_transaction &&
|
||||
journal->j_committing_transaction->t_tid == tid)
|
||||
ret = 0;
|
||||
read_unlock(&journal->j_state_lock);
|
||||
return ret;
|
||||
return tid_geq(READ_ONCE(journal->j_commit_sequence), tid);
|
||||
}
|
||||
EXPORT_SYMBOL(jbd2_transaction_committed);
|
||||
|
||||
@ -1451,6 +1427,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
|
||||
return space / record_size;
|
||||
}
|
||||
|
||||
/*
 * Compute the value stored in journal->j_max_transaction_buffers: the
 * journal length minus j_fc_wbufsize, divided by three.
 */
static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
|
||||
{
|
||||
return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
|
||||
}
|
||||
|
||||
/*
|
||||
* Base amount of descriptor blocks we reserve for each transaction.
|
||||
*/
|
||||
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
|
||||
{
|
||||
/* Start from the block size with the journal header taken out */
int tag_space = journal->j_blocksize - sizeof(journal_header_t);
|
||||
int tags_per_block;
|
||||
|
||||
/* Subtract UUID */
|
||||
tag_space -= 16;
|
||||
/* With v2/v3 checksums the block tail also takes space from the tags */
if (jbd2_journal_has_csum_v2or3(journal))
|
||||
tag_space -= sizeof(struct jbd2_journal_block_tail);
|
||||
/* Commit code leaves a slack space of 16 bytes at the end of block */
|
||||
tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
|
||||
/*
|
||||
* Revoke descriptors are accounted separately so we need to reserve
|
||||
* space for commit block and normal transaction descriptor blocks.
|
||||
*/
|
||||
/* 1 for the commit block + enough descriptor blocks to tag max_txn_bufs */
return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
|
||||
tags_per_block);
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize number of blocks each transaction reserves for its bookkeeping
|
||||
* and maximum number of blocks a transaction can use. This needs to be called
|
||||
* after the journal size and the fastcommit area size are initialized.
|
||||
*/
|
||||
static void jbd2_journal_init_transaction_limits(journal_t *journal)
|
||||
{
|
||||
/* Cache the per-block revoke record count */
journal->j_revoke_records_per_block =
|
||||
journal_revoke_records_per_block(journal);
|
||||
/* Cache the per-transaction bookkeeping (descriptor + commit) blocks */
journal->j_transaction_overhead_buffers =
|
||||
jbd2_descriptor_blocks_per_trans(journal);
|
||||
/* Cache the maximum number of buffers a transaction may hold */
journal->j_max_transaction_buffers =
|
||||
jbd2_journal_get_max_txn_bufs(journal);
|
||||
}
|
||||
|
||||
/*
|
||||
* Load the on-disk journal superblock and read the key fields into the
|
||||
* journal_t.
|
||||
@ -1492,8 +1510,8 @@ static int journal_load_superblock(journal_t *journal)
|
||||
if (jbd2_journal_has_csum_v2or3(journal))
|
||||
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
|
||||
sizeof(sb->s_uuid));
|
||||
journal->j_revoke_records_per_block =
|
||||
journal_revoke_records_per_block(journal);
|
||||
/* After journal features are set, we can compute transaction limits */
|
||||
jbd2_journal_init_transaction_limits(journal);
|
||||
|
||||
if (jbd2_has_feature_fast_commit(journal)) {
|
||||
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
|
||||
@ -1599,7 +1617,6 @@ static journal_t *journal_init_common(struct block_device *bdev,
|
||||
|
||||
journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
|
||||
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
|
||||
journal->j_shrinker->batch = journal->j_max_transaction_buffers;
|
||||
journal->j_shrinker->private_data = journal;
|
||||
|
||||
shrinker_register(journal->j_shrinker);
|
||||
@ -1743,8 +1760,6 @@ static int journal_reset(journal_t *journal)
|
||||
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
|
||||
journal->j_commit_request = journal->j_commit_sequence;
|
||||
|
||||
journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
|
||||
|
||||
/*
|
||||
* Now that journal recovery is done, turn fast commits off here. This
|
||||
* way, if fast commit was enabled before the crash but if now FS has
|
||||
@ -2285,8 +2300,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
|
||||
journal->j_fc_first = journal->j_last + 1;
|
||||
journal->j_fc_off = 0;
|
||||
journal->j_free = journal->j_last - journal->j_first;
|
||||
journal->j_max_transaction_buffers =
|
||||
jbd2_journal_get_max_txn_bufs(journal);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2374,8 +2387,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
|
||||
sb->s_feature_ro_compat |= cpu_to_be32(ro);
|
||||
sb->s_feature_incompat |= cpu_to_be32(incompat);
|
||||
unlock_buffer(journal->j_sb_buffer);
|
||||
journal->j_revoke_records_per_block =
|
||||
journal_revoke_records_per_block(journal);
|
||||
jbd2_journal_init_transaction_limits(journal);
|
||||
|
||||
return 1;
|
||||
#undef COMPAT_FEATURE_ON
|
||||
@ -2406,8 +2418,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
|
||||
sb->s_feature_compat &= ~cpu_to_be32(compat);
|
||||
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
|
||||
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
|
||||
journal->j_revoke_records_per_block =
|
||||
journal_revoke_records_per_block(journal);
|
||||
jbd2_journal_init_transaction_limits(journal);
|
||||
}
|
||||
EXPORT_SYMBOL(jbd2_journal_clear_features);
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include <linux/errno.h>
|
||||
#include <linux/crc32.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/string_choices.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -374,7 +375,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
|
||||
be32_to_cpu(journal->j_superblock->s_sequence);
|
||||
jbd2_debug(1,
|
||||
"JBD2: ignoring %d transaction%s from the journal.\n",
|
||||
dropped, (dropped == 1) ? "" : "s");
|
||||
dropped, str_plural(dropped));
|
||||
#endif
|
||||
journal->j_transaction_sequence = ++info.end_transaction;
|
||||
journal->j_head = info.head_block;
|
||||
@ -443,6 +444,27 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
|
||||
return provided == cpu_to_be32(calculated);
|
||||
}
|
||||
|
||||
/*
 * Verify a commit block checksum as if only the commit header had been
 * written: the header from @buf is copied into a zero-filled buffer of
 * j_blocksize bytes and the checksum is computed over that buffer with
 * h_chksum[0] cleared.
 *
 * Returns true when the stored checksum matches the computed one, false on
 * mismatch or when the temporary buffer cannot be allocated.
 */
static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
|
||||
{
|
||||
struct commit_header *h;
|
||||
__be32 provided;
|
||||
__u32 calculated;
|
||||
void *tmpbuf;
|
||||
|
||||
/* Zeroed scratch block; everything past the header stays zero */
tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL);
|
||||
if (!tmpbuf)
|
||||
return false;
|
||||
|
||||
/* Only the commit header is taken from the on-disk block */
memcpy(tmpbuf, buf, sizeof(struct commit_header));
|
||||
h = tmpbuf;
|
||||
/* Save the stored checksum, then clear the field before checksumming */
provided = h->h_chksum[0];
|
||||
h->h_chksum[0] = 0;
|
||||
calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
|
||||
kfree(tmpbuf);
|
||||
|
||||
return provided == cpu_to_be32(calculated);
|
||||
}
|
||||
|
||||
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
|
||||
journal_block_tag3_t *tag3,
|
||||
void *buf, __u32 sequence)
|
||||
@ -810,6 +832,13 @@ static int do_one_pass(journal_t *journal,
|
||||
if (pass == PASS_SCAN &&
|
||||
!jbd2_commit_block_csum_verify(journal,
|
||||
bh->b_data)) {
|
||||
if (jbd2_commit_block_csum_verify_partial(
|
||||
journal,
|
||||
bh->b_data)) {
|
||||
pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
|
||||
next_commit_ID, next_log_block);
|
||||
goto chksum_ok;
|
||||
}
|
||||
chksum_error:
|
||||
if (commit_time < last_trans_commit_time)
|
||||
goto ignore_crc_mismatch;
|
||||
@ -824,6 +853,7 @@ static int do_one_pass(journal_t *journal,
|
||||
}
|
||||
}
|
||||
if (pass == PASS_SCAN) {
|
||||
chksum_ok:
|
||||
last_trans_commit_time = commit_time;
|
||||
head_block = next_log_block;
|
||||
}
|
||||
@ -843,6 +873,7 @@ static int do_one_pass(journal_t *journal,
|
||||
next_log_block);
|
||||
need_check_commit_time = true;
|
||||
}
|
||||
|
||||
/* If we aren't in the REVOKE pass, then we can
|
||||
* just skip over this block. */
|
||||
if (pass != PASS_REVOKE) {
|
||||
|
@ -62,28 +62,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
|
||||
kmem_cache_free(transaction_cache, transaction);
|
||||
}
|
||||
|
||||
/*
|
||||
* Base amount of descriptor blocks we reserve for each transaction.
|
||||
*/
|
||||
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
|
||||
{
|
||||
int tag_space = journal->j_blocksize - sizeof(journal_header_t);
|
||||
int tags_per_block;
|
||||
|
||||
/* Subtract UUID */
|
||||
tag_space -= 16;
|
||||
if (jbd2_journal_has_csum_v2or3(journal))
|
||||
tag_space -= sizeof(struct jbd2_journal_block_tail);
|
||||
/* Commit code leaves a slack space of 16 bytes at the end of block */
|
||||
tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
|
||||
/*
|
||||
* Revoke descriptors are accounted separately so we need to reserve
|
||||
* space for commit block and normal transaction descriptor blocks.
|
||||
*/
|
||||
return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
|
||||
tags_per_block);
|
||||
}
|
||||
|
||||
/*
|
||||
* jbd2_get_transaction: obtain a new transaction_t object.
|
||||
*
|
||||
@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
|
||||
transaction->t_expires = jiffies + journal->j_commit_interval;
|
||||
atomic_set(&transaction->t_updates, 0);
|
||||
atomic_set(&transaction->t_outstanding_credits,
|
||||
jbd2_descriptor_blocks_per_trans(journal) +
|
||||
journal->j_transaction_overhead_buffers +
|
||||
atomic_read(&journal->j_reserved_credits));
|
||||
atomic_set(&transaction->t_outstanding_revokes, 0);
|
||||
atomic_set(&transaction->t_handle_count, 0);
|
||||
@ -213,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
|
||||
wake_up(&journal->j_wait_reserved);
|
||||
}
|
||||
|
||||
/*
 * Maximum number of blocks for user transaction payload: the transaction
 * size limit (j_max_transaction_buffers) minus the blocks each transaction
 * needs for its own bookkeeping (j_transaction_overhead_buffers).
 */
|
||||
static int jbd2_max_user_trans_buffers(journal_t *journal)
|
||||
{
|
||||
return journal->j_max_transaction_buffers -
|
||||
journal->j_transaction_overhead_buffers;
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait until we can add credits for handle to the running transaction. Called
|
||||
* with j_state_lock held for reading. Returns 0 if handle joined the running
|
||||
@ -262,12 +247,12 @@ __must_hold(&journal->j_state_lock)
|
||||
* big to fit this handle? Wait until reserved credits are freed.
|
||||
*/
|
||||
if (atomic_read(&journal->j_reserved_credits) + total >
|
||||
journal->j_max_transaction_buffers) {
|
||||
jbd2_max_user_trans_buffers(journal)) {
|
||||
read_unlock(&journal->j_state_lock);
|
||||
jbd2_might_wait_for_commit(journal);
|
||||
wait_event(journal->j_wait_reserved,
|
||||
atomic_read(&journal->j_reserved_credits) + total <=
|
||||
journal->j_max_transaction_buffers);
|
||||
jbd2_max_user_trans_buffers(journal));
|
||||
__acquire(&journal->j_state_lock); /* fake out sparse */
|
||||
return 1;
|
||||
}
|
||||
@ -307,14 +292,14 @@ __must_hold(&journal->j_state_lock)
|
||||
|
||||
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
|
||||
/* We allow at most half of a transaction to be reserved */
|
||||
if (needed > journal->j_max_transaction_buffers / 2) {
|
||||
if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
|
||||
sub_reserved_credits(journal, rsv_blocks);
|
||||
atomic_sub(total, &t->t_outstanding_credits);
|
||||
read_unlock(&journal->j_state_lock);
|
||||
jbd2_might_wait_for_commit(journal);
|
||||
wait_event(journal->j_wait_reserved,
|
||||
atomic_read(&journal->j_reserved_credits) + rsv_blocks
|
||||
<= journal->j_max_transaction_buffers / 2);
|
||||
<= jbd2_max_user_trans_buffers(journal) / 2);
|
||||
__acquire(&journal->j_state_lock); /* fake out sparse */
|
||||
return 1;
|
||||
}
|
||||
@ -344,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
|
||||
* size and limit the number of total credits to not exceed maximum
|
||||
* transaction size per operation.
|
||||
*/
|
||||
if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
|
||||
(rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
|
||||
if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
|
||||
rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
|
||||
printk(KERN_ERR "JBD2: %s wants too many credits "
|
||||
"credits:%d rsv_credits:%d max:%d\n",
|
||||
current->comm, blocks, rsv_blocks,
|
||||
journal->j_max_transaction_buffers);
|
||||
jbd2_max_user_trans_buffers(journal));
|
||||
WARN_ON(1);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
@ -1085,6 +1085,13 @@ struct journal_s
|
||||
*/
|
||||
int j_revoke_records_per_block;
|
||||
|
||||
/**
|
||||
* @j_transaction_overhead_buffers:
|
||||
*
|
||||
* Number of blocks each transaction needs for its own bookkeeping
|
||||
*/
|
||||
int j_transaction_overhead_buffers;
|
||||
|
||||
/**
|
||||
* @j_commit_interval:
|
||||
*
|
||||
@ -1660,11 +1667,6 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
|
||||
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
|
||||
int jbd2_fc_release_bufs(journal_t *journal);
|
||||
|
||||
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
|
||||
{
|
||||
return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
|
||||
}
|
||||
|
||||
/*
|
||||
* is_journal_abort
|
||||
*
|
||||
|
@ -1246,14 +1246,15 @@ TRACE_EVENT(ext4_da_update_reserve_space,
|
||||
);
|
||||
|
||||
TRACE_EVENT(ext4_da_reserve_space,
|
||||
TP_PROTO(struct inode *inode),
|
||||
TP_PROTO(struct inode *inode, int nr_resv),
|
||||
|
||||
TP_ARGS(inode),
|
||||
TP_ARGS(inode, nr_resv),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( dev_t, dev )
|
||||
__field( ino_t, ino )
|
||||
__field( __u64, i_blocks )
|
||||
__field( int, reserve_blocks )
|
||||
__field( int, reserved_data_blocks )
|
||||
__field( __u16, mode )
|
||||
),
|
||||
@ -1262,16 +1263,17 @@ TRACE_EVENT(ext4_da_reserve_space,
|
||||
__entry->dev = inode->i_sb->s_dev;
|
||||
__entry->ino = inode->i_ino;
|
||||
__entry->i_blocks = inode->i_blocks;
|
||||
__entry->reserve_blocks = nr_resv;
|
||||
__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
|
||||
__entry->mode = inode->i_mode;
|
||||
),
|
||||
|
||||
TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
|
||||
/*
 * The format is built from adjacent string literals; the first one must end
 * in a space, otherwise the output reads "reserve_blocks Nreserved_data_blocks M".
 */
TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d "
|
||||
"reserved_data_blocks %d",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
(unsigned long) __entry->ino,
|
||||
__entry->mode, __entry->i_blocks,
|
||||
__entry->reserved_data_blocks)
|
||||
__entry->reserve_blocks, __entry->reserved_data_blocks)
|
||||
);
|
||||
|
||||
TRACE_EVENT(ext4_da_release_space,
|
||||
@ -2478,11 +2480,11 @@ TRACE_EVENT(ext4_es_shrink,
|
||||
__entry->scan_time, __entry->nr_skipped, __entry->retried)
|
||||
);
|
||||
|
||||
TRACE_EVENT(ext4_es_insert_delayed_block,
|
||||
TRACE_EVENT(ext4_es_insert_delayed_extent,
|
||||
TP_PROTO(struct inode *inode, struct extent_status *es,
|
||||
bool allocated),
|
||||
bool lclu_allocated, bool end_allocated),
|
||||
|
||||
TP_ARGS(inode, es, allocated),
|
||||
TP_ARGS(inode, es, lclu_allocated, end_allocated),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field( dev_t, dev )
|
||||
@ -2491,7 +2493,8 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
|
||||
__field( ext4_lblk_t, len )
|
||||
__field( ext4_fsblk_t, pblk )
|
||||
__field( char, status )
|
||||
__field( bool, allocated )
|
||||
__field( bool, lclu_allocated )
|
||||
__field( bool, end_allocated )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
@ -2501,16 +2504,17 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
|
||||
__entry->len = es->es_len;
|
||||
__entry->pblk = ext4_es_show_pblock(es);
|
||||
__entry->status = ext4_es_status(es);
|
||||
__entry->allocated = allocated;
|
||||
__entry->lclu_allocated = lclu_allocated;
|
||||
__entry->end_allocated = end_allocated;
|
||||
),
|
||||
|
||||
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
|
||||
"allocated %d",
|
||||
"allocated %d %d",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
(unsigned long) __entry->ino,
|
||||
__entry->lblk, __entry->len,
|
||||
__entry->pblk, show_extent_status(__entry->status),
|
||||
__entry->allocated)
|
||||
__entry->lclu_allocated, __entry->end_allocated)
|
||||
);
|
||||
|
||||
/* fsmap traces */
|
||||
|
Loading…
Reference in New Issue
Block a user