ext4: use pre-zeroed blocks for DAX page faults
Make the DAX fault path use pre-zeroed blocks to avoid races with extent conversion and zeroing when two page faults to the same block happen.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
parent
c86d8db33a
commit
ba5843f51d
@ -2452,8 +2452,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
|
||||
int ext4_get_block_write(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create);
|
||||
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
|
||||
|
@ -193,18 +193,6 @@ out:
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||
{
|
||||
struct inode *inode = bh->b_assoc_map->host;
|
||||
/* XXX: breaks on 32-bit > 16TB. Is that even supported? */
|
||||
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
||||
int err;
|
||||
if (!uptodate)
|
||||
return;
|
||||
WARN_ON(!buffer_unwritten(bh));
|
||||
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
|
||||
}
|
||||
|
||||
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
int result;
|
||||
@ -225,8 +213,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
if (IS_ERR(handle))
|
||||
result = VM_FAULT_SIGBUS;
|
||||
else
|
||||
result = __dax_fault(vma, vmf, ext4_get_block_dax,
|
||||
ext4_end_io_unwritten);
|
||||
result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
|
||||
|
||||
if (write) {
|
||||
if (!IS_ERR(handle))
|
||||
@ -262,7 +249,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
|
||||
result = VM_FAULT_SIGBUS;
|
||||
else
|
||||
result = __dax_pmd_fault(vma, addr, pmd, flags,
|
||||
ext4_get_block_dax, ext4_end_io_unwritten);
|
||||
ext4_dax_mmap_get_block, NULL);
|
||||
|
||||
if (write) {
|
||||
if (!IS_ERR(handle))
|
||||
@ -283,8 +270,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
sb_start_pagefault(inode->i_sb);
|
||||
file_update_time(vma->vm_file);
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
|
||||
ext4_end_io_unwritten);
|
||||
err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
|
||||
|
@ -723,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
|
||||
|
||||
map_bh(bh, inode->i_sb, map.m_pblk);
|
||||
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
|
||||
if (IS_DAX(inode) && buffer_unwritten(bh)) {
|
||||
/*
|
||||
* dgc: I suspect unwritten conversion on ext4+DAX is
|
||||
* fundamentally broken here when there are concurrent
|
||||
* read/write in progress on this inode.
|
||||
*/
|
||||
WARN_ON_ONCE(io_end);
|
||||
bh->b_assoc_map = inode->i_mapping;
|
||||
bh->b_private = (void *)(unsigned long)iblock;
|
||||
}
|
||||
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
|
||||
set_buffer_defer_completion(bh);
|
||||
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
|
||||
@ -3097,17 +3087,79 @@ static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create)
|
||||
#ifdef CONFIG_FS_DAX
|
||||
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
|
||||
struct buffer_head *bh_result, int create)
|
||||
{
|
||||
int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
|
||||
int ret, err;
|
||||
int credits;
|
||||
struct ext4_map_blocks map;
|
||||
handle_t *handle = NULL;
|
||||
int flags = 0;
|
||||
|
||||
if (create)
|
||||
flags |= EXT4_GET_BLOCKS_CREATE;
|
||||
ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
|
||||
ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
|
||||
inode->i_ino, create);
|
||||
return _ext4_get_block(inode, iblock, bh_result, flags);
|
||||
map.m_lblk = iblock;
|
||||
map.m_len = bh_result->b_size >> inode->i_blkbits;
|
||||
credits = ext4_chunk_trans_blocks(inode, map.m_len);
|
||||
if (create) {
|
||||
flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
|
||||
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
ret = ext4_map_blocks(handle, inode, &map, flags);
|
||||
if (create) {
|
||||
err = ext4_journal_stop(handle);
|
||||
if (ret >= 0 && err < 0)
|
||||
ret = err;
|
||||
}
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
if (map.m_flags & EXT4_MAP_UNWRITTEN) {
|
||||
int err2;
|
||||
|
||||
/*
|
||||
* We are protected by i_mmap_sem so we know block cannot go
|
||||
* away from under us even though we dropped i_data_sem.
|
||||
* Convert extent to written and write zeros there.
|
||||
*
|
||||
* Note: We may get here even when create == 0.
|
||||
*/
|
||||
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
err = ext4_map_blocks(handle, inode, &map,
|
||||
EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
|
||||
if (err < 0)
|
||||
ret = err;
|
||||
err2 = ext4_journal_stop(handle);
|
||||
if (err2 < 0 && ret > 0)
|
||||
ret = err2;
|
||||
}
|
||||
out:
|
||||
WARN_ON_ONCE(ret == 0 && create);
|
||||
if (ret > 0) {
|
||||
map_bh(bh_result, inode->i_sb, map.m_pblk);
|
||||
bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
|
||||
map.m_flags;
|
||||
/*
|
||||
* At least for now we have to clear BH_New so that DAX code
|
||||
* doesn't attempt to zero blocks again in a racy way.
|
||||
*/
|
||||
bh_result->b_state &= ~(1 << BH_New);
|
||||
bh_result->b_size = map.m_len << inode->i_blkbits;
|
||||
ret = 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
|
||||
ssize_t size, void *private)
|
||||
|
Loading…
x
Reference in New Issue
Block a user